Merge branch 'heikki-kvstore' into bojan-psbench-over-kvstore

Bojan Serafimov
2022-03-18 16:27:18 -04:00
76 changed files with 8554 additions and 5477 deletions


@@ -119,7 +119,7 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
- curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+ curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
tags:
- pageserver
@@ -169,6 +169,6 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
- curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+ curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
tags:
- safekeeper

Cargo.lock generated

@@ -61,6 +61,58 @@ dependencies = [
"backtrace",
]
[[package]]
name = "arrayvec"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "askama"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134"
dependencies = [
"askama_derive",
"askama_escape",
"askama_shared",
]
[[package]]
name = "askama_derive"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522"
dependencies = [
"askama_shared",
"proc-macro2",
"syn",
]
[[package]]
name = "askama_escape"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
[[package]]
name = "askama_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d6083ccb191711e9c2b80b22ee24a8381a18524444914c746d4239e21d1afaf"
dependencies = [
"askama_escape",
"humansize",
"nom 6.1.2",
"num-traits",
"percent-encoding",
"proc-macro2",
"quote",
"serde",
"syn",
"toml",
]
[[package]]
name = "async-compression"
version = "0.3.12"
@@ -234,6 +286,18 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitvec"
version = "0.19.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55f93d0ef3363c364d5976646a38f04cf67cfe1d4c8d160cdea02cab2c116b33"
dependencies = [
"funty",
"radium",
"tap",
"wyz",
]
[[package]]
name = "block-buffer"
version = "0.9.0"
@@ -260,6 +324,18 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.9.1"
@@ -281,6 +357,15 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
dependencies = [
"rustc_version",
]
[[package]]
name = "cc"
version = "1.0.72"
@@ -296,7 +381,7 @@ version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
"nom 7.1.0",
]
[[package]]
@@ -447,6 +532,76 @@ dependencies = [
"rustc_version",
]
[[package]]
name = "criterion"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10"
dependencies = [
"atty",
"cast",
"clap 2.34.0",
"criterion-plot",
"csv",
"itertools",
"lazy_static",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_cbor",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9"
dependencies = [
"cfg-if",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.7"
@@ -477,6 +632,28 @@ dependencies = [
"subtle",
]
[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa 0.4.8",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "daemonize"
version = "0.4.1"
@@ -617,6 +794,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "funty"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
[[package]]
name = "futures"
version = "0.3.21"
@@ -882,6 +1065,12 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421"
[[package]]
name = "humansize"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026"
[[package]]
name = "humantime"
version = "2.1.0"
@@ -1035,6 +1224,19 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lexical-core"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [
"arrayvec",
"bitflags",
"cfg-if",
"ryu",
"static_assertions",
]
[[package]]
name = "libc"
version = "0.2.117"
@@ -1051,6 +1253,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "libm"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33a33a362ce288760ec6a508b94caaec573ae7d3bbbd91b87aa0bad4456839db"
[[package]]
name = "lock_api"
version = "0.4.6"
@@ -1185,6 +1393,19 @@ dependencies = [
"memoffset",
]
[[package]]
name = "nom"
version = "6.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7413f999671bd4745a7b624bd370a569fb6bc574b23c83a3c5ed2e453f3d5e2"
dependencies = [
"bitvec",
"funty",
"lexical-core",
"memchr",
"version_check",
]
[[package]]
name = "nom"
version = "7.1.0"
@@ -1233,6 +1454,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@@ -1260,6 +1482,12 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
[[package]]
name = "oorandom"
version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opaque-debug"
version = "0.3.0"
@@ -1307,10 +1535,12 @@ dependencies = [
"hex-literal",
"humantime",
"hyper",
"itertools",
"lazy_static",
"log",
"nix",
"once_cell",
"plotly",
"postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
@@ -1444,6 +1674,47 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "plotly"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26f011c8acfc15cfa062397000d894fba55bf3d1eca4301b07d6e82237b06e55"
dependencies = [
"askama",
"rand",
"rand_distr",
"serde",
"serde_json",
]
[[package]]
name = "plotters"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c"
[[package]]
name = "plotters-svg"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9"
dependencies = [
"plotters-backend",
]
[[package]]
name = "postgres"
version = "0.19.1"
@@ -1592,6 +1863,7 @@ dependencies = [
"anyhow",
"bytes",
"clap 3.0.14",
"fail",
"futures",
"hashbrown 0.11.2",
"hex",
@@ -1607,6 +1879,7 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tokio-postgres-rustls",
@@ -1624,6 +1897,12 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "radium"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
[[package]]
name = "rand"
version = "0.8.4"
@@ -1655,6 +1934,16 @@ dependencies = [
"getrandom",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand",
]
[[package]]
name = "rand_hc"
version = "0.3.1"
@@ -1664,6 +1953,31 @@ dependencies = [
"rand_core",
]
[[package]]
name = "rayon"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "rcgen"
version = "0.8.14"
@@ -2105,6 +2419,12 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -2144,6 +2464,12 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "tap"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tar"
version = "0.4.38"
@@ -2233,6 +2559,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.5.1"
@@ -2797,6 +3133,12 @@ dependencies = [
"syn",
]
[[package]]
name = "wyz"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
[[package]]
name = "xattr"
version = "0.2.2"
@@ -2855,6 +3197,7 @@ dependencies = [
"bincode",
"byteorder",
"bytes",
"criterion",
"git-version",
"hex",
"hex-literal",

docs/rfcs/002-storage.md Normal file

@@ -0,0 +1,186 @@
# Zenith storage node — alternative
## **Design considerations**
Simplify storage operations for people => Gain adoption/installs on laptops and small private installations => Attract customers to DBaaS by seamless integration between our tooling and the cloud.
The proposed architecture addresses:
- High availability -- tolerates n/2 - 1 failures
- Multi-tenancy -- one storage for all databases
- Elasticity -- increase storage size on the go by adding nodes
- Snapshots / backups / PITR with S3 offload
- Compression
Minuses are:
- Quite a lot of work
- Single page access may touch a few disk pages
- Some bloat in the data -- may slow down sequential scans
## **Summary**
The storage cluster is a sharded key-value store with ordered keys. The key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. The value is either a page or a page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than a soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. A chunk itself is a filesystem directory with the following subdirectories:
```
|-chunk_42/
|-store/ -- contains lsm with pages/pagediffs ranging from
| page_key_lo to page_key_hi
|-wal/
| |- db_1234/ db-specific wal files with pages from page_key_lo
| to page_key_hi
|
|-chunk.meta -- small file with snapshot references
(page_key_prefix+lsn+name)
and PITR regions (page_key_start, page_key_end)
```
## **Chunk**
Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields (a small sketch follows the list):
- `pg_id` -- unique id of a given postgres instance (or postgres cluster, as it is called in the postgres docs)
- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
- `rel_id` -- a tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so that table indices are close to the table itself in our global key space.
- `(forkno, segno, pageno)` -- page coordinates in postgres data files
- `lsn_timeline` -- postgres feature, incremented when a PITR is performed.
- `lsn` -- LSN of the current page version.
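To make the ordering concrete, here is a minimal Rust sketch (the struct name, field types, and layout are assumptions for illustration, not the actual implementation): deriving `Ord` compares fields top to bottom, which gives exactly the lexicographic ordering described above.
```rust
// Hypothetical page key; field names/types are assumptions.
// Deriving Ord on a struct compares fields in declaration order,
// i.e. the lexicographic ordering the key space relies on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct PageKey {
    pg_id: u64,
    db_id: u32,
    db_timeline: u32,
    rel_id: (u32, u32), // (indexed_relation_id, rel_id); (relation_id, 0) for tables
    forkno: u8,
    segno: u32,
    pageno: u32,
    lsn_timeline: u32,
    lsn: u64,
}

fn main() {
    let a = PageKey { pg_id: 1, db_id: 1, db_timeline: 0, rel_id: (10, 0),
                      forkno: 0, segno: 0, pageno: 42, lsn_timeline: 1, lsn: 100 };
    let b = PageKey { lsn: 150, ..a };
    // All versions of the same page sort next to each other, ordered by LSN.
    assert!(a < b);
}
```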
Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. The processing node looks at the page referenced in a WAL record and sends the record to the chunk responsible for that page range. When a WAL record arrives at a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then a background process moves records from those WAL files into the LSM tree in `chunk_id/store`. Or, more precisely, WAL records are materialized into the LSM memtable, and when that memtable is flushed to an SSTable on disk we may trim the WAL. That way some pages that are not durably committed (in the distributed sense) may enter the tree -- here we rely on processing node behavior: a page request from the processing node should contain the proper LSN horizons so that the storage node can respond with the proper page version.
The LSM here is a usual LSM for variable-length values: at first, data is stored in memory in a balanced tree (we hold the incoming WAL records to be able to regenerate it after a restart). When this tree grows big enough we dump it to a disk file (SSTable), sorting records by key. Then SSTables are merge-sorted in the background into new files. All file operations are sequential and do not require a WAL for durability.
The content of an SSTable can look like the following:
```jsx
(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
```
So a query for `pageno=42 up to lsn=260` would need to find the closest entry less than this key, iterate back to the latest full page, and iterate forward applying the diffs. How often a page is materialized in the lsn-version sequence is up to us -- let's say every 5th version should be a full page.
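As an illustration of that lookup, a hedged Rust sketch (the `Entry`/`Value` types and `apply_diff` are assumed; real entries would live in SSTables, not a vector): find the last entry at or below the requested LSN, walk back to the nearest full page, then replay the diffs forward.
```rust
// Hedged sketch of "read pageno=42 up to lsn=260"; all types are assumptions.
enum Value {
    FullPage(Vec<u8>), // materialized 8k page
    Diff(Vec<u8>),     // per-page delta against the previous version
}

struct Entry {
    lsn: u64,
    value: Value,
}

fn apply_diff(page: &mut Vec<u8>, _diff: &[u8]) {
    // Placeholder: a real implementation would replay the WAL record / delta.
}

/// `versions` holds all entries for one page, sorted by LSN ascending.
fn read_page(versions: &[Entry], up_to_lsn: u64) -> Option<Vec<u8>> {
    // 1. Find the last version at or below the requested LSN.
    let last = versions.iter().rposition(|e| e.lsn <= up_to_lsn)?;
    // 2. Walk back to the most recent full page at or before it.
    let base = (0..=last).rev().find(|&i| matches!(versions[i].value, Value::FullPage(_)))?;
    let mut page = match &versions[base].value {
        Value::FullPage(p) => p.clone(),
        Value::Diff(_) => unreachable!(),
    };
    // 3. Replay diffs forward up to the requested version.
    for e in &versions[base + 1..=last] {
        if let Value::Diff(d) = &e.value {
            apply_diff(&mut page, d);
        }
    }
    Some(page)
}
```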
### **Page deletion**
To delete old pages we insert a blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into the LSM tree. During merges such a marker indicates that all pages with a smaller LSN should be discarded. The delete marker travels down the tree level hierarchy until it reaches the last level. In a non-PITR scenario where old page versions are not needed at all, such a deletion marker would (on average) prevent old page versions from propagating down the tree -- so all bloat would concentrate in the higher tree layers without affecting the bigger bottom layers.
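A sketch of how a merge might honor such a marker (all names assumed; a real merge must also keep the marker itself so it can travel further down the tree): while merging the versions of a page, any version below the trim LSN is simply not emitted unless a snapshot or PITR region pins it.
```rust
// Hypothetical sketch of applying a blind trim marker during an SSTable merge.
struct Version {
    lsn: u64,
    // ... page or diff payload
}

/// Keep only versions at or above the trim LSN, or ones pinned by a
/// snapshot/PITR region recorded in chunk.meta.
fn merge_filter(
    versions: Vec<Version>,
    trim_lsn: u64,
    pinned: impl Fn(u64) -> bool,
) -> Vec<Version> {
    versions
        .into_iter()
        .filter(|v| v.lsn >= trim_lsn || pinned(v.lsn))
        .collect()
}
```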
### **Recovery**
Upon storage node restart, recent WAL files are applied to the appropriate pages and the resulting pages are stored in the LSM memtable. This should be fast since we are not writing anything to disk.
### **Checkpointing**
No such mechanism is needed. Or we may look at the storage node as a kind of continuous checkpointer.
### **Full page writes (torn page protection)**
The storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
### **Snapshot**
That is the part that I like about this design -- snapshot creation is an instant and cheap operation that can have a flexible granularity level: whole instance, database, or table. Snapshot creation inserts a record into the `chunk.meta` file with the LSN of this snapshot and a key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits page deletion within this range. The storage node need not know anything about page internals, but by changing the number of fields in our prefix we can change the snapshot granularity.
It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that a snapshot of a relation includes its indices. Also, a table snapshot would interact trickily with the catalog. Probably all table snapshots should also hold a catalog snapshot. And when a node is started with such a snapshot it should check that only tables from the snapshot are queried. I assume here that reading a snapshot requires starting a new postgres instance.
Storage consumed by a snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on the costs of the different storages) about when to offload an old snapshot to S3. For example, if the current database has more than 40% of its pages changed with respect to the previous snapshot, then we may offload that snapshot to S3 and release the space.
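As a toy example of such a heuristic (the 40% threshold and the inputs are just the numbers from this paragraph, not a tuned policy):
```rust
// Assumed heuristic: offload the previous snapshot to S3 once the fraction
// of pages changed since it exceeds some cost-based threshold.
fn should_offload_to_s3(changed_pages: u64, total_pages: u64) -> bool {
    total_pages > 0 && (changed_pages as f64) / (total_pages as f64) > 0.40
}
```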
**Starting db from snapshot**
When we start a database from a snapshot it can be done in two ways. First, we may create a new db_id, move all the data from the snapshot to the new db, and start the database. The second option is to create a Copy-on-Write (CoW) instance on top of the snapshot: read old pages from the old snapshot and store new pages separately. That is why there is a `db_timeline` key field near `db_id` -- a CoW (🐮) database should create a new `db_timeline` and remember the old one. Such a database can keep a hashmap of the pages it has changed, to query pages from the proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by the new instance do not bloat the data of the initial snapshot. It is not clear whether it is possible to efficiently support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- then we could scan neighboring pages and find the right one. But again, that way we bloat the snapshot with unrelated data and may slow down full scans happening in a different database.)
**Snapshot export/import**
Once we can start CoW instances it is easy to run an auxiliary postgres instance on a snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` to export data from the snapshot to some portable format. We may also start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots over the network.
### **PITR area**
In the described scheme, PITR is just a prohibition on deleting any versions within some key prefix, whether it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
PITR is quite bloaty, so we may aggressively offload it to S3 -- we may push the same (or bigger) SSTables to S3 and maintain the LSM structure there.
### **Compression**
Since we are storing page diffs of variable sizes, there is no structural dependency on the page size, and we may compress them. Again, this could be enabled only for pages with certain key prefixes, so we can have db/table granularity.
### **Chunk metadata**
Chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. The chunk should always consult this data when merging SSTables and applying delete markers.
### **Chunk splitting**
*(NB: the following paragraph is about how to avoid page splitting)*
When a chunk hits some soft storage limit (let's say 100GB) it should be split in half, and the global metadata about chunk boundaries should be updated. Here I assume that a chunk split is a local operation happening on a single node. The process of chunk splitting should look like the following (a sketch of step 3 follows the list):
1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
2. Prohibit WAL deletion and old SSTables deletion on original chunk.
3. On each LSM level we would need to split only one SSTable; all others would fit within the left or right range. Symlink/split those files into the new chunks.
4. Start WAL replay on new chunks.
5. Update global metadata about new chunk boundaries.
6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes.
7. A new chunk may start serving read queries when the following conditions are met:
a) it has received at least one WAL record from the processing node
b) it has replayed all WAL up to the newly received one
c) it has checked via downlinks that there were no WAL gaps.
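A sketch of step 3 (types assumed, keys abstracted to integers): on each sorted LSM level at most one file straddles the separation key; everything else can be symlinked into the left or right chunk unchanged.
```rust
// Hypothetical sketch of distributing one LSM level's SSTables during a split.
struct SsTable { lo: u64, hi: u64 } // key range [lo, hi), keys abstracted to u64

enum Placement<'a> {
    Left(&'a SsTable),  // symlink into the [lo, mid) chunk
    Right(&'a SsTable), // symlink into the [mid, hi) chunk
    Split(&'a SsTable), // the single file straddling mid: rewrite in two
}

fn place<'a>(level: &'a [SsTable], mid: u64) -> Vec<Placement<'a>> {
    level.iter().map(|t| {
        if t.hi <= mid { Placement::Left(t) }
        else if t.lo >= mid { Placement::Right(t) }
        else { Placement::Split(t) }
    }).collect()
}
```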
A chunk split as described here is quite a fast operation when it happens on a local disk -- the vast majority of files will just be moved without copying anything. I suggest keeping splits always local and not mixing them with moving chunks around the cluster. So if we want to split some chunk but there is only a small amount of free space left on the device, we should first move some chunks away from the node and then proceed with the split.
### Fixed chunks
An alternative strategy is not to split at all and to have pageno-fixed chunk boundaries. When a table is created we first materialize the chunk by storing only the first new pages, so the chunk is small. Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at max it would be 1GB or whatever limit we want, plus some bloat due to snapshots and old page versions.
### **Chunk lsm internals**
So how do we implement the chunk's LSM?
- Write it from scratch and use RocksDB to prototype/benchmark, then switch to our own LSM implementation. RocksDB can provide a sanity check for the performance of the home-brewed implementation, and it would be easier to prototype with.
- Use postgres as a lego constructor. We may model the memtable with a postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres' external merge algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea of how to fit it) is multi-tenancy. If we are storing pages from different databases we can't use the postgres buffer pool, since there is no db_id in the page header. We could add a new field there, but IMO that would be a no-go for committing to vanilla.
Another possibility is not to try to fit several databases into one storage node. But that is a no-go for a multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. That would be much closer to an ordinary managed RDS.
Multi-tenant storage makes sense even on a laptop, when you work with different databases, run tests with a temp database, etc. And as an installation grows bigger it starts to make more and more sense, so it seems important.
# **Storage fleet**
- When databases are smaller than the chunk size we can naturally store several of them in one chunk (since their page_keys would fit in some chunk's [lo, hi) range).
<img width="937" alt="Screenshot_2021-02-22_at_16 49 17" src="https://user-images.githubusercontent.com/284219/108729836-ffcbd200-753b-11eb-9412-db802ec30021.png">
Few databases are stored in one chunk, replicated three times
- When a database can't fit into one storage node it can occupy many chunks that were split while the database was growing. Chunk placement on nodes is controlled by us with some automation, but we may always move chunks around the cluster manually.
<img width="940" alt="Screenshot_2021-02-22_at_16 49 10" src="https://user-images.githubusercontent.com/284219/108729815-fb071e00-753b-11eb-86e0-be6703e47d82.png">
Here one big database occupies two sets of nodes. Also, some chunks were moved around to restore the replication factor after a disk failure. In this case we also have "sharded" storage for a big database and issue WAL writes to different chunks in parallel.
## **Chunk placement strategies**
There are a few scenarios where we may want to move chunks around the cluster:
- disk usage on some node is high
- some disk experienced a failure
- some node experienced a failure or need maintenance
## **Chunk replication**
Chunk replication may be done by cloning page ranges with respect to some LSN from peer nodes, updating the global metadata, waiting for new WAL to arrive, replaying the previous WAL, and coming online -- more or less as during a chunk split.

docs/rfcs/003-laptop-cli.md Normal file

@@ -0,0 +1,267 @@
# Command line interface (end-user)
The Zenith CLI as described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc. and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside the zenith distribution, at least at the start.
This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is the storage URL: no matter how complex the cluster setup is, it provides an endpoint where the user may push snapshots.
The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start a temporary read-only postgres instance over any local snapshot. More complex scenarios would consist of several basic operations over snapshots.
# Possible usage scenarios
## Install zenith, run a postgres
```
> brew install pg-zenith
> zenith pg create # creates pgdata with default pattern pgdata$i
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 0G zenith-local localhost:5432
```
## Import standalone postgres to zenith
```
> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
[====================------------] 60% | 20MB/s
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
> zenith pg create --snapshot oldpg
Started postgres on localhost:5432
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G zenith-local localhost:5432
> zenith snapshot destroy oldpg
Ok
```
Also, we may start the snapshot import implicitly by looking at the snapshot URL scheme
```
> zenith pg create --snapshot basebackup://replication@localhost:5432/
Downloading snapshot... Done.
Started postgres on localhost:5432
Destroying snapshot... Done.
```
## Pull snapshot with some publicly shared database
Since we may export a whole snapshot as one big file (a tar of a basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
```
> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
```
## Create snapshot and push it to the cloud
```
> zenith snapshot create pgdata1@snap1
> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
```
## Rollback database to the snapshot
One way to roll back the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot requires a copy of that snapshot, which is a time-consuming operation. Another option that would be cool to support is the ability to create a copy-on-write database from the snapshot without copying data, storing updated pages in a separate location; however, that approach has performance implications. So to properly roll back the database to an older state we have `zenith pg checkout`.
```
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G zenith-local localhost:5432
> zenith snapshot create pgdata1@snap1
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@CURRENT 6G -
> zenith pg checkout pgdata1@snap1
Stopping postgres on pgdata1.
Rolling back pgdata1@CURRENT to pgdata1@snap1.
Starting postgres on pgdata1.
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@HEAD{0} 6G -
pgdata1@CURRENT 6G -
```
Some notes: pgdata1@CURRENT is an implicit snapshot representing the current state of the database in the data directory. When we check out some snapshot, CURRENT will be set to that snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of the postgres timeline; it is incremented after each such checkout).
## Configure PITR area (Point In Time Recovery).
The PITR area acts like a continuous snapshot where you can reset the database to any point in time within the area (by area I mean some TTL period or some size limit, both possibly infinite).
```
> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
```
Resetting the database to some state in the past requires creating a snapshot at some lsn/time in this pitr area.
# Manual
## storage
Storage is either a zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both a pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
**zenith storage attach** -t [native|s3] -c key=value -n name
Attaches/initializes storage. For --type=s3, user credentials and a path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Another possible term for native is 'zstore'.
**zenith storage list**
Show currently attached storages. For example:
```
> zenith storage list
NAME USED TYPE OPTIONS PATH
local 5.1G zenith-local /opt/zenith/store/local
local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr
zcloud 60G zenith-remote zenith.tech/stas/mystore
s3tank 80G S3
```
**zenith storage detach**
**zenith storage show**
## pg
Manages postgres data directories and can start postgreses with the proper configuration. An experienced user may avoid using this (except pg create) and configure/run postgres themselves.
Pg is a term for a single postgres running on some data directory. I'm trying to avoid separating datadir management from postgres instance management -- both concepts are bundled together here.
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
Creates (initializes) a new data directory in the given storage and starts postgres. I imagine that the storage for this operation may only be local, and data movement to remote locations happens through snapshots/pitr.
--no-start: just init the datadir without starting postgres
--snapshot snap: init from the snapshot. snap is a name or a URL (zenith.tech/stas/mystore/snap1)
--cow: initialize a Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database)
**zenith pg destroy**
**zenith pg start** [--replica] pgdata
Start postgres with proper extensions preloaded/installed.
**zenith pg checkout**
Rolls back the data directory to some previous snapshot.
**zenith pg stop** pg_id
**zenith pg list**
```
ROLE PGDATA USED STORAGE ENDPOINT
primary my_pg 5.1G local localhost:5432
replica-1 localhost:5433
replica-2 localhost:5434
primary my_pg2 3.2G local.compr localhost:5435
- my_pg3 9.2G local.compr -
```
**zenith pg show**
```
my_pg:
storage: local
space used on local: 5.1G
space used on all storages: 15.1G
snapshots:
on local:
snap1: 1G
snap2: 1G
on zcloud:
snap2: 1G
on s3tank:
snap5: 2G
pitr:
on s3tank:
pitr_one_month: 45G
```
**zenith pg start-rest/graphql** pgdata
Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
## snapshot
Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. A snapshot name looks like pgdata_name@tag_name; tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before the i-th checkout.
**zenith snapshot create** pgdata_name@snap_name
Creates a new snapshot in the same storage where pgdata_name exists.
**zenith snapshot push** --to url pgdata_name@snap_name
Produces a binary stream of a given snapshot. Under the hood it starts a temp read-only postgres over this snapshot and sends a basebackup stream. The receiving side should start `zenith snapshot recv` before the push happens. If the url has some special schema like zenith://, the receiving side may require auth and start `zenith snapshot recv` on the fly.
**zenith snapshot recv**
Starts a port listening for a basebackup stream, prints connection info to stdout (so that the user may use it in the push command), and expects data on that socket.
**zenith snapshot pull** --from url or path
Connects to a remote zenith/s3/file and pulls a snapshot. The remote side should be a zenith service or files in our format.
**zenith snapshot import** --from basebackup://<...> or path
Creates a new snapshot out of a running postgres via the basebackup protocol or from basebackup files.
**zenith snapshot export**
Starts a read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith's own format, which is handy for us (but I think just a tar of a basebackup would be okay).
**zenith snapshot diff** snap1 snap2
Shows the size of data changed between two snapshots. We may also provide options to diff schema/data in tables. To do that, temp read-only postgreses are started.
**zenith snapshot destroy**
## pitr
Pitr represents a WAL stream and a TTL policy for that stream.
XXX: any suggestions on a better name?
**zenith pitr create** name
--ttl = inf | period
--size-limit = inf | limit
--storage = storage_name
**zenith pitr extract-snapshot** pitr_name --lsn xxx
Creates a snapshot at some lsn in the PITR area. The obtained snapshot may be managed with the snapshot routines (move/send/export).
**zenith pitr gc** pitr_name
Forces garbage collection on some PITR area.
**zenith pitr list**
**zenith pitr destroy**
## console
**zenith console**
Opens a browser targeted at the web console, with more or less the same functionality as described here.

docs/rfcs/004-durability.md Normal file

@@ -0,0 +1,218 @@
Durability & Consensus
======================
When a transaction commits, a commit record is generated in the WAL.
When do we consider the WAL record as durable, so that we can
acknowledge the commit to the client and be reasonably certain that we
will not lose the transaction?
Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
A WAL record is considered durable, when it has been written to a
majority of WAL safekeeper nodes. In this document, I use 5
safekeepers, because I have five fingers. A WAL record is durable,
when at least 3 safekeepers have written it to disk.
First, assume that only one primary node can be running at a
time. This can be achieved by Kubernetes or etcd or some
cloud-provider specific facility, or we can implement it
ourselves. These options are discussed in later chapters. For now,
assume that there is a Magic STONITH Fairy that ensures that.
In addition to the WAL safekeeper nodes, the WAL is archived in
S3. WAL that has been archived to S3 can be removed from the
safekeepers, so the safekeepers don't need a lot of disk space.
+----------------+
+-----> | WAL safekeeper |
| +----------------+
| +----------------+
+-----> | WAL safekeeper |
+------------+ | +----------------+
| Primary | | +----------------+
| Processing | ---------+-----> | WAL safekeeper |
| Node | | +----------------+
+------------+ | +----------------+
\ +-----> | WAL safekeeper |
\ | +----------------+
\ | +----------------+
\ +-----> | WAL safekeeper |
\ +----------------+
\
\
\
\
\ +--------+
\ | |
+--> | S3 |
| |
+--------+
Every WAL safekeeper holds a section of WAL, and a VCL value.
The WAL can be divided into three portions:
VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
Note that all the WAL kept in a safekeeper is one contiguous section.
This is different from Aurora: In Aurora, there can be holes in the
WAL, and there is a Gossip protocol to fill the holes. That could be
implemented in the future, but let's keep it simple for now. WAL needs
to be written to a safekeeper in order. However, during crash
recovery, In-flight WAL that has already been stored in a safekeeper
can be truncated or overwritten.
The Archived WAL has already been stored in S3, and can be removed from
the safekeeper.
The Completed WAL has been written to at least three safekeepers. The
algorithm ensures that it is not lost, when at most two nodes fail at
the same time.
The In-flight WAL has been persisted in the safekeeper, but if a crash
happens, it may still be overwritten or truncated.
The VCL point is determined in the Primary. It is not strictly
necessary to store it in the safekeepers, but it allows some
optimizations and sanity checks and is probably generally useful for
the system as a whole. The VCL values stored in the safekeepers can lag
behind the VCL computed by the primary.
Primary node Normal operation
-----------------------------
1. Generate some WAL.
2. Send the WAL to all the safekeepers that you can reach.
3. As soon as a quorum of safekeepers have acknowledged that they have
received and durably stored the WAL up to that LSN, update the local VCL
value in memory, and acknowledge commits to the clients.
4. Send the new VCL to all the safekeepers that were part of the quorum.
(Optional)
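A sketch of the quorum computation in step 3 (representation assumed): with five safekeepers and a quorum of three, the VCL is the third-highest acknowledged LSN, because exactly that prefix of the WAL is durable on at least three nodes.
```rust
// Hedged sketch: compute the VCL from per-safekeeper acknowledged LSNs.
// With 5 safekeepers and quorum 3, the VCL is the 3rd-highest acked LSN:
// everything up to it is durably stored on at least 3 nodes.
fn compute_vcl(mut acked_lsns: Vec<u64>, quorum: usize) -> Option<u64> {
    if acked_lsns.len() < quorum {
        return None; // cannot acknowledge any commit without a quorum
    }
    acked_lsns.sort_unstable_by(|a, b| b.cmp(a)); // sort descending
    Some(acked_lsns[quorum - 1])
}

fn main() {
    // Safekeepers have durably written up to these LSNs.
    assert_eq!(compute_vcl(vec![900, 700, 850, 400, 600], 3), Some(700));
}
```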
Primary Crash recovery
----------------------
When a new Primary node starts up, before it can generate any new WAL
it needs to contact a majority of the WAL safekeepers to compute the
VCL. Remember that there is a Magic STONITH fairy that ensures that
only one node can be doing this at a time.
1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you
can reach. This is the Winner safekeeper, and its LSN becomes the new VCL.
2. Update the other safekeepers you can reach, by copying all the WAL
from the Winner, starting from each safekeeper's old VCL point. Any old
In-Flight WAL from a previous Epoch is truncated away.
3. Increment Epoch, and send the new Epoch to the quorum of
safekeepers. (This ensures that if any of the safekeepers that we
could not reach later come back online, they will be considered as
older than this in any future recovery)
You can now start generating new WAL, starting from the newly-computed
VCL.
Optimizations
-------------
As described, the Primary node sends all the WAL to all the WAL safekeepers. That
can be a lot of network traffic. Instead of sending the WAL directly from Primary,
some safekeepers can be daisy-chained off other safekeepers, or there can be a
broadcast mechanism among them. There should still be a direct connection from
each safekeeper to the Primary for the acknowledgments, though.
Similarly, the responsibility for archiving WAL to S3 can be delegated to one of
the safekeepers, to reduce the load on the primary.
Magic STONITH fairy
-------------------
Now that we have a system that works as long as only one primary node is running at a time, how
do we ensure that?
1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary
when it's holding a valid lease. If the primary node dies, the lease expires after a timeout
period, and a new node is allowed to become the primary.
2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you
cannot do this safely. In practice, it would probably be OK if you make the lease times and
timeouts long enough. This has the advantage that we don't need to introduce a new
component to the architecture.
3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The
next chapter describes this option.
Built-in Paxos
--------------
The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes
as both Proposers and Learners.
Each WAL safekeeper holds an Epoch value in addition to the VCL and
the WAL. Each request by the primary to safekeep WAL is accompanied by
an Epoch value. If a safekeeper receives a request with Epoch that
doesn't match its current Accepted Epoch, it must ignore (NACK) it.
(In different Paxos papers, Epochs are called "terms" or "round
numbers")
When a node wants to become the primary, it generates a new Epoch
value that is higher than any previously observed Epoch value, and
globally unique.
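One standard way to get "higher than anything observed, and globally unique" (a sketch, not necessarily what Zenith would do) is to pair a bumped counter with a node id and compare the pair lexicographically:
```rust
// Hypothetical epoch: (counter, node_id) compared lexicographically.
// Bumping the counter past the highest observed one guarantees "higher";
// the node id tie-breaker guarantees global uniqueness.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Epoch {
    counter: u64,
    node_id: u64,
}

fn next_epoch(highest_observed: Epoch, my_node_id: u64) -> Epoch {
    Epoch { counter: highest_observed.counter + 1, node_id: my_node_id }
}
```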
Accepted Epoch: 555 VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
Primary node startup:
1. Contact all WAL safekeepers that you can reach (if you cannot
connect to a quorum of them, you can give up immediately). Find the
latest Epoch among them.
2. Generate a new globally unique Epoch, greater than the latest Epoch
found in the previous step.
3. Send the new Epoch in a Prepare message to a quorum of
safekeepers. (PAXOS Prepare message)
4. Each safekeeper responds with a Promise. If a safekeeper has
already made a promise with a higher Epoch, it doesn't respond (or
responds with a NACK). After making a promise, the safekeeper stops
responding to any write requests with an earlier Epoch.
5. Once you have received a majority of promises, you know that the
VCL cannot advance on the old Epoch anymore. This effectively kills
any old primary server.
6. Find the highest written LSN among the quorum of safekeepers (these
can be included in the Promise messages already). This is the new
VCL. If a new node starts the election process after this point,
it will compute the same or a higher VCL.
7. Copy the WAL from the safekeeper with the highest LSN to the other
safekeepers in the quorum, using the new Epoch. (PAXOS Accept
phase)
8. You can now start generating new WAL starting from the VCL. If
another process starts the election process after this point and
gains control of a majority of the safekeepers, we will no longer
be able to advance the VCL.


@@ -0,0 +1,103 @@
# Zenith local
Here I list some objectives to keep in mind when discussing the zenith-local design, and a proposal that brings all the components together. Your comments on both parts are very welcome.
#### Why do we need it?
- For distribution - this easy-to-use binary will help us build adoption among developers.
- For internal use - to test all components together.
In my understanding, we consider it to be just a mock-up version of zenith-cloud.
> Question: How much should we care about durability and security issues for a local setup?
#### Why is it better than a simple local postgres?
- Easy one-line setup. As simple as `cargo install zenith && zenith start`
- Quick and cheap creation of compute nodes over the same storage.
> Question: How can we describe a use-case for this feature?
- Zenith-local can work with S3 directly.
- Push and pull images (snapshots) to remote S3 to exchange data with other users.
- Quick and cheap snapshot checkouts to switch back and forth in the database history.
> Question: Do we want it in the very first release? This feature seems quite complicated.
#### Distribution:
Ideally, just one binary that incorporates all elements we need.
> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL.
#### Components:
- **zenith-CLI** - interface for end-users. Turns commands into REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
- **zenith-console** - WEB UI with same functionality as CLI.
>Note: not for the first release.
- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
- **zenith-pageserver** - consists of storage and a WAL-replaying service (modified PG in the current implementation).
> Question: Probably, for a local setup, we should be able to bypass the page storage and interact directly with S3 to avoid double caching in shared buffers and the page server?
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Imports and exports images from/to zenith.
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
> Question: Do we use it together with the local page store, or are they interchangeable?
WIP code is ???
- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and answers to Postgres that the "sync" succeeded.
> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good for testing the system.
WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start our modified version. Users can quickly create and destroy them and work with them as regular postgres databases.
WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
#### REST API:
Service endpoint: `http://localhost:3000`
Resources:
- /storages - Where data lives: zenith-pageserver or zenith-s3
- /pgs - Postgres - zenith-computenode
- /snapshots - snapshots **TODO**
>Question: Do we want to extend this API to manage zenith components? I.e. start the page server, manage safekeepers, and so on? Or will they be hardcoded to just start once and for all?
Methods and their mapping to CLI:
- /storages - zenith-pageserver or zenith-s3
CLI | REST API
------------- | -------------
storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages
storage detach -n name | DELETE /storages/:storage_name
storage list | GET /storages
storage show -n name | GET /storages/:storage_name
- /pgs - zenith-computenode
CLI | REST API
------------- | -------------
pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs
pg destroy -n name | DELETE /pgs/:pg_name
pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions
pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions
pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions
pg list | GET /pgs
pg show -n name | GET /pgs/:pg_name
- /snapshots **TODO**
CLI | REST API
------------- | -------------


@@ -0,0 +1,64 @@
The Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters, and "cluster" is a loaded term in modern infrastructure, we will call it a "catalog".
# CLI v2 (after chatting with Carl)
Zenith introduces the notion of a repository.
```bash
zenith init
zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
```
Once you have a cluster catalog you can explore it
```bash
zenith log -- returns a list of commits
zenith status -- reports whether there are changes in the catalog that can be committed
zenith commit -- commits the changes and generates a new commit hash
zenith branch experimental <hash> -- creates a branch called experimental based on a given commit hash
```
To make changes in the catalog you need to run compute nodes
```bash
-- here is how you start a compute node
zenith start /home/pipedpiper/northwind:main -- starts a compute instance
zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on a different port)
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on a different port)
-- After running some DML you can run
-- zenith status and see how there are two WAL streams one on top of
-- the main branch
zenith status
-- and another on top of the experimental branch
zenith status -b experimental
-- you can commit each branch separately
zenith commit main
-- or
zenith commit -c /home/pipedpiper/northwind:experimental
```
Starting compute instances against cloud environments
```bash
-- you can start a compute instance against the cloud environment
-- in this case all of the changes will be streamed into the cloud
zenith start https://zenith.tech/pipedpiper/northwind:main
zenith start https://zenith.tech/pipedpiper/northwind:main
zenith status -c https://zenith.tech/pipedpiper/northwind:main
zenith commit -c https://zenith.tech/pipedpiper/northwind:main
zenith branch -c https://zenith.tech/pipedpiper/northwind:<hash> experimental
```
Pushing data into the cloud
```bash
-- pull all the commits from the cloud
zenith pull
-- push all the commits to the cloud
zenith push
```


@@ -0,0 +1,140 @@
# Repository format
A Zenith repository is similar to a traditional PostgreSQL backup
archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
multiple versions of a PostgreSQL database cluster.
The distinguishing feature is that you can launch a Zenith Postgres
server directly against a branch in the repository, without having to
"restore" it first. Also, Zenith manages the storage automatically,
there is no separation between full and incremental backups nor WAL
archive. Zenith relies heavily on the WAL, and uses concepts similar
to incremental backups and WAL archiving internally, but it is hidden
from the user.
## Directory structure, version 1
This first version is pretty straightforward but not very
efficient. Just something to get us started.
The repository directory looks like this:
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.zenith/refs/branches/mybranch
.zenith/refs/tags/foo
.zenith/refs/tags/bar
.zenith/datadirs/<timeline uuid>
### Timelines
A timeline is similar to PostgreSQL's timeline, but is identified by a
UUID instead of a 32-bit timeline Id. For user convenience, it can be
given a name that refers to the UUID (called a branch).
All WAL is generated on a timeline. You can launch a read-only node
against a tag or arbitrary LSN on a timeline, but in order to write,
you need to create a timeline.
Each timeline is stored in a directory under .zenith/timelines. It
consists of a WAL archive, containing all the WAL in the standard
PostgreSQL format, under the wal/ subdirectory.
The 'snapshots/' subdirectory contains "base backups" of the data
directory at different LSNs. Each snapshot is simply a copy of the
Postgres data directory.
When a new timeline is forked from a previous timeline, the ancestor
timeline's UUID is stored in the 'history' file.
### Refs
There are two kinds of named objects in the repository: branches and
tags. A branch is a human-friendly name for a timeline UUID, and a
tag is a human-friendly name for a specific LSN on a timeline
(timeline UUID + LSN). Like in git, these are just for user
convenience; you can also use timeline UUIDs and LSNs directly.
Refs do have one additional purpose though: naming a timeline or LSN
prevents it from being automatically garbage collected.
The refs directory contains a small text file for each tag/branch. It
contains the UUID of the timeline (and LSN, for tags).
### Datadirs
.zenith/datadirs contains PostgreSQL data directories. You can launch
a Postgres instance on one of them with:
```
postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
```
All the actual data is kept in the timeline directories, under
.zenith/timelines. The data directories are only needed for active
PostgreSQL instances. After an instance is stopped, the data directory
can be safely removed. "zenith start" will recreate it quickly from
the data in .zenith/timelines, if it's missing.
## Version 2
The format described above isn't very different from a traditional
daily base backup + WAL archive configuration. The main difference is
the nicer naming of branches and tags.
That's not very efficient. For performance, we need something like
incremental backups that don't require making a full copy of all
data. So only store modified files or pages. And instead of having to
replay all WAL from the last snapshot, "slice" the WAL into
per-relation WAL files and only recover what's needed when a table is
accessed.
In version 2, the file format in the "snapshots" subdirectory gets
more advanced. The exact format is TODO. But it should support:
- storing WAL records of individual relations/pages
- storing a delta from an older snapshot
- compression
## Operations
### Garbage collection
When you run "zenith gc", old timelines that are no longer needed are
removed. That involves collecting the list of "unreachable" objects,
starting from the named branches and tags.
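A sketch of that reachability walk (the structures are assumed; the ancestor map stands in for the per-timeline 'history' files): start from the timelines named by branches and tags, follow ancestor links, and everything left unmarked is garbage.
```rust
use std::collections::{HashMap, HashSet};

// Hypothetical model: each timeline may record its ancestor (the 'history' file).
type TimelineId = [u8; 16]; // UUID

fn live_timelines(
    refs: &[TimelineId],                         // timelines named by branches/tags
    ancestors: &HashMap<TimelineId, TimelineId>, // child -> parent links
) -> HashSet<TimelineId> {
    let mut live = HashSet::new();
    for &start in refs {
        let mut cur = Some(start);
        while let Some(id) = cur {
            if !live.insert(id) {
                break; // this chain was already visited
            }
            cur = ancestors.get(&id).copied();
        }
    }
    // Everything under .zenith/timelines/ not in `live` is unreachable
    // and can be removed by gc.
    live
}
```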
Also, if enough WAL has been generated on a timeline since the last
snapshot, a new snapshot or delta is created.
### zenith push/pull
Compare the tags and branches on both servers, and copy missing ones.
For each branch, compare the timeline it points to in both servers. If
one is behind the other, copy the missing parts.
FIXME: how do you prevent confusion if you have two clones of the same
repository, launch an instance on the same branch in both clones, and
later try to push/pull between them? Perhaps create a new timeline
every time you start up an instance? Then you would detect that the
timelines have diverged. That would match with the "epoch" concept
that we have in the WAL safekeeper.
### zenith checkout/commit
In this format, there is no concept of a "working tree", and hence no
concept of checking out or committing. All modifications are done on
a branch or a timeline. As soon as you launch a server, the changes are
appended to the timeline.
You can easily fork off a temporary timeline to emulate a "working tree".
You can later remove it and have it garbage collected, or, to "commit",
re-point the branch to the new timeline.
If we want to have a worktree and "zenith checkout/commit" concept, we can
emulate that with a temporary timeline. Create the temporary timeline at
"zenith checkout", and have "zenith commit" modify the branch to point to
the new timeline.

View File

@@ -0,0 +1,93 @@
How it works now
----------------
1. Create repository, start page server on it
```
$ zenith init
...
created main branch
new zenith repository was created in .zenith
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create a branch, and start a Postgres instance on it
```
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
$ zenith pg create heikki
Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
$ zenith pg start pg1
Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
waiting for server to start.... done
server started
```
3. Connect to it and run queries
```
$ psql "dbname=postgres port=55432"
psql (14devel)
Type "help" for help.
postgres=#
```
Proposal: Serverless on your Laptop
-----------------------------------
We've been talking about doing the "pg create" step automatically at
"pg start", to eliminate that step. What if we go further, go
serverless on your laptop, so that the workflow becomes just:
1. Create repository, start page server on it (same as before)
```
$ zenith init
...
created main branch
new zenith repository was created in .zenith
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create branch
```
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
```
3. Connect to it:
```
$ psql "dbname=postgres port=5432 branch=heikki"
psql (14devel)
Type "help" for help.
postgres=#
```
The trick behind the scenes is that when you launch the page server,
it starts to listen on port 5432. When you connect to it with psql, it
looks at the 'branch' parameter that you passed in the connection
string. It automatically performs the "pg create" and "pg start" steps
for that branch, and then forwards the connection to the Postgres
instance that it launched. After you disconnect, if there are no more
active connections to the server running on the branch, it can
automatically shut it down again.
This is how serverless would work in the cloud. We can do it on your
laptop, too.

View File

@@ -0,0 +1,66 @@
# Push and pull between pageservers
Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3, but that would depend on the exact storage format, so we don't touch it in this proposal.
## Origin management
The origin represents connection info for some remote pageserver. Let's use the same commands as git here, except with an explicit list subcommand (git uses `origin -v` for that).
```
zenith origin add <name> <connection_uri>
zenith origin list
zenith origin remove <name>
```
The connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs, or require ssh as a transport, or invent some other kind of transport.
Behind the scenes, these commands may update a toml file inside the .zenith directory.
## Push
### Pushing branch
```
zenith push mybranch cloudserver # push to eponymous branch in cloudserver
zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
```
Exact mechanics would be slightly different in the following situations:
1) Destination branch does not exist.
That is the simplest scenario. We can just create an empty branch (or timeline, in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines, so I suggest skipping any check that there is a common ancestor and just filling it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether a timeline belongs to this pageserver repository or not [*].
The exact mechanics may be the following:
* CLI asks local pageserver to perform push and hands over connection uri: `perform_push <branch_name> <uri>`.
* The local pageserver connects to the remote pageserver and runs `branch_push <branch_name> <timeline_id>`.
The handler for `branch_push` would create the destination timeline and switch the connection to copyboth mode.
* The sending pageserver may then start an iterator on that timeline and send all the records as copy messages.
2) Destination branch exists and latest_valid_lsn is less than ours.
In this case, we need to send the missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send the ones that are newer than the remote LSN. Later we should probably add a sparse bitmap that tracks changed pages, to avoid a full scan.
3) Destination branch exists and latest_valid_lsn is bigger than ours.
In this case, we can't push to that branch. We can only pull.
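To make the three cases concrete, here is a minimal Rust sketch of the decision; `Lsn`, `PushPlan`, and `plan_push` are illustrative stand-ins, not actual pageserver code:
```
// Illustrative sketch of the push decision described above.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64);

enum PushPlan {
    /// Case 1: destination branch is missing -- create it and send everything.
    CreateAndSendAll,
    /// Case 2: destination is behind -- send only records newer than its LSN.
    SendRecordsSince(Lsn),
    /// Destination is already caught up -- nothing to send.
    UpToDate,
    /// Case 3: destination is ahead -- refuse the push; only pull is possible.
    RefusePullInstead,
}

fn plan_push(local_latest: Lsn, remote_latest: Option<Lsn>) -> PushPlan {
    match remote_latest {
        None => PushPlan::CreateAndSendAll,
        Some(remote) if remote < local_latest => PushPlan::SendRecordsSince(remote),
        Some(remote) if remote == local_latest => PushPlan::UpToDate,
        Some(_) => PushPlan::RefusePullInstead,
    }
}
```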
### Pulling branch
Here we need to handle the same three cases, but also keep in mind that the local pageserver can be behind NAT, so we can't trivially reuse pushing by asking the remote to `perform_push` to our address. So we would need a new set of commands:
* CLI calls `perform_pull <branch_name> <uri>` on local pageserver.
* The local pageserver calls `branch_pull <branch_name> <timeline_id>` on the remote pageserver.
* remote pageserver sends records in our direction
But despite the different set of commands, the code that iterates over the records and the receiving code that inserts those records can be the same for both pull and push.
[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
2) Transparently create and manage several repositories in one pageserver.
But that is the topic for a separate RFC/discussion.

View File

@@ -0,0 +1,56 @@
While working on export/import commands, I realized that they fit really well into the "snapshot-first design".
We may think about backups as snapshots in a different format (e.g. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it), and so on). They use the same storage API; the only difference is the code that packs/unpacks the files.
Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
So here is an attempt to design a consistent CLI for different usage scenarios:
#### 1. Start empty pageserver.
That is what we have now.
Init an empty pageserver using `initdb` in a temporary directory.
The `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type; all other parameters are passed via env variables. Inspired by WAL-G-style naming: https://wal-g.readthedocs.io/STORAGES/.
Save `storage_dest` and other parameters in the config.
Push snapshots to `storage_dest` in background.
```
zenith init --storage_dest=S3_PREFIX
zenith start
```
#### 2. Restart pageserver (manually or crash-recovery).
Take `storage_dest` from the pageserver config and start the pageserver from the latest snapshot in `storage_dest`.
Push snapshots to `storage_dest` in background.
```
zenith start
```
#### 3. Import.
Start the pageserver from an existing snapshot.
The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
Do not save `snapshot_path` and `snapshot_format` in the config, as this is a one-time operation.
Save `storage_dest` parameters in the config.
Push snapshots to `storage_dest` in background.
```
// I.e. we want to start zenith on top of an existing $PGDATA and use s3 as persistent storage.
zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
zenith start
```
How to pass credentials needed for `snapshot_path`?
#### 4. Export.
Manually push a snapshot to `snapshot_path`, which differs from `storage_dest`.
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
```
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
```
#### Notes and questions
- walkeeper s3_offload should use the same (or similar) syntax for storage. How do we set it in the UI?
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
- We can think of better names for all options.
- Export to plain postgres format will be useless if we are not 100% compatible at the page level.
I can recall at least one such difference: the PD_WAL_LOGGED flag in pages.

View File

@@ -0,0 +1,227 @@
# Preface
GetPage@LSN can be called with older LSNs, and the page server needs
to be able to reconstruct older page versions. That's needed for
having read-only replicas that lag behind the primary, or that are
"anchored" at an older LSN, and internally in the page server whne you
branch at an older point in time. How do you do that?
For now, I'm not considering incremental snapshots at all. I don't
think that changes things. So whenever you create a snapshot or a
snapshot file, it contains an image of all the pages, there is no need
to look at an older snapshot file.
Also, I'm imagining that this works on a per-relation basis, so that
each snapshot file contains data for one relation. A "relation" is a
fuzzy concept - it could actually be one 1 GB relation segment. Or it
could include all the different "forks" of a relation, or you could
treat each fork as a separate relation for storage purposes. And once
the "non-relational" work is finished, a "relation" could
actually mean some other versioned object kept in the PostgreSQL data
directory. Let's ignore that for now.
# Eric's RFC:
Every now and then, you create a "snapshot". It means that you create
a new snapshot file for each relation that was modified after the last
snapshot, and write out the contents of the relation as it was at the
snapshot LSN. Write-ahead log is stored separately in S3 by the WAL
safekeeping service, in the original PostgreSQL WAL file format.
```
SNAPSHOT @100      WAL
     .              |
     .              |
     .              |
     .              |
SNAPSHOT @200       |
     .              |
     .              |
     .              |
     .              |
SNAPSHOT @300       |
     .              |
     .              V
IN-MEMORY @400
```
If a GetPage@LSN request comes from the primary, you return the latest
page from the in-memory layer. If there is no trace of the page in
memory, it means that it hasn't been modified since the last snapshot,
so you return the page from the latest snapshot, at LSN 300 in the
above example.
PITR is implemented using the original WAL files:
If a GetPage@LSN request comes from a read replica with LSN 250, you
read the image of the page from the snapshot at LSN 200, and you also
scan the WAL between 200 and 250, and apply all WAL records for the
requested page, to reconstruct it at LSN 250.
Scanning the WAL naively for every GetPage@LSN request would be
expensive, so in practice you'd construct, once, an in-memory data
structure over all the WAL between 200 and 250 that allows quickly
looking up the records for a given page.
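For illustration, here is a sketch of such an index in Rust, assuming a `(BufferTag, LSN)` key; with a `BTreeMap`, the per-page lookup becomes a simple range scan (the types are stand-ins, not the actual page server's):
```
use std::collections::BTreeMap;
use std::ops::Bound::{Excluded, Included};

// Stand-ins for the real types.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct BufferTag { rel: u32, blknum: u32 }
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);
struct WalRecord; // a decoded record touching one page

/// Index the WAL range once, so each GetPage@LSN request becomes a
/// cheap range scan instead of a rescan of the whole WAL.
struct WalIndex {
    records: BTreeMap<(BufferTag, Lsn), WalRecord>,
}

impl WalIndex {
    /// All records for `page` in (snapshot_lsn, request_lsn], in LSN
    /// order, ready to be applied on top of the snapshot's page image.
    fn records_for_page(
        &self,
        page: BufferTag,
        snapshot_lsn: Lsn,
        request_lsn: Lsn,
    ) -> impl Iterator<Item = &WalRecord> + '_ {
        self.records
            .range((Excluded((page, snapshot_lsn)), Included((page, request_lsn))))
            .map(|(_, rec)| rec)
    }
}
```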
## Problems/questions
I think you'll need to store the list of snapshot LSNs on each
timeline somewhere.
If the latest snapshot of a relation is at LSN 100, and you request a
page at LSN 1000000, how do you know if there are some modifications
to it between 100 and 1000000 that you need to replay? You can scan
all the WAL between 100 and 1000000, but that would be expensive.
You can skip that, if you know that a snapshot was taken e.g. at LSN
999900. Then you know that the fact that there is no snapshot file at
999900 means that the relation hasn't been modified between
100-999900. Then you only need to scan the WAL between 999900 and
1000000. However, there is no trace of a snapshot happening at LSN
999900 in the snapshot file for this relation, so you need to get
that information from somewhere else.
Where do you get that information from? Perhaps you can scan all the
other relations, and if you see a snapshot file for *any* relation at
LSN 999900, you know that if there were modifications to this
relation, there would be a newer snapshot file for it, too. In other
words, the list of snapshots that have been taken can be constructed
by scanning all relations and computing the union of all snapshot LSNs
that you see for any relation. But that's expensive so at least you
should keep that in memory, after computing it once. Also, if you rely
on that, it's not possible to have snapshots at different intervals
for different files. That seems limiting.
Another option is to explicitly store a list of snapshot LSNs in a
separate metadata file.
# Current implementation in the 'layered_repo' branch:
We store snapshot files like in the RFC, but each snapshot file also
contains all the WAL in the range of LSNs, so that you don't need to
fetch the WAL separately from S3. So you have "layers" like this:
```
SNAPSHOT+WAL 100-200
         |
         |
         |
         |
SNAPSHOT+WAL 200-300
         |
         |
         |
         |
IN-MEMORY 300-
```
Each "snapshot+WAL" is a file that contains a snapshot - i.e. full
copy of each page in the relation, at the *start* LSN. In addition to
that, it contains all the WAL applicable to the relation from the
start LSN to the end LSN. With that, you can reconstruct any page
version in the range that the file covers.
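Here is a minimal sketch of that reconstruction, with `apply_record` standing in for the WAL-redo step (which in reality involves postgres redo routines), and the other names as illustrative stand-ins:
```
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64);
struct WalRecord;
struct Page([u8; 8192]);

fn apply_record(_page: &mut Page, _rec: &WalRecord) {
    // WAL redo would happen here.
}

/// Reconstruct the page as of `request_lsn` from one snapshot+WAL layer:
/// start from the image at the layer's start LSN and replay records.
fn reconstruct(
    base_image: Page,                  // page image at the layer's start LSN
    wal_for_page: &[(Lsn, WalRecord)], // this page's records in (start, end], sorted
    request_lsn: Lsn,
) -> Page {
    let mut page = base_image;
    for (lsn, rec) in wal_for_page {
        if *lsn > request_lsn {
            break; // newer than the requested version: stop replaying
        }
        apply_record(&mut page, rec);
    }
    page
}
```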
## Problems/questions
I can see one potential performance issue here, compared to the RFC.
Let's focus on a single relation for now. Imagine that you start from
an empty relation, and you receive WAL from 100 to 200, containing
a bunch of inserts and updates to the relation. You now have all that
WAL in memory:
memory: WAL from 100-200
We decide that it's time to materialize that to a snapshot file on
disk. We materialize a full image of the relation as it was at LSN 100
to the snapshot file, and include all of the WAL. Since the relation
was initially empty, the "image" at the beginning of the range is empty
too.
So now you have one file on disk:
SNAPSHOT+WAL 100-200
It contains a full image of the relation at LSN 100 and all WAL
between 100-200. (It's actually stored as a serialized BTreeMap of
page versions, with the page images and WAL records all stored
together in the same BTreeMap. But for this story, that's not
important.)
We now receive more WAL updating the relation, up to LSN 300. We
decide it's time to materialize a new snapshot file, and we now have
two files:
SNAPSHOT+WAL 100-200
SNAPSHOT+WAL 200-300
Note that the latest "full snapshot" that we store on disk always lags
behind by one snapshot cycle. The first file contains a full image of
the relation at LSN 100, the second at LSN 200. When we have received
WAL up to LSN 300, we write a materialized image at LSN 200. That
seems a bit silly. In the design per your RFC, you would write
snapshots at LSNs 200 and 300 instead. That seems better.
# Third option (not implemented yet)
Store snapshot files like in the RFC, but also store per-relation
WAL files that contain WAL in a range of LSNs for that relation.
```
SNAPSHOT @100      WAL 100-200
     .              |
     .              |
     .              |
     .              |
SNAPSHOT @200      WAL 200-300
     .              |
     .              |
     .              |
     .              |
SNAPSHOT @300
     .
     .
IN-MEMORY 300-
```
This could be the best of both worlds. The snapshot files would be
independent of the PostgreSQL WAL format. When it's time to write
snapshot file @300, you write a full image of the relation at LSN 300,
and you write the WAL that you had accumulated between 200 and 300 to
a separate file. That way, you don't "lag behind" for one snapshot
cycle like in the current implementation. But you still have the WAL
for a particular relation readily available alongside the snapshot
files, and you don't need to track what snapshot LSNs exist
separately.
(If we wanted to minimize the number of files, we could include the
snapshot @300 and the WAL between 200 and 300 in the same file, but I
feel it's probably better to keep them separate.)
# Further thoughts
There's no fundamental reason why the LSNs of the snapshot files and the
ranges of the WAL files would need to line up. So this would be possible
too:
```
SNAPSHOT @100      WAL 100-150
     .              |
     .              |
     .             WAL 150-250
     .              |
SNAPSHOT @200       |
     .              |
     .             WAL 250-400
     .              |
     .              |
SNAPSHOT @300       |
     .              |
     .              |
IN-MEMORY 300-
```
I'm not sure what the benefit of this would be. You could materialize
additional snapshot files in the middle of a range covered by a WAL
file, maybe? Might be useful to speed up access when you create a new
branch in the middle of an LSN range or if there's some other reason
to believe that a particular LSN is "interesting" and there will be
a lot of requests using it.

View File

@@ -0,0 +1,148 @@
# Snapshot-first storage architecture
Goals:
- Long-term storage of database pages.
- Easy snapshots; simple snapshot and branch management.
- Allow cloud-based snapshot/branch management.
- Allow cloud-centric branching; decouple branch state from running pageserver.
- Allow customer ownership of data via s3 permissions.
- Provide same or better performance for typical workloads, vs plain postgres.
Non-goals:
- Service database reads from s3 (reads should be serviced from the pageserver cache).
- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot).
## Principle of operation
The database “lives in s3”. This means that all of the long-term page storage is in s3, and the “live database” -- the version that lives in the pageserver -- is a set of “dirty pages” that haven't yet been written back to s3.
In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere.
The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not.
It's expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn't seem very useful right now.
Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It's expected that most snapshots are incremental, to keep storage costs low.
It's expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between every 30 seconds and 15 minutes, depending on the cost/performance balance.
No-longer-needed snapshots can be “squashed” -- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that the snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
## Pageserver operation
To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
It's assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 pages is written (regardless of whether the LSN 200 snapshot has completed.)
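The rule in this paragraph can be written down compactly; here is a hedged Rust sketch, with illustrative names only:
```
// Sketch of the "liveness" rule above: a page version is needed while
// it is the newest one, or while some in-progress snapshot reads the
// page as of an LSN between it and the next newer version.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64);

fn can_discard(
    version_lsn: Lsn,
    next_version_lsn: Option<Lsn>, // next newer write of the same page
    snapshots_in_progress: &[Lsn], // snapshot LSNs not yet fully written
) -> bool {
    match next_version_lsn {
        // Newest version of the page: always needed.
        None => false,
        // Discardable unless a pending snapshot captures the page as of
        // an LSN in [version_lsn, next_version_lsn).
        Some(next) => !snapshots_in_progress
            .iter()
            .any(|&s| s >= version_lsn && s < next),
    }
}
```
With the example above: a snapshot triggered at LSN 299 pins the version written at 200 (299 falls in [200, 300)), but not the version at 300 once 400 exists.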
If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. a write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that's still not good enough, we could look at other options (e.g. a redundant pageserver or an EBS page journal).
A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
## Cloud snapshot manager operation
Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
- Create/delete/clone/rename a database
- Create a new branch (possibly from a historical snapshot)
- Start/stop the pageserver/safekeeper on a branch
- List databases/branches/snapshots that are visible to this user account
Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
## Snapshot names, deletion and concurrency
There may be race conditions between operations -- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It's assumed that a concurrent delete won't disrupt a read in flight, but it may be possible for some process to read B's header, and then discover on the next operation that B is gone.
For this reason, any read should fall back to a recovery procedure (list objects; search the list for an equivalent object) if the read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
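Here is a sketch of the naming scheme and the fallback lookup; the separators, hex LSNs, and field types are assumptions, only the `XXXX_YYYY_ZZZZ_DDDD` shape comes from the scheme above:
```
// Hypothetical parsed form of XXXX_YYYY_ZZZZ_DDDD.
struct SnapshotName {
    branch_id: String,
    start_lsn: u64,
    end_lsn: u64,
    created_at: u64, // creation timestamp, used only to disambiguate
}

fn parse_name(name: &str) -> Option<SnapshotName> {
    let mut parts = name.split('_');
    Some(SnapshotName {
        branch_id: parts.next()?.to_string(),
        start_lsn: u64::from_str_radix(parts.next()?, 16).ok()?,
        end_lsn: u64::from_str_radix(parts.next()?, 16).ok()?,
        created_at: parts.next()?.parse().ok()?,
    })
}

/// Fallback after a failed read: an object on the same branch with the
/// same ending LSN is logically equivalent (e.g. the result of a squash).
fn find_equivalent<'a>(
    missing: &SnapshotName,
    listing: &'a [SnapshotName],
) -> Option<&'a SnapshotName> {
    listing
        .iter()
        .find(|s| s.branch_id == missing.branch_id && s.end_lsn == missing.end_lsn)
}
```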
## Branching
A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen:
- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object.
- The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
- The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
Some of these steps could be performed by the pageserver itself, but that would not be possible in all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
## Long-term file format
Snapshot files (and any other object stored in s3) must be readable by future software versions.
It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
Files should contain the following metadata, in addition to the set of pages:
- The version of the file format.
- A unique identifier for this branch (should be worldwide-unique and unchanging).
- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
- The location of the predecessor branch snapshot, if different from this branch's location.
- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only.
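Transcribing the metadata list above into a Rust struct, purely as a sketch (field names and types are assumptions, not the actual on-disk layout):
```
// Illustrative header for a snapshot file; sha256 is one SHA2 variant.
struct SnapshotFileHeader {
    format_version: u32,
    branch_id: [u8; 16],                     // worldwide-unique, unchanging
    human_names: Vec<String>,                // optional, debugging/UI only
    predecessor_snapshot: Option<[u8; 16]>,  // parent for increments / new branches
    predecessor_location: Option<String>,    // if the parent lives elsewhere
    lsn_range: (u64, u64),                   // (parent, latest]; parent == 0 if complete
    created_at_utc: u64,
    sha256: [u8; 32],                        // checksum over the rest of the file
}
```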
## S3 semantics, and other kinds of storage
For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don't edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
Some users may want to use a local or network filesystem in place of s3. This isn't prohibited, but it's not a priority, either.
Alternate implementations of s3 should be supported, including Google Cloud Storage.
Azure Blob Storage should be supported. We assume (without evidence) that it's semantically equivalent to s3 for this purpose.
The properties of s3 that we depend on are:
- list objects
- streaming read of an entire object
- read a byte range from an object
- streaming write of a new object (may use multipart upload for better reliability)
- delete an object (which should not disrupt an already-started read)
Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they're supposed to. Incorrect, corrupt, or malicious-looking contents should cause the software (cloud tools, pageserver) to fail gracefully.
## Notes
Possible simplifications, for a first draft implementation:
- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
- Don't worry about the details of the squashing process yet.
- Don't implement a cloud metadata service; try to make everything work using basic s3 list-objects and reads.
- Don't implement rename or delete at first.
- Don't implement public/private, just use s3 permissions.
- Don't worry about sharing history yet -- each user has their own bucket and a full copy of all data.
- Don't worry about history that spans multiple buckets.
- Don't worry about s3 regions.
- Don't support user-writeable s3 buckets; users get read-only access at most.
Open questions:
- How important is point-in-time recovery? When should we add this? How should it work?
- Should snapshot files use compression?
- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they're created.
- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
- How can pageserver software upgrade be done with minimal downtime?

View File

@@ -0,0 +1,144 @@
# Storage details
Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
## Overview
![storage](images/storage.jpeg)
### MemStore
MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of the PageIndex, which holds references to WAL records or pages; the PageStore, which stores recently materialized pages; and the WalStore, which stores recently received WAL.
### PageIndex
PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset):
* PageStoreRef -- page offset in the PageStore
* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore
PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots, we can actually avoid storing references to the full snapshot and calculate page offsets from the relation-size metadata in the full snapshot (assuming the full snapshot stores pages sorted by page number). However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized.
We assume that the PageIndex is a few orders of magnitude smaller than the data it addresses, hence it should fit in memory. We also don't need crash tolerance, as we can rebuild it from snapshot metadata and WAL records from the WalStore and/or Safekeeper.
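As a sketch, the PageIndex described above could look like this; the exact fields are assumptions:
```
use std::collections::BTreeMap;

// Stand-ins for the real key types.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct BufferTag { rel: u32, blknum: u32 }
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

enum PageRef {
    /// PageStoreRef: offset of a materialized page in the PageStore.
    PageStore { offset: u64 },
    /// LocalStoreRef: a page inside an on-disk snapshot file.
    LocalStore { snapshot_id: u64, offset: u64 },
    /// WalStoreRef: offset (and size) of a WalRecord in the WalStore.
    WalStore { offset: u64, size: u32 },
}

/// Ordered, so "latest version of this page at or before LSN x" is a
/// range scan; assumed to fit in memory and to be rebuildable on crash.
type PageIndex = BTreeMap<(BufferTag, Lsn), PageRef>;
```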
### WalStore
WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would be also responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
### PageStore
PageStore is storage for recently materialized pages (in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
There are a few possible options for the PageStore:
a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex.
I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete.
With option b) we can also treat the PageStore as an incomplete incremental snapshot.
### LocalStore
LocalStore keeps the latest full snapshot and a set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
## Granularity
By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account:
* can we shard big databases between page servers?
* how much time will we spend applying WAL to access certain pages at older LSNs?
* how many files do we create for a single database?
I can think of the following options here:
1. whole database goes to one full snapshot.
* +: we never create a lot of files for one database
* +: the approach is quite straightforward, moving data around is simple
* -: can not be sharded
* -: long recovery -- we always need to recover the whole database
2. table segment is the unit of snapshotting
* +: straightforward for sharding
* +: individual segment can be quickly recovered with sliced WAL
* -: a full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend an eternity in directory scans, and the amount of metadata for sharding can also be quite big.
3. range-partitioned snapshots -- a snapshot includes all pages in [BuffTagLo, BuffTagHi], mixing different relations, databases, and potentially clusters (albeit from one tenant only). When a full snapshot outgrows a certain limit (which could also be a few gigabytes), we split the snapshot in two during the next full snapshot write. This approach would also require pages to be sorted by BuffTag inside our snapshots.
* +: addresses all mentioned issues
* -: harder to implement
I think it is okay to start with table-segment granularity, check how we perform in the case of lots of small tables, and see whether there is any way besides option 3 to deal with it.
Both PageStore and WalStore should be "sharded" by this granularity level.
## Security
We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their own S3 bucket credentials.
Also, S3 backups are usually encrypted with per-tenant private keys. I'm not sure in what threat model such encryption would improve anything (taking per-tenant IAM keys into account), but it seems that everybody is doing it (both AMZN and YNDX). Most likely it comes as a "cold backups" requirement from some certification procedure.
## Dynamics
### WAL stream handling
When a new WAL record is received, we need to parse the BufferTags in that record and insert them into the PageIndex with a WalStoreRef as the value.
### getPage queries
Look up the page in the PageIndex. If the value is a page reference, just respond with that page. If the referenced value is a WAL record, find the most recent page with the same BuffTag (that is why we need ordering in the PageIndex); recover the page by applying the WAL records; save it in the PageStore; respond with that page.
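A sketch of that lookup path, reusing the types from the PageIndex sketch above; `read_page` and `apply_wal_record` are stubs for the actual store reads and WAL redo:
```
// Stubs for the actual store reads and WAL redo.
fn read_page(_r: &PageRef) -> Vec<u8> {
    vec![0; 8192] // would fetch the image via the Page/LocalStore ref
}
fn apply_wal_record(_page: &mut Vec<u8>, _rec: &PageRef) {
    // would fetch the record via the WalStoreRef and run WAL redo
}

/// getPage: walk versions of `page` at or below `lsn`, newest first,
/// collecting WAL records until a materialized page is found.
fn get_page(index: &PageIndex, page: BufferTag, lsn: Lsn) -> Option<Vec<u8>> {
    let mut pending_wal = Vec::new();
    for (_, entry) in index.range((page, Lsn(0))..=(page, lsn)).rev() {
        match entry {
            PageRef::PageStore { .. } | PageRef::LocalStore { .. } => {
                let mut img = read_page(entry);
                // Apply the collected records oldest-first on top of the image.
                for rec in pending_wal.iter().rev() {
                    apply_wal_record(&mut img, rec);
                }
                return Some(img);
            }
            PageRef::WalStore { .. } => pending_wal.push(entry),
        }
    }
    None // the page does not exist at this LSN
}
```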
### Starting page server without local data
* build set of latest full snapshots and incremental snapshots on top of them
* load all their metadata into PageIndex
* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot
* for databases that are connected to us through the Safekeeper, we can start loading the set of the latest snapshots, or we can do that lazily based on getPage requests (I would avoid doing it lazily for now, since we have no access stats from the previous run, and just transfer all data for active databases from S3 to the LocalStore).
### Starting page server with local data (aka restart or reboot)
* check that local snapshot files are consistent with S3
### Snapshot creation
Track the size of future snapshots based on info in the MemStore, and when it exceeds some threshold (taking into account our granularity level), create a new incremental snapshot. Always emit incremental snapshots from the MemStore.
To create a new snapshot we need to walk through the WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from the PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream, to avoid parsing WAL during snapshot creation.
Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
### S3 pushdown
When we have several full snapshots, GC can push the old one, with its increments, to S3.
### Branch creation
Create a new timeline and replay the sliced WAL up to the requested point. When a page is not in the PageIndex, ask the parent timeline for it. Relation sizes are tricky.
## File formats
As far as I understand, Bookfile/Aversion addresses the versioning and serialization parts.
As for the exact data that should go into snapshots, I think it is the following for each snapshot:
* format version number
* a set of key/values to interpret the content (e.g. is page compression enabled, is this a full or incremental snapshot, the previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to the reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward-compatible to the file, we can keep the version number.
* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile
* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
* pages, one by one
* WAL records, one by one
It is also important to be able to load metadata quickly, since it is one of the main factors impacting page server startup time. E.g. if we store/cache about 10TB of data per page server, the size of the uncompressed page references would be about 30GB (10TB / (8192-byte page size / (~18 bytes per ObjectTag + 8 bytes of file offset))).
1) Since our ToC/array of entries can be sorted by ObjectTag, we can store the whole BufferTag only when relation_id changes, and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages follow the same order and the offset deltas are small).
2) It makes sense to keep the ToC at the beginning of the file to avoid extra seeks to locate it. This doesn't matter too much with local files but matters on S3 -- if we are accessing a lot of ~1GB files with ~1MB of metadata each, then the time to transfer this metadata is comparable to the access latency itself (which is about half a second). So by slurping the metadata with one read of the file header instead of N reads, we can improve the speed of page server start by a factor of N.
I think both of these optimizations can be done later, but they are something to keep in mind when we are designing our storage serialization routines.
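For illustration, here is a sketch of optimization 1; the encoding details (u32 deltas, one ToC group per relation) are assumptions, not the actual serialization format:
```
// One ToC group per relation: the full tag info is stored once, and
// per-page offsets are small deltas from the previous page's offset.
struct RelationToc {
    rel_id: u32,              // full BufferTag info once per relation
    first_offset: u64,        // absolute offset of the relation's first page
    offset_deltas: Vec<u32>,  // small deltas instead of 8-byte offsets
}

fn page_offset(toc: &RelationToc, page_idx: usize) -> u64 {
    // Absolute offset = first offset + sum of deltas before this page.
    toc.first_offset
        + toc.offset_deltas[..page_idx]
            .iter()
            .map(|&d| u64::from(d))
            .sum::<u64>()
}
```
A real implementation would likely varint-encode the deltas, which is where the "less than 4 bytes per page" estimate comes from.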
Also, there have been some discussions about how to embed WAL in incremental snapshots. So far the following ideas have been mentioned:
1. snapshot lsn=200, includes WAL in range 200-300
2. snapshot lsn=200, includes WAL in range 100-200
3. data snapshots are separated from WAL snapshots
Both options 2 and 3 look good. I'm inclined towards option 3, as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep a data snapshot until the next full snapshot, but push WAL snapshots to S3 as soon as they appear if there are no replicas).

View File

@@ -0,0 +1,91 @@
# User-visible timeline history
The user can specify a retention policy. The retention policy is
presented to the user as a PITR period and snapshots. The PITR period
is the amount of recent history that needs to be retained, as minutes,
hours, or days. Within that period, you can create a branch or
snapshot at any point in time, open a compute node, and start running
queries. Internally, a PITR period is represented as a range of LSNs.
The user can also create snapshots. A snapshot is a point in time,
internally represented by an LSN. The user gives the snapshot a name.
The user can also specify an interval, at which the system creates
snapshots automatically. For example, create a snapshot every night at
2 AM. After some user-specified time, old automatically created
snapshots are removed.
```
            Snapshot       Snapshot
  PITR      "Monday"       "Tuesday"         PITR
----######----------+-------------+-------------######>
```
If there are multiple branches, you can specify different policies for
different branches.
The PITR period and user-visible snapshots together define the
retention policy.
NOTE: As presented here, this is probably overly flexible. In reality,
we want to keep the user interface simple. Only allow a PITR period at
the tip of a branch, for example. But that doesn't make much
difference to the internals.
# Retention policy behind the scenes
The retention policy consists of points (for snapshots) and ranges
(for PITR periods).
The system must be able to reconstruct any page within the retention
policy. Other page versions can be garbage collected away. We have a
lot of flexibility on when to perform the garbage collection and how
aggressive it is.
# Base images and WAL slices
The page versions are stored in two kinds of files: base images and
WAL slices. A base image contains a dump of all the pages of one
relation at a specific LSN. A WAL slice contains all the WAL in an LSN
range.
```
|
|
|
|    --Base img @100  +
|                     |
|                     |  WAL slice
|                     |  100-200
|                     |
|    --Base img @200  +
|                     |
|                     |  WAL slice
|                     |  200-300
|                     |
|                     +
|
V
```
To recover a page e.g. at LSN 150, you need the base image at LSN 100,
and the WAL slice 100-200.
All of this works on a per-relation or per-relation-segment basis. If
a relation is updated very frequently, we create base images and WAL
slices for it more quickly. For a relation that's updated
infrequently, we hold the recent WAL for that relation longer, and
only write it out when we need to release the disk space occupied by
the original WAL. (We need a backstop like that, because until all the
WAL/base images have been durably copied to S3, we must keep the
original WAL for that period somewhere, in the WAL service or in S3.)
# Branching
Internally, branch points are also "retention points", in addition to
the user-visible snapshots. If a branch has been forked off at LSN
100, we need to be able to reconstruct any page on the parent branch
at that LSN, because it is needed by the child branch. If a page is
modified in the child, we don't need to keep that in the parent
anymore, though.

View File

@@ -0,0 +1,38 @@
# Eviction
Write out in-memory layer to disk, into a delta layer.
- To release memory
- To make it possible to advance disk_consistent_lsn and allow the WAL
service to release some WAL.
- Triggered if we are short on memory
- Or if the oldest in-memory layer is so old that it's holding back
the WAL service from removing old WAL
# Materialization
Create a new image layer of a segment, by performing WAL redo
- To reduce the amount of WAL that needs to be replayed on a GetPage request.
- To allow garbage collection of old layers
- Triggered by distance to last full image of a page
# Coalescing
Replace N consecutive layers of a segment with one larger layer.
- To reduce the number of small files that need to be uploaded to S3
# Bundling
Zip together multiple small files belonging to different segments.
- To reduce the number of small files that need to be uploaded to S3
# Garbage collection
Remove a layer that's older than the GC horizon, and isn't needed anymore.

View File

@@ -0,0 +1,147 @@
# What
Currently, apart from the WAL, the safekeeper persistently stores only two
logical clock counter (aka term) values, sourced from the same sequence. The
first is bumped whenever the safekeeper gives its vote to a proposer (or
acknowledges an already elected one) and e.g. prevents electing two proposers
with the same term -- it is actually called `term` in the code. The second,
called `epoch`, reflects the progress of log receival and might lag behind
`term`; the safekeeper switches to epoch `n` when it has received all committed
log records from all `< n` terms. This roughly corresponds to what is proposed
in https://github.com/zenithdb/rfcs/pull/3/files
This is our biggest difference from Raft. In Raft, every log record is
stamped with the term in which it was generated; we essentially store in
`epoch` only the term of the highest record on this safekeeper -- when we know
it -- because during recovery we generally don't, and `epoch` is bumped directly
to the term of the proposer who performs the recovery, when it is finished. It
is not immediately obvious that this simplification is safe. I thought and
still think it is; model checking confirmed that. However, some details now
make me believe it is better to keep the full term switching history (which is
equivalent to knowing the term of each record).
# Why
Without knowing the full history of terms (a list of <term, LSN> pairs) it is
hard to determine the exact divergence point, and if we don't perform
truncation at that point, safety becomes questionable. Consider the following
history, with safekeepers A, B, C, D, E. n.m means a record created by the
proposer in term n with LSN m; (t=x, e=y) means the safekeeper currently has
term x and epoch y.
1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
on A.
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=1, e=1) 1.1
D(t=1, e=1) 1.1
E(t=1, e=1) 1.1
</pre>
2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=2, e=2) 1.1 2.2 2.3
D(t=2, e=2) 1.1 2.2 2.3
E(t=2, e=1) 1.1
</pre>
3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
</pre>
Now, A gets back and P3 starts recovering it. How should it proceed? There are
two options.
## Don't try to find divergence point at all
...start sending WAL conservatively from the horizon (1.1), and truncate the
obsolete part of the WAL only when recovery is finished, i.e. when epochStartLsn
(4) is reached, i.e. 2.3 is transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
Then the following is possible:
4) P3 moves one record 2.2 to A.
<pre>
A(t=1, e=1) 1.1 <b>2.2</b> 1.3 1.4
B(t=1, e=1) 1.1 1.2
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
</pre>
Now the log of A is basically corrupted. Moreover, since A, B, E are all in
epoch 1 and A's log is the longest one, they can elect a P4 who will commit
such a log.
Note that this particular history couldn't happen if we forbade *creating* new
records in term n until a majority of safekeepers has switched to it. It would
force CDE to switch to 2 before 2.2 is created, and A could never become the
donor while its log is corrupted. Generally, with this additional barrier I
believe the algorithm becomes safe, but
- I don't like this kind of artificial barrier;
- I also feel somewhat uncomfortable about even temporarily having
intentionally corrupted WAL;
- I'd still want to model check the idea.
## Find divergence point and truncate at it
Then step 4 would delete 1.3 and 1.4 on A, and we are ok. The question is, how
do we do that? Without the term switching history we have to resort to sending
everything again from the horizon and memcmp'ing records, which is inefficient
and ugly. Or we can maintain the full history and determine the truncation
point by comparing the 'wrong' and 'right' histories -- much like pg_rewind
does -- and perform the truncation + start streaming right there.
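Here is one plausible shape for that comparison, assuming each history entry records the LSN at which a term begins; this is a sketch, not actual safekeeper code:
```
// Each history is ordered by term; entries record where a term begins.
#[derive(Clone, Copy, PartialEq)]
struct HistEntry { term: u64, start_lsn: u64 }

/// First LSN at which `ours` and `theirs` disagree: WAL before it is
/// common; WAL at/after it on the "wrong" side must be truncated.
fn divergence_point(ours: &[HistEntry], theirs: &[HistEntry]) -> Option<u64> {
    for (a, b) in ours.iter().zip(theirs.iter()) {
        if a != b {
            // Histories diverge where the first differing term begins;
            // take the minimum to truncate conservatively.
            return Some(a.start_lsn.min(b.start_lsn));
        }
    }
    match (ours.len(), theirs.len()) {
        (n, m) if n < m => Some(theirs[n].start_lsn), // they have extra terms
        (n, m) if n > m => Some(ours[m].start_lsn),   // we have extra terms
        _ => None, // identical histories: nothing to truncate
    }
}
```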
# Proposal
- Add term history as array of <term, LSN> pairs to safekeeper controlfile.
- Return it to the proposer with VoteResponse so that 1) the proposer can tell
it to other nodes and 2) it can determine each node's streaming starting point.
However, since we don't append WAL and update the controlfile atomically, let's
always update the controlfile first but send only the history of what we really
have (up to the highest term in the history where begin_lsn >= end of WAL; this
highest term replaces the current `epoch`). We also send the end of WAL, as we
do now, to determine the donor.
- Create a ProposerAnnouncement message which the proposer sends before
starting streaming. It announces the proposer as elected and
1) truncates the wrong part of the WAL on the safekeeper
(the divergence point is already calculated at the proposer, but can be
cross-verified here);
2) communicates the 'right' history of its term (taken from the donor). It
seems better to immediately put the history in the controlfile, though the
safekeeper might not have the full WAL for previous terms in it -- this way is
simpler, and we can't update the WAL and controlfile atomically anyway.
This also constitutes an analogue of the current epoch bump for those
safekeepers which don't need recovery, which is important for sync-safekeepers
(bump the epoch without waiting for records from the new term).
- After the ProposerAnnouncement, the proposer streams WAL from the calculated
starting point -- only what is missing.
pros/cons:
+ (more) clear safety of WAL truncation -- we get very close to Raft
+ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters
only for 5+ nodes)
+ adds some observability at safekeepers
- complexity, but not that much
# Misc
- During model checking I did the truncation at the first locally non-existent
or differing record -- an analogue of the 'memcmp' variant described above.

View File

@@ -0,0 +1,142 @@
# Why LSM trees?
In general, an LSM tree has the nice property that random updates are
fast, but the disk writes are sequential. When a new file is created,
it is immutable. New files are created and old ones are deleted, but
existing files are never modified. That fits well with storing the
files on S3.
Currently, we create a lot of small files. That is mostly a problem
with S3, because each GET/PUT operation is expensive, and LIST
operation only returns 1000 objects at a time, and isn't free
either. Currently, the files are "archived" together into larger
checkpoint files before they're uploaded to S3 to alleviate that
problem, but garbage collecting data from the archive files would be
difficult and we have not implemented it. This proposal addresses that
problem.
# Overview
```
^ LSN
|
| Memtable: +-----------------------------+
| | |
| +-----------------------------+
|
|
| L0: +-----------------------------+
| | |
| +-----------------------------+
|
| +-----------------------------+
| | |
| +-----------------------------+
|
| +-----------------------------+
| | |
| +-----------------------------+
|
| +-----------------------------+
| | |
| +-----------------------------+
|
|
| L1: +-------+ +-----+ +--+ +-+
| | | | | | | | |
| | | | | | | | |
| +-------+ +-----+ +--+ +-+
|
| +----+ +-----+ +--+ +----+
| | | | | | | | |
| | | | | | | | |
| +----+ +-----+ +--+ +----+
|
+--------------------------------------------------------------> Page ID
+---+
| | Layer file
+---+
```
# Memtable
When new WAL arrives, it is first put into the Memtable. Despite the
name, the Memtable is not a purely in-memory data structure. It can
spill to a temporary file on disk if the system is low on memory, and
is accessed through a buffer cache.
If the page server crashes, the Memtable is lost. It is rebuilt by
reprocessing the WAL that's newer than the latest layer in L0.
The size of the Memtable is equal to the "checkpoint distance", or the
amount of WAL that we need to keep in the safekeeper.
# L0
When the Memtable fills up, it is written out to a new file in L0. The
files are immutable; when a file is created, it is never
modified. Each file in L0 is roughly 1 GB in size (*). Like the
Memtable, each file in L0 covers the whole key range.
When enough files have been accumulated in L0, compaction
starts. Compaction processes all the files in L0 and reshuffles the
data to create a new set of files in L1.
(*) except in corner cases, e.g. if we want to shut down the page
server and flush the memtable to disk even though it's not
full yet.
# L1
L1 consists of ~ 1 GB files like L0. But each file covers only part of
the overall key space, and a larger range of LSNs. This speeds up
searches. When you're looking for a given page, you need to check all
the files in L0, to see if they contain a page version for the requested
page. But in L1, you only need to check the files whose key range covers
the requested page. This is particularly important at cold start, when
checking a file means downloading it from S3.
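The search rule can be sketched as a filter over the layer map; the types are illustrative stand-ins, not the pageserver's actual layer map:
```
// Page ids on the x-axis of the diagram above.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Key(u64);

struct Layer {
    key_start: Key,
    key_end: Key, // exclusive
    level: u8,    // 0 or 1
}

/// Which layers might hold a version of `page`: every L0 layer spans
/// the whole key space, so all of them qualify; an L1 layer qualifies
/// only when its key range covers the page.
fn layers_to_check(layers: &[Layer], page: Key) -> impl Iterator<Item = &Layer> {
    layers.iter().filter(move |l| {
        l.level == 0 || (l.key_start <= page && page < l.key_end)
    })
}
```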
Partitioning by key range also helps with garbage collection. If only a
part of the database is updated, we will accumulate more files for
the hot part in L1, and old files can be removed without affecting the
cold part.
# Image layers
So far, we've only talked about delta layers. In addition to the delta
layers, we create image layers when "enough" WAL has been accumulated
for some part of the database. Each image layer covers a 1 GB range of
key space. It contains images of the pages at a single LSN, a snapshot
if you will.
The exact heuristic for what "enough" means is not clear yet. Maybe
create a new image layer when 10 GB of WAL has been accumulated for a
1 GB segment.
The image layers limit the number of layers that a search needs to
check. That puts a cap on read latency, and it also allows garbage
collecting layers that are older than the GC horizon.
# Partitioning scheme
When compaction happens and creates a new set of files in L1, how do
we partition the data into the files?
- The goal is that each file is ~1 GB in size
- Try to match partition boundaries at relation boundaries. (See [1]
for how PebblesDB does this, and for why that's important)
- Greedy algorithm
# Additional Reading
[1] Paper on PebblesDB and how it does partitioning.
https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf

95
docs/rfcs/README.md Normal file
View File

@@ -0,0 +1,95 @@
This directory contains Request for Comments documents, or RFCs, for
features or concepts that have been proposed. Alternative names:
technical design doc, ERD, one-pager.
To make a new proposal, create a new text file in this directory and
open a Pull Request with it. That gives others a chance and a forum
to comment and discuss the design.
When a feature is implemented and the code changes are committed, also
include the corresponding RFC in this directory.
Some of the RFCs in this directory have been implemented in some form
or another, others are on the roadmap, and still others are just
obsolete and forgotten. So read them with a grain of salt; hopefully
even the ones that don't reflect reality give useful context
information.
## What
We use Tech Design RFCs to summarize what we are planning to
implement in our system. These RFCs should be created for large or
non-obvious technical tasks, e.g. changes to the architecture, bigger
tasks that could take over a week, or changes that touch multiple
components or their interaction. RFCs should fit into a couple of
pages, but could be longer on occasion.
## Why
We're using RFCs to enable early review and collaboration, to reduce
uncertainty and risk, and to save time during the implementation phase
that follows the Tech Design RFC.
Tech Design RFCs also aim to reduce the bus factor, and are an
additional measure to keep more peers up to date and familiar with our
design and architecture.
This is crucial for ensuring collaboration across timezones and for
setting up a distributed team working on complex topics for success.
## Prior art
- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
## How
RFC lifecycle:
- An RFC should be submitted in a pull request, with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections included in the PR body.
- An RFC should be published for review before most of the actual code is written. This isn't a strict rule; don't hesitate to experiment and build a POC in parallel with writing the RFC.
- Add labels to the PR in the same manner as you do for Issues. Example TBD.
- Request reviews from your peers. Reviewing your peers' RFCs is a priority, the same as reviewing actual code.
- The Tech Design RFC should evolve based on the feedback received, and further during the development phase if problems are discovered with the chosen approach.
- RFCs stop evolving once consensus is reached or the proposal is implemented and merged.
- RFCs are not intended as documentation that's kept up to date **after** the implementation is finished. Do not update a Tech Design RFC when the merged functionality evolves later on; in such situations a new RFC may be appropriate.
### RFC template
Note: many of the sections are marked "if relevant". They are included in the template as a reminder and to help inspiration.
```
# Name
Created on ..
Implemented on ..
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)
```
(Binary image file added: 421 KiB. Contents not shown.)
@@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] }
byteorder = "1.4.3"
futures = "0.3.13"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
log = "0.4.14"
clap = "3.0"
@@ -50,6 +51,7 @@ postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
plotly = "0.7.0"
[dev-dependencies]
hex-literal = "0.3"
@@ -22,6 +22,7 @@ use tar::{Builder, EntryType, Header};
use crate::relish::*;
use crate::repository::Timeline;
use crate::DatadirTimelineImpl;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
@@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn;
/// used for constructing tarball.
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
timeline: &'a Arc<dyn Timeline>,
timeline: &'a Arc<DatadirTimelineImpl>,
pub lsn: Lsn,
prev_record_lsn: Lsn,
}
@@ -46,7 +47,7 @@ pub struct Basebackup<'a> {
impl<'a> Basebackup<'a> {
pub fn new(
write: &'a mut dyn Write,
timeline: &'a Arc<dyn Timeline>,
timeline: &'a Arc<DatadirTimelineImpl>,
req_lsn: Option<Lsn>,
) -> Result<Basebackup<'a>> {
// Compute postgres doesn't have any previous WAL files, but the first
@@ -64,13 +65,13 @@ impl<'a> Basebackup<'a> {
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
timeline.wait_lsn(req_lsn)?;
timeline.tline.wait_lsn(req_lsn)?;
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.get_last_record_rlsn();
let end_of_timeline = timeline.tline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
@@ -78,7 +79,7 @@ impl<'a> Basebackup<'a> {
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.get_last_record_rlsn();
let end_of_timeline = timeline.tline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
@@ -115,21 +116,24 @@ impl<'a> Basebackup<'a> {
}
// Gather non-relational files from object storage pages.
for obj in self.timeline.list_nonrels(self.lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
self.add_slru_segment(slru, segno)?;
}
RelishTag::FileNodeMap { spcnode, dbnode } => {
self.add_relmap_file(spcnode, dbnode)?;
}
RelishTag::TwoPhase { xid } => {
self.add_twophase_file(xid)?;
}
_ => {}
for kind in [
SlruKind::Clog,
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
self.add_slru_segment(kind, segno)?;
}
}
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
}
for xid in self.timeline.list_twophase_files(self.lsn)? {
self.add_twophase_file(xid)?;
}
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file()?;
self.ar.finish()?;
@@ -141,27 +145,14 @@ impl<'a> Basebackup<'a> {
// Generate SLRU segment files from repository.
//
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let seg_size = self
.timeline
.get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?;
if seg_size == None {
trace!(
"SLRU segment {}/{:>04X} was truncated",
slru.to_str(),
segno
);
return Ok(());
}
let nblocks = seg_size.unwrap();
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
let mut slru_buf: Vec<u8> =
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
for blknum in 0..nblocks {
let img =
self.timeline
.get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
assert!(img.len() == pg_constants::BLCKSZ as usize);
slru_buf.extend_from_slice(&img);
@@ -176,16 +167,26 @@ impl<'a> Basebackup<'a> {
}
//
// Extract pg_filenode.map files from repository
// Along with them also send PG_VERSION for each database.
// Include database/tablespace directories.
//
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
let img = self.timeline.get_page_at_lsn(
RelishTag::FileNodeMap { spcnode, dbnode },
0,
self.lsn,
)?;
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
// Each directory contains a PG_VERSION file, and the default database
// directories also contain pg_filenode.map files.
//
fn add_dbdir(
&mut self,
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
assert!(img.len() == 512);
Some(img)
} else {
None
};
if spcnode == pg_constants::GLOBALTABLESPACE_OID {
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
@@ -193,9 +194,32 @@ impl<'a> Basebackup<'a> {
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
String::from("global/pg_filenode.map") // filenode map for global tablespace
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar.append(&header, &img[..])?;
} else {
warn!("global/pg_filenode.map is missing");
}
} else {
// User defined tablespaces are not supported
// User defined tablespaces are not supported. However, as
// a special case, if a tablespace/db directory is
// completely empty, we can leave it out altogether. This
// makes taking a base backup after the 'tablespace'
// regression test pass, because the test drops the
// created tablespaces after the tests.
//
// FIXME: this wouldn't be necessary, if we handled
// XLOG_TBLSPC_DROP records. But we probably should just
// throw an error on CREATE TABLESPACE in the first place.
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, self.lsn)?
.is_empty()
{
return Ok(());
}
assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
// Append dir path for each database
@@ -203,16 +227,17 @@ impl<'a> Basebackup<'a> {
let header = new_tar_header_dir(&path)?;
self.ar.append(&header, &mut io::empty())?;
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
format!("base/{}/pg_filenode.map", dbnode)
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar.append(&header, &img[..])?;
}
};
assert!(img.len() == 512);
let header = new_tar_header(&path, img.len() as u64)?;
self.ar.append(&header, &img[..])?;
Ok(())
}
@@ -220,9 +245,7 @@ impl<'a> Basebackup<'a> {
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self
.timeline
.get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -242,11 +265,11 @@ impl<'a> Basebackup<'a> {
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
let checkpoint_bytes = self
.timeline
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
.get_checkpoint(self.lsn)
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
.get_control_file(self.lsn)
.context("failed get control bytes")?;
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
@@ -267,7 +290,7 @@ impl<'a> Basebackup<'a> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
if self.lsn == self.timeline.tline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")?;
@@ -2,13 +2,14 @@
//!
//! Usually it's easier to write python perf tests, but here the performance
//! of the tester matters, and the API is easier to work with from rust.
use std::{collections::HashMap, io::{BufRead, BufReader, Cursor}, net::SocketAddr, ops::AddAssign};
use std::{collections::HashMap, io::{BufRead, BufReader, Cursor}, net::SocketAddr, ops::AddAssign, time::Duration};
use byteorder::ReadBytesExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use bytes::{BufMut, Bytes, BytesMut};
use clap::{App, Arg};
use std::fs::File;
use zenith_utils::{GIT_VERSION, pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage}};
use std::time::Instant;
use anyhow::Result;
@@ -54,7 +55,6 @@ pub async fn get_page(
102 => {
let mut page = Vec::<u8>::new();
cursor.read_to_end(&mut page).await?;
dbg!(page.len());
if page.len() != 8 * 1024 {
panic!("Expected 8kb page, got: {:?}", page.len());
}
@@ -206,16 +206,38 @@ async fn main() -> Result<()> {
// TODO be mindful of caching, take multiple measurements, use monotonic time.
// TODO make harder test case. More writes, fewer images.
// TODO concurrent requests: multiple reads, also writes.
use std::time::Instant;
for (lsn, _pages) in writes_per_entry {
if lsn >= *first_update {
println!("Running get_page {:?} at {:?}", hottest_page, lsn);
// Do some warmup
let _page = get_page(&mut socket, &first_update, &hottest_page).await?;
let mut results: Vec<(Lsn, Duration)> = vec![];
for (i, (lsn, _pages)) in writes_per_entry.iter().enumerate() {
if lsn >= first_update {
// Just to speed things up
if i % 1000 != 0 {
continue
}
// println!("Running get_page {:?} at {:?}", hottest_page, lsn);
let start = Instant::now();
let _page = get_page(&mut socket, &lsn, &hottest_page).await?;
let duration = start.elapsed();
println!("Time: {:?}", duration);
results.push((lsn.clone(), duration));
// println!("Time: {:?}", duration);
}
}
results.sort();
let x: Vec<_> = results.iter().map(|(lsn, _)| lsn.0).collect();
let y: Vec<_> = results.iter().map(|(_, duration)| duration.as_micros()).collect();
use plotly::{Plot, Scatter};
use plotly::common::Mode;
let get_page_trace = Scatter::new(x, y).name("get_page").mode(Mode::Lines);
let mut plot = Plot::new();
plot.add_trace(get_page_trace);
plot.show();
Ok(())
}
@@ -31,7 +31,8 @@ pub mod defaults {
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s";
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: &str = "100 s";
@@ -57,7 +58,7 @@ pub mod defaults {
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}'
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#gc_period = '{DEFAULT_GC_PERIOD}'
#gc_horizon = {DEFAULT_GC_HORIZON}
@@ -91,7 +92,9 @@ pub struct PageServerConf {
// This puts a backstop on how much WAL needs to be re-digested if the
// page server crashes.
pub checkpoint_distance: u64,
pub checkpoint_period: Duration,
// How often to check if there's compaction work to be done.
pub compaction_period: Duration,
pub gc_horizon: u64,
pub gc_period: Duration,
@@ -145,7 +148,8 @@ struct PageServerConfigBuilder {
listen_http_addr: BuilderValue<String>,
checkpoint_distance: BuilderValue<u64>,
checkpoint_period: BuilderValue<Duration>,
compaction_period: BuilderValue<Duration>,
gc_horizon: BuilderValue<u64>,
gc_period: BuilderValue<Duration>,
@@ -179,8 +183,8 @@ impl Default for PageServerConfigBuilder {
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)
.expect("cannot parse default checkpoint period")),
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period")),
gc_horizon: Set(DEFAULT_GC_HORIZON),
gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period")),
@@ -216,8 +220,8 @@ impl PageServerConfigBuilder {
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
}
pub fn checkpoint_period(&mut self, checkpoint_period: Duration) {
self.checkpoint_period = BuilderValue::Set(checkpoint_period)
pub fn compaction_period(&mut self, compaction_period: Duration) {
self.compaction_period = BuilderValue::Set(compaction_period)
}
pub fn gc_horizon(&mut self, gc_horizon: u64) {
@@ -286,9 +290,9 @@ impl PageServerConfigBuilder {
checkpoint_distance: self
.checkpoint_distance
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
checkpoint_period: self
.checkpoint_period
.ok_or(anyhow::anyhow!("missing checkpoint_period"))?,
compaction_period: self
.compaction_period
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
gc_horizon: self
.gc_horizon
.ok_or(anyhow::anyhow!("missing gc_horizon"))?,
@@ -425,7 +429,7 @@ impl PageServerConf {
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
"checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?),
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
@@ -561,7 +565,7 @@ impl PageServerConf {
PageServerConf {
id: ZNodeId(0),
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_period: Duration::from_secs(10),
compaction_period: Duration::from_secs(10),
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
wait_lsn_timeout: Duration::from_secs(60),
@@ -631,7 +635,8 @@ listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
checkpoint_distance = 111 # in bytes
checkpoint_period = '111 s'
compaction_period = '111 s'
gc_period = '222 s'
gc_horizon = 222
@@ -668,7 +673,7 @@ id = 10
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?,
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
@@ -712,7 +717,7 @@ id = 10
listen_pg_addr: "127.0.0.1:64000".to_string(),
listen_http_addr: "127.0.0.1:9898".to_string(),
checkpoint_distance: 111,
checkpoint_period: Duration::from_secs(111),
compaction_period: Duration::from_secs(111),
gc_horizon: 222,
gc_period: Duration::from_secs(222),
wait_lsn_timeout: Duration::from_secs(111),
@@ -21,6 +21,7 @@ use zenith_utils::zid::{HexZTenantId, ZTimelineId};
use super::models::{
StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse,
};
use crate::repository::Repository;
use crate::repository::RepositoryTimeline;
use crate::timelines::TimelineInfo;
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
@@ -134,14 +135,9 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let _enter =
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let include_non_incremental_logical_size =
get_include_non_incremental_logical_size(&request);
Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline(
tenant_id,
repo.get_timeline(timeline_id)?,
include_non_incremental_logical_size,
))
TimelineInfo::from_ids(tenant_id, timeline_id, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)?
@@ -11,14 +11,15 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use tracing::*;
use crate::pgdatadir_mapping::*;
use crate::relish::*;
use crate::repository::*;
use crate::repository::Repository;
use crate::walingest::WalIngest;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::Oid;
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
use postgres_ffi::{Oid, TransactionId};
use zenith_utils::lsn::Lsn;
///
@@ -27,42 +28,42 @@ use zenith_utils::lsn::Lsn;
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
pub fn import_timeline_from_postgres_datadir<R: Repository>(
path: &Path,
writer: &dyn TimelineWriter,
tline: &mut DatadirTimeline<R>,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
let mut writer = tline.begin_record(lsn);
writer.init_empty()?;
// Scan 'global'
let mut relfiles: Vec<PathBuf> = Vec::new();
for direntry in fs::read_dir(path.join("global"))? {
let direntry = direntry?;
match direntry.file_name().to_str() {
None => continue,
Some("pg_control") => {
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
pg_control = Some(import_control_file(&mut writer, &direntry.path())?);
}
Some("pg_filenode.map") => {
import_relmap_file(
&mut writer,
pg_constants::GLOBALTABLESPACE_OID,
0,
&direntry.path(),
)?;
}
Some("pg_filenode.map") => import_nonrel_file(
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
dbnode: 0,
},
&direntry.path(),
)?,
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
lsn,
pg_constants::GLOBALTABLESPACE_OID,
0,
)?,
// Load any relation files into the page server (but only after the other files)
_ => relfiles.push(direntry.path()),
}
}
for relfile in relfiles {
import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?;
}
// Scan 'base'. It contains database dirs, the database OID is the filename.
// E.g. 'base/12345', where 12345 is the database OID.
@@ -76,54 +77,56 @@ pub fn import_timeline_from_postgres_datadir(
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
let mut relfiles: Vec<PathBuf> = Vec::new();
for direntry in fs::read_dir(direntry.path())? {
let direntry = direntry?;
match direntry.file_name().to_str() {
None => continue,
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
dbnode: dboid,
},
Some("PG_VERSION") => {
//writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
}
Some("pg_filenode.map") => import_relmap_file(
&mut writer,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
&direntry.path(),
)?,
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
lsn,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
)?,
_ => relfiles.push(direntry.path()),
}
}
for relfile in relfiles {
import_relfile(
&mut writer,
&relfile,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
)?;
}
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
import_twophase_file(&mut writer, xid, &entry.path())?;
}
// TODO: Scan pg_tblspc
// We're done importing all the data files.
writer.advance_last_record_lsn(lsn);
writer.finish()?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -141,7 +144,7 @@ pub fn import_timeline_from_postgres_datadir(
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
import_wal(
&path.join("pg_wal"),
writer,
tline,
Lsn(pg_control.checkPointCopy.redo),
lsn,
)?;
@@ -150,10 +153,9 @@ pub fn import_timeline_from_postgres_datadir(
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile(
fn import_relfile<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
path: &Path,
timeline: &dyn TimelineWriter,
lsn: Lsn,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
@@ -170,26 +172,35 @@ fn import_relfile(
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let len = file.metadata().unwrap().len();
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
let nblocks = len / pg_constants::BLCKSZ as u64;
if segno != 0 {
todo!();
}
let rel = RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
};
timeline.put_rel_creation(rel, nblocks as u32)?;
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let rel = RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
};
let tag = RelishTag::Relation(rel);
timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?;
timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
ensure!(blknum == nblocks as u32, "unexpected EOF");
break;
}
_ => {
@@ -203,16 +214,28 @@ fn import_relfile(
Ok(())
}
///
/// Import a "non-blocky" file into the repository
///
/// This is used for small files like the control file, twophase files etc. that
/// are just slurped into the repository as one blob.
///
fn import_nonrel_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
tag: RelishTag,
/// Import a relmapper (pg_filenode.map) file into the repository
fn import_relmap_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
spcnode: Oid,
dbnode: Oid,
path: &Path,
) -> Result<()> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;
trace!("importing relmap file {}", path.display());
timeline.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
/// Import a twophase state file (pg_twophase/<xid>) into the repository
fn import_twophase_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
xid: TransactionId,
path: &Path,
) -> Result<()> {
let mut file = File::open(path)?;
@@ -222,7 +245,7 @@ fn import_nonrel_file(
trace!("importing non-rel file {}", path.display());
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
timeline.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
@@ -231,9 +254,8 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separated.
fn import_control_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
fn import_control_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
path: &Path,
) -> Result<ControlFileData> {
let mut file = File::open(path)?;
@@ -244,17 +266,12 @@ fn import_control_file(
trace!("importing control file {}", path.display());
// Import it as ControlFile
timeline.put_page_image(
RelishTag::ControlFile,
0,
lsn,
Bytes::copy_from_slice(&buffer[..]),
)?;
timeline.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
// Extract the checkpoint record and import it separately.
let pg_control = ControlFileData::decode(&buffer)?;
let checkpoint_bytes = pg_control.checkPointCopy.encode();
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
timeline.put_checkpoint(checkpoint_bytes)?;
Ok(pg_control)
}
@@ -262,37 +279,38 @@ fn import_control_file(
///
/// Import an SLRU segment file
///
fn import_slru_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
fn import_slru_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
slru: SlruKind,
path: &Path,
) -> Result<()> {
// Does it look like an SLRU file?
trace!("importing slru file {}", path.display());
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
trace!("importing slru file {}", path.display());
let len = file.metadata().unwrap().len();
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
let nblocks = len / pg_constants::BLCKSZ as u64;
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?;
let mut rpageno = 0;
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_page_image(
RelishTag::Slru { slru, segno },
rpageno,
lsn,
Bytes::copy_from_slice(&buf),
)?;
timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
ensure!(rpageno == nblocks as u32, "unexpected EOF");
break;
}
_ => {
@@ -301,8 +319,6 @@ fn import_slru_file(
},
};
rpageno += 1;
// TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages
}
Ok(())
@@ -310,9 +326,9 @@ fn import_slru_file(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(
fn import_wal<R: Repository>(
walpath: &Path,
writer: &dyn TimelineWriter,
tline: &mut DatadirTimeline<R>,
startpoint: Lsn,
endpoint: Lsn,
) -> Result<()> {
@@ -322,7 +338,7 @@ fn import_wal(
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
let mut walingest = WalIngest::new(writer.deref(), startpoint)?;
let mut walingest = WalIngest::new(tline, startpoint)?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -355,7 +371,7 @@ fn import_wal(
let mut nrecords = 0;
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(writer, recdata, lsn)?;
walingest.ingest_record(tline, recdata, lsn)?;
last_lsn = lsn;
nrecords += 1;
pageserver/src/keyspace.rs
@@ -0,0 +1,134 @@
use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::pg_constants;
use std::ops::Range;
// Target file size, when creating image and delta layers
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug)]
pub struct KeySpace {
// Contiguous ranges of keys that belong to the key space. In key order, and
// with no overlap.
pub ranges: Vec<Range<Key>>,
}
impl KeySpace {
///
/// Partition a key space into chunks of roughly 'target_size' bytes
/// each partition.
///
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
// Assume that each value is 8k in size.
let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize;
let mut parts = Vec::new();
let mut current_part = Vec::new();
let mut current_part_size: usize = 0;
for range in &self.ranges {
// If appending the next contiguous range in the keyspace to the current
// partition would cause it to be too large, start a new partition.
let this_size = key_range_size(range) as usize;
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
// If the next range is larger than 'target_size', split it into
// 'target_size' chunks.
let mut remain_size = this_size;
let mut start = range.start;
while remain_size > target_nblocks {
let next = start.add(target_nblocks as u32);
parts.push(KeySpace {
ranges: vec![start..next],
});
start = next;
remain_size -= target_nblocks
}
current_part.push(start..range.end);
current_part_size += remain_size;
}
// add last partition that wasn't full yet.
if !current_part.is_empty() {
parts.push(KeySpace {
ranges: current_part,
});
}
KeyPartitioning { parts }
}
}
///
/// Represents a partitioning of the key space.
///
/// The only kind of partitioning we do is to partition the key space into
/// partitions that are roughly equal in physical size (see KeySpace::partition).
/// But this data structure could represent any partitioning.
///
#[derive(Clone, Debug, Default)]
pub struct KeyPartitioning {
pub parts: Vec<KeySpace>,
}
impl KeyPartitioning {
pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() }
}
}
///
/// A helper object, to collect a set of keys and key ranges into a KeySpace
/// object. This takes care of merging adjacent keys and key ranges into
/// contiguous ranges.
///
#[derive(Clone, Debug, Default)]
pub struct KeySpaceAccum {
accum: Option<Range<Key>>,
ranges: Vec<Range<Key>>,
}
impl KeySpaceAccum {
pub fn new() -> Self {
Self {
accum: None,
ranges: Vec::new(),
}
}
pub fn add_key(&mut self, key: Key) {
self.add_range(singleton_range(key))
}
pub fn add_range(&mut self, range: Range<Key>) {
match self.accum.as_mut() {
Some(accum) => {
if range.start == accum.end {
accum.end = range.end;
} else {
assert!(range.start > accum.end);
self.ranges.push(accum.clone());
*accum = range;
}
}
None => self.accum = Some(range),
}
}
pub fn to_keyspace(mut self) -> KeySpace {
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
KeySpace {
ranges: self.ranges,
}
}
}
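// A hypothetical usage sketch of the new KeySpace helpers (not part of
// this commit). Key construction is elided because repository::Key's
// definition is not shown in this diff; keys must be fed to the
// accumulator in ascending order, or add_range() will assert.
//
//     let mut accum = KeySpaceAccum::new();
//     for key in keys_in_ascending_order {
//         accum.add_key(key); // adjacent keys merge into contiguous ranges
//     }
//     let keyspace = accum.to_keyspace();
//     // Chunks of roughly TARGET_FILE_SIZE_BYTES each, assuming 8 KB blocks:
//     let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES);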
(File diff suppressed because it is too large.)
@@ -1,6 +1,5 @@
//!
//! A DeltaLayer represents a collection of WAL records or page images in a range of
//! LSNs, for one segment. It is stored on a file on disk.
//! LSNs, and in a range of Keys. It is stored on a file on disk.
//!
//! Usually a delta layer only contains differences - in the form of WAL records against
//! a base LSN. However, if a segment is newly created, by creating a new relation or
@@ -11,84 +10,76 @@
//! can happen when you create a new branch in the middle of a delta layer, and the WAL
//! records on the new branch are put in a new delta layer.
//!
//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters
//! When a delta file needs to be accessed, we slurp the 'index' metadata
//! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN.
//! The byte ranges in the metadata can be used to find the page/WAL record in
//! PAGE_VERSIONS_CHAPTER.
//! To access a particular value, we search `index` for the given key.
//! The byte offset in the index can be used to find the value in
//! VALUES_CHAPTER.
//!
//! On disk, the delta files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each delta file is named like this:
//!
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//! <key start>-<key end>__<start LSN>-<end LSN>
//!
//! For example:
//!
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//!
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
//! So the above example would become:
//!
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED
//!
//! The end LSN indicates when it was dropped in that case, we don't store it in the
//! file contents in any way.
//!
//! A delta file is constructed using the 'bookfile' crate. Each file consists of two
//! parts: the page versions and the segment sizes. They are stored as separate chapters.
//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
//! parts: the 'index', the values, and a short summary header. They are stored as
//! separate chapters.
//!
use crate::config::PageServerConf;
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
RELISH_SEG_SIZE,
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::repository::{Key, Value};
use crate::virtual_file::VirtualFile;
use crate::walrecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, ensure, Result};
use anyhow::{bail, Result};
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use zenith_utils::vec_map::VecMap;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::fs;
use std::io::{BufWriter, Write};
use std::ops::Bound::Included;
use std::io::BufWriter;
use std::io::Write;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, MutexGuard};
use std::sync::{RwLock, RwLockReadGuard};
use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter};
use bookfile::{Book, BookWriter, ChapterWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith delta file
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E11;
/// Mapping from (key, lsn) -> page/WAL record
/// byte ranges in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Mapping from (block #, lsn) -> page/WAL record
/// byte ranges in PAGE_VERSIONS_CHAPTER
static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
/// Page/WAL bytes - cannot be interpreted
/// without PAGE_VERSION_METAS_CHAPTER
static PAGE_VERSIONS_CHAPTER: u64 = 2;
static SEG_SIZES_CHAPTER: u64 = 3;
/// without the page versions from the INDEX_CHAPTER
static VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
static SUMMARY_CHAPTER: u64 = 4;
static SUMMARY_CHAPTER: u64 = 3;
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct Summary {
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
}
impl From<&DeltaLayer> for Summary {
@@ -96,33 +87,17 @@ impl From<&DeltaLayer> for Summary {
Self {
tenantid: layer.tenantid,
timelineid: layer.timelineid,
seg: layer.seg,
start_lsn: layer.start_lsn,
end_lsn: layer.end_lsn,
dropped: layer.dropped,
key_range: layer.key_range.clone(),
lsn_range: layer.lsn_range.clone(),
}
}
}
#[derive(Serialize, Deserialize)]
struct BlobRange {
offset: u64,
size: usize,
}
fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
let mut buf = vec![0u8; range.size];
reader.read_exact_at(&mut buf, range.offset)?;
Ok(buf)
}
///
/// DeltaLayer is the in-memory data structure associated with an
/// on-disk delta file. We keep a DeltaLayer in memory for each
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
/// copy of the file in memory, in 'inner'. Otherwise the struct is
/// copy of the index in memory, in 'inner'. Otherwise the struct is
/// just a placeholder for a file that exists on disk, and it needs to
/// be loaded before using it in queries.
///
@@ -131,47 +106,24 @@ pub struct DeltaLayer {
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
//
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
// start is inclusive, and end is exclusive.
//
pub start_lsn: Lsn,
pub end_lsn: Lsn,
dropped: bool,
inner: Mutex<DeltaLayerInner>,
inner: RwLock<DeltaLayerInner>,
}
pub struct DeltaLayerInner {
/// If false, the 'page_version_metas' and 'seg_sizes' have not been
/// loaded into memory yet.
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN. The value is an offset into the
/// chapter where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
book: Option<Book<VirtualFile>>,
/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
/// `seg_sizes` tracks the size of the segment at different points in time.
seg_sizes: VecMap<Lsn, SegmentBlk>,
}
impl DeltaLayerInner {
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
// Scan the VecMap backwards, starting from the given entry.
let slice = self
.seg_sizes
.slice_range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = slice.last() {
Ok(*entry)
} else {
bail!("could not find seg size in delta layer")
}
}
}
impl Layer for DeltaLayer {
@@ -183,132 +135,93 @@ impl Layer for DeltaLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn is_dropped(&self) -> bool {
self.dropped
}
fn get_start_lsn(&self) -> Lsn {
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
self.end_lsn
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
let mut need_image = true;
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
}
_ => {}
}
assert!(self.key_range.contains(&key));
{
// Open the file and lock the metadata in memory
let inner = self.load()?;
let page_version_reader = inner
let values_reader = inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
.chapter_reader(VALUES_CHAPTER)?;
// Scan the metadata VecMap backwards, starting from the given entry.
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let iter = inner
.page_version_metas
.slice_range((Included(&minkey), Included(&maxkey)))
.iter()
.rev();
for ((_blknum, pv_lsn), blob_range) in iter {
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if pv_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
}
_ => {}
}
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
match pv {
PageVersion::Page(img) => {
// Found a page image, return it
reconstruct_data.page_img = Some((*pv_lsn, img));
need_image = false;
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
let mut size = 0usize;
let mut first_pos = 0u64;
for (_entry_lsn, blob_ref) in slice.iter().rev() {
size += blob_ref.size();
first_pos = blob_ref.pos();
if blob_ref.will_init() {
break;
}
PageVersion::Wal(rec) => {
let will_init = rec.will_init();
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
if size != 0 {
let mut buf = vec![0u8; size];
values_reader.read_exact_at(&mut buf, first_pos)?;
for (entry_lsn, blob_ref) in slice.iter().rev() {
let offs = (blob_ref.pos() - first_pos) as usize;
let val = Value::des(&buf[offs..offs + blob_ref.size()])?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
}
// If we didn't find any records for this, check if the request is beyond EOF
if need_image
&& reconstruct_data.records.is_empty()
&& self.seg.rel.is_blocky()
&& blknum >= inner.get_seg_size(lsn)?
{
return Ok(PageReconstructResult::Missing(self.start_lsn));
}
// release metadata lock and close the file
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
Ok(ValueReconstructResult::Continue)
} else {
Ok(PageReconstructResult::Complete)
Ok(ValueReconstructResult::Complete)
}
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
let inner = self.load().unwrap();
let inner = self.load()?;
inner.get_seg_size(lsn)
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);
match DeltaValueIter::new(inner) {
Ok(iter) => Box::new(iter),
Err(err) => Box::new(std::iter::once(Err(err))),
}
// Otherwise, it exists.
Ok(true)
}
///
@@ -316,14 +229,24 @@ impl Layer for DeltaLayer {
/// it will need to be loaded back.
///
fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_version_metas = VecMap::default();
inner.seg_sizes = VecMap::default();
inner.loaded = false;
// FIXME: In debug mode, loading and unloading the index slows
// things down so much that you get timeout errors. At least
// with the test_parallel_copy test. So as an even more ad hoc
// stopgap fix for that, only unload on average every 10
// checkpoint cycles.
use rand::RngCore;
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
return Ok(());
}
// Note: we keep the Book open. Is that a good idea? The virtual file
// machinery has its own rules for closing the file descriptor if it's not
// needed, but the Book struct uses up some memory, too.
if let Ok(mut inner) = self.inner.try_write() {
inner.index = HashMap::default();
inner.loaded = false;
// Note: we keep the Book open. Is that a good idea? The virtual file
// machinery has its own rules for closing the file descriptor if it's not
// needed, but the Book struct uses up some memory, too.
}
Ok(())
}
@@ -345,45 +268,52 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} seg {} {}-{} ----",
self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenantid,
self.timelineid,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
println!("--- seg sizes ---");
let inner = self.load()?;
for (k, v) in inner.seg_sizes.as_slice() {
println!(" {}: {}", k, v);
}
println!("--- page versions ---");
let path = self.path();
let file = std::fs::File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.chapter_reader(VALUES_CHAPTER)?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
let mut desc = String::new();
let mut values: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
values.sort_by_key(|k| k.0);
let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;
for (key, versions) in values {
for (lsn, blob_ref) in versions.as_slice() {
let mut desc = String::new();
let mut buf = vec![0u8; blob_ref.size()];
chapter.read_exact_at(&mut buf, blob_ref.pos())?;
let val = Value::des(&buf);
match pv {
PageVersion::Page(img) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
PageVersion::Wal(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
blob_range.size,
rec.will_init(),
wal_desc
)?;
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
println!(" blk {} at {}: {}", blk, lsn, desc);
}
Ok(())
@@ -408,61 +338,61 @@ impl DeltaLayer {
///
/// Load the contents of the file into memory
///
fn load(&self) -> Result<MutexGuard<DeltaLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
fn load(&self) -> Result<RwLockReadGuard<DeltaLayerInner>> {
loop {
// quick exit if already loaded
{
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)?;
inner.book = Some(Book::new(file)?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
if inner.loaded {
return Ok(inner);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
// need to upgrade to write lock
let mut inner = self.inner.write().unwrap();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)?;
inner.book = Some(Book::new(file)?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
}
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
inner.index = index;
inner.loaded = true;
}
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
let page_version_metas = VecMap::des(&chapter)?;
let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?;
let seg_sizes = VecMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
inner.page_version_metas = page_version_metas;
inner.seg_sizes = seg_sizes;
inner.loaded = true;
Ok(inner)
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -476,15 +406,12 @@ impl DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid,
tenantid,
seg: filename.seg,
start_lsn: filename.start_lsn,
end_lsn: filename.end_lsn,
dropped: filename.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: filename.key_range.clone(),
lsn_range: filename.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
index: HashMap::default(),
}),
}
}
@@ -494,7 +421,7 @@ impl DeltaLayer {
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
where
F: std::os::unix::prelude::FileExt,
F: FileExt,
{
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let summary = Summary::des(&chapter)?;
@@ -503,25 +430,20 @@ impl DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timelineid: summary.timelineid,
tenantid: summary.tenantid,
seg: summary.seg,
start_lsn: summary.start_lsn,
end_lsn: summary.end_lsn,
dropped: summary.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: summary.key_range,
lsn_range: summary.lsn_range,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
index: HashMap::default(),
}),
})
}
fn layer_name(&self) -> DeltaFileName {
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
@@ -542,24 +464,24 @@ impl DeltaLayer {
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_page_version` for every page
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
pub struct DeltaLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_version_writer: ChapterWriter<BufWriter<VirtualFile>>,
pv_offset: u64,
key_start: Key,
lsn_range: Range<Lsn>,
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
values_writer: ChapterWriter<BufWriter<VirtualFile>>,
end_offset: u64,
}
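// A hedged usage sketch of the writer protocol described above, using
// the names from this diff (error handling elided; `conf`, the IDs and
// the key/LSN bounds are assumed to be in scope):
//
//     let mut writer = DeltaLayerWriter::new(conf, timelineid, tenantid,
//                                            key_start, lsn_start..lsn_end)?;
//     for (key, lsn, val) in values_in_key_lsn_order {
//         writer.put_value(key, lsn, val)?; // must arrive in key, lsn order
//     }
//     let layer = writer.finish(key_end)?;  // writes index + summary, renames file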
impl DeltaLayerWriter {
@@ -570,94 +492,88 @@ impl DeltaLayerWriter {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
// Create the file
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let path = DeltaLayer::path_for(
&PathOrConf::Conf(conf),
timelineid,
tenantid,
&DeltaFileName {
seg,
start_lsn,
end_lsn,
dropped,
},
);
let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
"{}-XXX__{:016X}-{:016X}.temp",
key_start,
u64::from(lsn_range.start),
u64::from(lsn_range.end)
));
info!("temp deltalayer path {}", path.display());
let file = VirtualFile::create(&path)?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
// Open the page-versions chapter for writing. The calls to
// `put_page_version` will use this to write the contents.
let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER);
// `put_value` will use this to write the contents.
let values_writer = book.new_chapter(VALUES_CHAPTER);
Ok(DeltaLayerWriter {
conf,
path,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
page_version_writer,
page_version_metas: VecMap::default(),
pv_offset: 0,
key_start,
lsn_range,
index: HashMap::new(),
values_writer,
end_offset: 0,
})
}
///
/// Append a page version to the file.
/// Append a key-value pair to the file.
///
/// 'buf' is a serialized PageVersion.
/// The page versions must be appended in blknum, lsn order.
/// The values must be appended in key, lsn order.
///
pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> {
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
//info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
assert!(self.lsn_range.start <= lsn);
// Remember the offset and size metadata. The metadata is written
// to a separate chapter, in `finish`.
let blob_range = BlobRange {
offset: self.pv_offset,
size: buf.len(),
};
self.page_version_metas
.append((blknum, lsn), blob_range)
.unwrap();
// write the page version
self.page_version_writer.write_all(buf)?;
self.pv_offset += buf.len() as u64;
let off = self.end_offset;
let buf = Value::ser(&val)?;
let len = buf.len();
self.values_writer.write_all(&buf)?;
self.end_offset += len as u64;
let vec_map = self.index.entry(key).or_default();
let blob_ref = BlobRef::new(off, len, val.will_init());
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
bail!(
"Value for {} at {} already exists in delta layer being built",
key,
lsn
);
}
Ok(())
}
pub fn size(&self) -> u64 {
self.end_offset
}
///
/// Finish writing the delta layer.
///
/// 'seg_sizes' is a list of size changes to store with the actual data.
///
pub fn finish(self, seg_sizes: VecMap<Lsn, SegmentBlk>) -> Result<DeltaLayer> {
// Close the page-versions chapter
let book = self.page_version_writer.close()?;
pub fn finish(self, key_end: Key) -> Result<DeltaLayer> {
// Close the values chapter
let book = self.values_writer.close()?;
// Write out page versions metadata
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
let buf = VecMap::ser(&self.page_version_metas)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
if self.seg.rel.is_blocky() {
assert!(!seg_sizes.is_empty());
}
// and seg_sizes to separate chapter
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
let buf = VecMap::ser(&seg_sizes)?;
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
@@ -665,12 +581,8 @@ impl DeltaLayerWriter {
let summary = Summary {
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
};
Summary::ser_into(&summary, &mut chapter)?;
let book = chapter.close()?;
@@ -685,20 +597,111 @@ impl DeltaLayerWriter {
path_or_conf: PathOrConf::Conf(self.conf),
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
index: HashMap::new(),
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
}),
};
trace!("created delta layer {}", &layer.path().display());
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let final_path = DeltaLayer::path_for(
&PathOrConf::Conf(self.conf),
self.timelineid,
self.tenantid,
&DeltaFileName {
key_range: self.key_start..key_end,
lsn_range: self.lsn_range,
},
);
std::fs::rename(self.path, &final_path)?;
trace!("created delta layer {}", final_path.display());
Ok(layer)
}
pub fn abort(self) {
match self.values_writer.close() {
Ok(book) => {
if let Err(err) = book.close() {
error!("error while closing delta layer file: {}", err);
}
}
Err(err) => {
error!("error while closing chapter writer: {}", err);
}
}
if let Err(err) = std::fs::remove_file(self.path) {
error!("error removing unfinished delta layer file: {}", err);
}
}
}
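// Sketch of the BlobRef round-trip that `put_value` relies on (hypothetical
// values; the will_init flag is packed alongside the offset for the
// reconstruct path):
#[cfg(test)]
fn blob_ref_roundtrip() {
    let blob_ref = BlobRef::new(4096, 123, true);
    assert_eq!(blob_ref.pos(), 4096);
    assert_eq!(blob_ref.size(), 123);
}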
///
/// Iterator over all key-value pairs stored in a delta layer
///
/// FIXME: This creates a Vec to hold the offsets of all key-value pairs.
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
struct DeltaValueIter {
all_offsets: Vec<(Key, Lsn, BlobRef)>,
next_idx: usize,
data: Vec<u8>,
}
impl Iterator for DeltaValueIter {
type Item = Result<(Key, Lsn, Value)>;
fn next(&mut self) -> Option<Self::Item> {
self.next_res().transpose()
}
}
impl DeltaValueIter {
fn new(inner: RwLockReadGuard<DeltaLayerInner>) -> Result<Self> {
let mut index: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
index.sort_by_key(|x| x.0);
let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new();
for (key, vec_map) in index.iter() {
for (lsn, blob_ref) in vec_map.as_slice().iter() {
all_offsets.push((**key, *lsn, *blob_ref));
}
}
let values_reader = inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(VALUES_CHAPTER)?;
let file_size = values_reader.len() as usize;
let mut layer = DeltaValueIter {
all_offsets,
next_idx: 0,
data: vec![0u8; file_size],
};
values_reader.read_exact_at(&mut layer.data, 0)?;
Ok(layer)
}
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
if self.next_idx < self.all_offsets.len() {
let (key, lsn, blob_ref) = self.all_offsets[self.next_idx];
let offs = blob_ref.pos() as usize;
let size = blob_ref.size();
let val = Value::des(&self.data[offs..offs + size])?;
self.next_idx += 1;
Ok(Some((key, lsn, val)))
} else {
Ok(None)
}
}
}
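// A sketch of draining the iterator (hypothetical caller; `inner` must be a
// read guard on an already-loaded DeltaLayerInner). Entries come back in
// (key, lsn) order because the index is sorted up front.
#[cfg(test)]
fn dump_delta_values(inner: RwLockReadGuard<DeltaLayerInner>) -> Result<()> {
    for item in DeltaValueIter::new(inner)? {
        let (key, lsn, _value) = item?;
        println!("{} at {}", key, lsn);
    }
    Ok(())
}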

View File

@@ -2,29 +2,52 @@
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::config::PageServerConf;
use crate::layered_repository::storage_layer::SegmentTag;
use crate::relish::*;
use crate::repository::Key;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::path::PathBuf;
use zenith_utils::lsn::Lsn;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct DeltaFileName {
pub seg: SegmentTag,
pub start_lsn: Lsn,
pub end_lsn: Lsn,
pub dropped: bool,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl PartialOrd for DeltaFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for DeltaFileName {
fn cmp(&self, other: &Self) -> Ordering {
self.key_range
    .start
    .cmp(&other.key_range.start)
    .then_with(|| self.key_range.end.cmp(&other.key_range.end))
    .then_with(|| self.lsn_range.start.cmp(&other.lsn_range.start))
    .then_with(|| self.lsn_range.end.cmp(&other.lsn_range.end))
}
}
/// Represents the filename of a DeltaLayer
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>
///
/// or if it was dropped:
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>_DROPPED
/// <key start>-<key end>__<LSN start>-<LSN end>
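///
/// For example (illustrative values only):
///
/// 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568-00000000346BC970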
///
impl DeltaFileName {
///
@@ -32,234 +55,123 @@ impl DeltaFileName {
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
let mut lsn_parts = parts.next()?.split('-');
let key_start_str = key_parts.next()?;
let key_end_str = key_parts.next()?;
let lsn_start_str = lsn_parts.next()?;
let lsn_end_str = lsn_parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() || lsn_parts.next().is_some() {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
let seg = SegmentTag { rel, segno };
let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
let mut dropped = false;
if let Some(suffix) = parts.next() {
if suffix == "DROPPED" {
dropped = true;
} else {
return None;
}
}
if parts.next().is_some() {
if start_lsn >= end_lsn {
return None;
// or panic?
}
if key_start >= key_end {
return None;
// or panic?
}
Some(DeltaFileName {
seg,
start_lsn,
end_lsn,
dropped,
key_range: key_start..key_end,
lsn_range: start_lsn..end_lsn,
})
}
}
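// Sketch: `Display` and `parse_str` are intended to round-trip (a hypothetical
// check, not a test shipped in this commit):
#[cfg(test)]
fn delta_name_roundtrip(name: &DeltaFileName) {
    assert_eq!(DeltaFileName::parse_str(&name.to_string()), Some(name.clone()));
}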
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
write!(
f,
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
u64::from(self.start_lsn),
u64::from(self.end_lsn),
if self.dropped { "_DROPPED" } else { "" }
"{}-{}__{:016X}-{:016X}",
self.key_range.start,
self.key_range.end,
u64::from(self.lsn_range.start),
u64::from(self.lsn_range.end),
)
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct ImageFileName {
pub seg: SegmentTag,
pub key_range: Range<Key>,
pub lsn: Lsn,
}
impl PartialOrd for ImageFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ImageFileName {
fn cmp(&self, other: &Self) -> Ordering {
self.key_range
    .start
    .cmp(&other.key_range.start)
    .then_with(|| self.key_range.end.cmp(&other.key_range.end))
    .then_with(|| self.lsn.cmp(&other.lsn))
}
}
///
/// Represents the filename of an ImageLayer
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<LSN>
///
/// <key start>-<key end>__<LSN>
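///
/// For example:
///
/// 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568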
impl ImageFileName {
///
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
let key_start_str = key_parts.next()?;
let key_end_str = key_parts.next()?;
let lsn_str = parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
let seg = SegmentTag { rel, segno };
let lsn = Lsn::from_hex(lsn_str).ok()?;
let lsn = Lsn::from_hex(parts.next()?).ok()?;
if parts.next().is_some() {
return None;
}
Some(ImageFileName { seg, lsn })
Some(ImageFileName {
key_range: key_start..key_end,
lsn,
})
}
}
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
write!(
f,
"{}_{}_{:016X}",
basename,
self.seg.segno,
"{}-{}__{:016X}",
self.key_range.start,
self.key_range.end,
u64::from(self.lsn),
)
}

View File

@@ -1,142 +0,0 @@
//!
//! Global registry of open layers.
//!
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
//! in-memory layers in the system, and know when we need to evict some to release
//! memory.
//!
//! Each layer is assigned a unique ID when it's registered in the global registry.
//! The ID can be used to relocate the layer later, without having to hold locks.
//!
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::{Arc, RwLock};
use super::inmemory_layer::InMemoryLayer;
use lazy_static::lazy_static;
const MAX_USAGE_COUNT: u8 = 5;
lazy_static! {
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
RwLock::new(InMemoryLayers::default());
}
// TODO these types can probably be smaller
#[derive(PartialEq, Eq, Clone, Copy)]
pub struct LayerId {
index: usize,
tag: u64, // to avoid ABA problem
}
enum SlotData {
Occupied(Arc<InMemoryLayer>),
/// Vacant slots form a linked list, the value is the index
/// of the next vacant slot in the list.
Vacant(Option<usize>),
}
struct Slot {
tag: u64,
data: SlotData,
usage_count: AtomicU8, // for clock algorithm
}
#[derive(Default)]
pub struct InMemoryLayers {
slots: Vec<Slot>,
num_occupied: usize,
// Head of free-slot list.
next_empty_slot_idx: Option<usize>,
}
impl InMemoryLayers {
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
let slot_idx = match self.next_empty_slot_idx {
Some(slot_idx) => slot_idx,
None => {
let idx = self.slots.len();
self.slots.push(Slot {
tag: 0,
data: SlotData::Vacant(None),
usage_count: AtomicU8::new(0),
});
idx
}
};
let slots_len = self.slots.len();
let slot = &mut self.slots[slot_idx];
match slot.data {
SlotData::Occupied(_) => {
panic!("an occupied slot was in the free list");
}
SlotData::Vacant(next_empty_slot_idx) => {
self.next_empty_slot_idx = next_empty_slot_idx;
}
}
slot.data = SlotData::Occupied(layer);
slot.usage_count.store(1, Ordering::Relaxed);
self.num_occupied += 1;
assert!(self.num_occupied <= slots_len);
LayerId {
index: slot_idx,
tag: slot.tag,
}
}
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
if slot.tag != layer_id.tag {
return None;
}
if let SlotData::Occupied(layer) = &slot.data {
let _ = slot.usage_count.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|old_usage_count| {
if old_usage_count < MAX_USAGE_COUNT {
Some(old_usage_count + 1)
} else {
None
}
},
);
Some(Arc::clone(layer))
} else {
None
}
}
// TODO this won't be a public API in the future
pub fn remove(&mut self, layer_id: &LayerId) {
let slot = &mut self.slots[layer_id.index];
if slot.tag != layer_id.tag {
return;
}
match &slot.data {
SlotData::Occupied(_layer) => {
// TODO evict the layer
}
SlotData::Vacant(_) => unimplemented!(),
}
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
self.next_empty_slot_idx = Some(layer_id.index);
assert!(self.num_occupied > 0);
self.num_occupied -= 1;
slot.tag = slot.tag.wrapping_add(1);
}
}
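// Sketch of the insert/get/remove cycle described in the module header
// (hypothetical; `layer` is any in-memory layer handle):
#[cfg(test)]
fn registry_roundtrip(layer: Arc<InMemoryLayer>) {
    let mut map = GLOBAL_LAYER_MAP.write().unwrap();
    let id = map.insert(layer);
    assert!(map.get(&id).is_some());
    map.remove(&id);
    // `remove` bumps the slot's tag, so the stale LayerId no longer resolves.
    assert!(map.get(&id).is_none());
}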

View File

@@ -1,41 +1,39 @@
//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN.
//! It is stored in a file on disk.
//! An ImageLayer represents an image or a snapshot of a key-range at
//! one particular LSN. It contains an image of all key-value pairs
//! in its key-range. Any key that falls into the image layer's key
//! range but is not present in the layer does not exist at that LSN.
//!
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each image layer file is named like this:
//! An image layer is stored in a file on disk. The file is stored in
//! timelines/<timelineid> directory. Currently, there are no
//! subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//! <key start>-<key end>__<LSN>
//!
//! For example:
//!
//! 1663_13990_2609_0_5_000000000169C348
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
//!
//! An image file is constructed using the 'bookfile' crate.
//!
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::config::PageServerConf;
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag,
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::layered_repository::RELISH_SEG_SIZE;
use crate::repository::{Key, Value};
use crate::virtual_file::VirtualFile;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{bail, Context, Result};
use bytes::Bytes;
use log::*;
use serde::{Deserialize, Serialize};
use std::convert::TryInto;
use std::collections::HashMap;
use std::fs;
use std::io::{BufWriter, Write};
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, MutexGuard};
@@ -44,12 +42,15 @@ use bookfile::{Book, BookWriter, ChapterWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith segment image file
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
// Magic constant to identify a Zenith image layer file
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E11 + 1;
/// Mapping from (key, lsn) -> page/WAL record
/// byte ranges in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Contains each block in block # order
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
const VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
const SUMMARY_CHAPTER: u64 = 3;
@@ -58,7 +59,7 @@ const SUMMARY_CHAPTER: u64 = 3;
struct Summary {
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
key_range: Range<Key>,
lsn: Lsn,
}
@@ -68,19 +69,17 @@ impl From<&ImageLayer> for Summary {
Self {
tenantid: layer.tenantid,
timelineid: layer.timelineid,
seg: layer.seg,
key_range: layer.key_range.clone(),
lsn: layer.lsn,
}
}
}
const BLOCK_SIZE: usize = 8192;
///
/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'.
/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
///
@@ -88,7 +87,7 @@ pub struct ImageLayer {
path_or_conf: PathOrConf,
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
pub key_range: Range<Key>,
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
@@ -96,18 +95,16 @@ pub struct ImageLayer {
inner: Mutex<ImageLayerInner>,
}
#[derive(Clone)]
enum ImageType {
Blocky { num_blocks: SegmentBlk },
NonBlocky,
}
pub struct ImageLayerInner {
/// If None, the 'image_type' has not been loaded into memory yet.
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
/// The underlying (virtual) file handle. None if the layer hasn't been loaded
/// yet.
book: Option<Book<VirtualFile>>,
/// Derived from filename and bookfile chapter metadata
image_type: ImageType,
/// offset of each value
index: HashMap<Key, BlobRef>,
}
impl Layer for ImageLayer {
@@ -123,98 +120,78 @@ impl Layer for ImageLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn is_dropped(&self) -> bool {
false
}
fn get_start_lsn(&self) -> Lsn {
self.lsn
}
fn get_end_lsn(&self) -> Lsn {
fn get_lsn_range(&self) -> Range<Lsn> {
// End-bound is exclusive
self.lsn + 1
self.lsn..(self.lsn + 1)
}
/// Look up given page in the file
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
assert!(lsn >= self.lsn);
match reconstruct_data.page_img {
Some((cached_lsn, _)) if self.lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
}
_ => {}
}
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
assert!(self.key_range.contains(&key));
assert!(lsn_range.end >= self.lsn);
let inner = self.load()?;
let buf = match &inner.image_type {
ImageType::Blocky { num_blocks } => {
// Check if the request is beyond EOF
if blknum >= *num_blocks {
return Ok(PageReconstructResult::Missing(lsn));
}
if let Some(blob_ref) = inner.index.get(&key) {
let chapter = inner
.book
.as_ref()
.unwrap()
.chapter_reader(VALUES_CHAPTER)?;
let mut buf = vec![0u8; BLOCK_SIZE];
let offset = BLOCK_SIZE as u64 * blknum as u64;
let chapter = inner
.book
.as_ref()
.unwrap()
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
chapter.read_exact_at(&mut buf, offset).with_context(|| {
let mut blob = vec![0; blob_ref.size()];
chapter
.read_exact_at(&mut blob, blob_ref.pos())
.with_context(|| {
format!(
"failed to read page from data file {} at offset {}",
"failed to read {} bytes from data file {} at offset {}",
blob_ref.size(),
self.filename().display(),
offset
blob_ref.pos()
)
})?;
let value = Bytes::from(blob);
buf
}
ImageType::NonBlocky => {
ensure!(blknum == 0);
inner
.book
.as_ref()
.unwrap()
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
.into_vec()
}
};
reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf)));
Ok(PageReconstructResult::Complete)
}
/// Get size of the segment
fn get_seg_size(&self, _lsn: Lsn) -> Result<SegmentBlk> {
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => Ok(num_blocks),
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
Ok(true)
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
fn unload(&self) -> Result<()> {
// Unload the index.
//
// TODO: we should access the index directly from pages on the disk,
// using the buffer cache. This load/unload mechanism is really ad hoc.
// FIXME: In debug mode, loading and unloading the index slows
// things down so much that you get timeout errors. At least
// with the test_parallel_copy test. So as an even more ad hoc
// stopgap fix for that, only unload every on average 10
// checkpoint cycles.
use rand::RngCore;
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
return Ok(());
}
let mut inner = self.inner.lock().unwrap();
inner.index = HashMap::default();
inner.loaded = false;
Ok(())
}
@@ -235,22 +212,22 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- image layer for ten {} tli {} seg {} at {} ----",
self.tenantid, self.timelineid, self.seg, self.lsn
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
);
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
ImageType::NonBlocky => {
let chapter = inner
.book
.as_ref()
.unwrap()
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
println!("non-blocky ({} bytes)", chapter.len());
}
let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect();
index_vec.sort_by_key(|x| x.1.pos());
for (key, blob_ref) in index_vec {
println!(
"key: {} size {} offset {}",
key,
blob_ref.size(),
blob_ref.pos()
);
}
Ok(())
@@ -279,19 +256,21 @@ impl ImageLayer {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
if inner.book.is_some() {
if inner.loaded {
return Ok(inner);
}
let path = self.path();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
let book = Book::new(file).with_context(|| {
format!(
"Failed to open virtual file '{}' as a bookfile",
path.display()
)
})?;
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.book = Some(Book::new(file).with_context(|| {
format!("Failed to open file '{}' as a bookfile", path.display())
})?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
@@ -318,23 +297,13 @@ impl ImageLayer {
}
}
let image_type = if self.seg.rel.is_blocky() {
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
let images_len = chapter.len();
ensure!(images_len % BLOCK_SIZE as u64 == 0);
let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?;
ImageType::Blocky { num_blocks }
} else {
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
ImageType::NonBlocky
};
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
info!("loaded from {}", &path.display());
*inner = ImageLayerInner {
book: Some(book),
image_type,
};
inner.index = index;
inner.loaded = true;
Ok(inner)
}
@@ -350,11 +319,12 @@ impl ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid,
tenantid,
seg: filename.seg,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type: ImageType::Blocky { num_blocks: 0 },
index: HashMap::new(),
loaded: false,
}),
}
}
@@ -373,18 +343,19 @@ impl ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timelineid: summary.timelineid,
tenantid: summary.tenantid,
seg: summary.seg,
key_range: summary.key_range,
lsn: summary.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type: ImageType::Blocky { num_blocks: 0 },
index: HashMap::new(),
loaded: false,
}),
})
}
fn layer_name(&self) -> ImageFileName {
ImageFileName {
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
}
}
@@ -413,15 +384,18 @@ impl ImageLayer {
///
pub struct ImageLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
key_range: Range<Key>,
lsn: Lsn,
num_blocks: SegmentBlk,
values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
end_offset: u64,
page_image_writer: ChapterWriter<BufWriter<VirtualFile>>,
num_blocks_written: SegmentBlk,
index: HashMap<Key, BlobRef>,
finished: bool,
}
impl ImageLayerWriter {
@@ -429,9 +403,8 @@ impl ImageLayerWriter {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
key_range: &Range<Key>,
lsn: Lsn,
num_blocks: SegmentBlk,
) -> Result<ImageLayerWriter> {
// Create the file
//
@@ -441,70 +414,75 @@ impl ImageLayerWriter {
&PathOrConf::Conf(conf),
timelineid,
tenantid,
&ImageFileName { seg, lsn },
&ImageFileName {
key_range: key_range.clone(),
lsn,
},
);
info!("new image layer {}", path.display());
let file = VirtualFile::create(&path)?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
// Open the page-images chapter for writing. The calls to
// `put_page_image` will use this to write the contents.
let chapter = if seg.rel.is_blocky() {
book.new_chapter(BLOCKY_IMAGES_CHAPTER)
} else {
assert_eq!(num_blocks, 1);
book.new_chapter(NONBLOCKY_IMAGE_CHAPTER)
};
// `put_image` will use this to write the contents.
let chapter = book.new_chapter(VALUES_CHAPTER);
let writer = ImageLayerWriter {
conf,
path,
timelineid,
tenantid,
seg,
key_range: key_range.clone(),
lsn,
num_blocks,
page_image_writer: chapter,
num_blocks_written: 0,
values_writer: Some(chapter),
index: HashMap::new(),
end_offset: 0,
finished: false,
};
Ok(writer)
}
///
/// Write next page image to the file.
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> {
assert!(self.num_blocks_written < self.num_blocks);
if self.seg.rel.is_blocky() {
assert_eq!(block_bytes.len(), BLOCK_SIZE);
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
assert!(self.key_range.contains(&key));
let off = self.end_offset;
if let Some(writer) = &mut self.values_writer {
let len = img.len();
writer.write_all(img)?;
self.end_offset += len as u64;
let old = self.index.insert(key, BlobRef::new(off, len, true));
assert!(old.is_none());
} else {
panic!()
}
self.page_image_writer.write_all(block_bytes)?;
self.num_blocks_written += 1;
Ok(())
}
pub fn finish(self) -> Result<ImageLayer> {
// Check that the `put_page_image' was called for every block.
assert!(self.num_blocks_written == self.num_blocks);
pub fn finish(&mut self) -> Result<ImageLayer> {
// Close the values chapter
let book = self.values_writer.take().unwrap().close()?;
// Close the page-images chapter
let book = self.page_image_writer.close()?;
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// Write out the summary chapter
let image_type = if self.seg.rel.is_blocky() {
ImageType::Blocky {
num_blocks: self.num_blocks,
}
} else {
ImageType::NonBlocky
};
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
let summary = Summary {
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
};
Summary::ser_into(&summary, &mut chapter)?;
@@ -520,15 +498,31 @@ impl ImageLayerWriter {
path_or_conf: PathOrConf::Conf(self.conf),
timelineid: self.timelineid,
tenantid: self.tenantid,
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type,
loaded: false,
index: HashMap::new(),
}),
};
trace!("created image layer {}", layer.path().display());
self.finished = true;
Ok(layer)
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(page_image_writer) = self.values_writer.take() {
if let Ok(book) = page_image_writer.close() {
let _ = book.close();
}
}
if !self.finished {
let _ = fs::remove_file(&self.path);
}
}
}
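// A minimal sketch of the writer's lifecycle (hypothetical helper; assumes the
// caller has already materialized an image for every key it wants to store):
#[cfg(test)]
fn write_image_layer(
    conf: &'static PageServerConf,
    timelineid: ZTimelineId,
    tenantid: ZTenantId,
    lsn: Lsn,
    images: Vec<(Key, Vec<u8>)>,
) -> Result<ImageLayer> {
    let mut writer = ImageLayerWriter::new(conf, timelineid, tenantid, &(Key::MIN..Key::MAX), lsn)?;
    for (key, img) in images {
        writer.put_image(key, &img)?;
    }
    writer.finish()
}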

View File

@@ -1,30 +1,29 @@
//! An in-memory layer stores recently received PageVersions.
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
//! and layers can be spilled to disk into ephemeral files.
//! An in-memory layer stores recently received key-value pairs.
//!
//! And there's another BTreeMap to track the size of the relation.
//! The "in-memory" part of the name is a bit misleading: the actual page versions are
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
//! its position in the file, is kept in memory, though.
//!
use crate::config::PageServerConf;
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
use crate::layered_repository::ephemeral_file::EphemeralFile;
use crate::layered_repository::filename::DeltaFileName;
use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
RELISH_SEG_SIZE,
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::ZERO_PAGE;
use crate::repository::ZenithWalRecord;
use crate::repository::{Key, Value};
use crate::walrecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{ensure, Result};
use bytes::Bytes;
use anyhow::Result;
use log::*;
use std::collections::HashMap;
use std::io::Seek;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::io::Write;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use std::sync::RwLock;
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::vec_map::VecMap;
@@ -33,7 +32,6 @@ pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
///
/// This layer contains all the changes from 'start_lsn'. The
@@ -41,27 +39,9 @@ pub struct InMemoryLayer {
///
start_lsn: Lsn,
///
/// LSN of the oldest page version stored in this layer.
///
/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
/// of a layer always matches the 'end_lsn' of its predecessor, even if there
/// are no page versions until a later LSN. That way you can detect any
/// missing layer files more easily. 'oldest_lsn' is the first page version
/// actually stored in this layer. In the range between 'start_lsn' and
/// 'oldest_lsn', there are no changes to the segment.
/// 'oldest_lsn' is used to adjust 'disk_consistent_lsn', which is why it should
/// point to the beginning of a WAL record. This is the other difference from 'start_lsn',
/// which points to the end of a WAL record, and why 'oldest_lsn' can be smaller than 'start_lsn'.
///
oldest_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and are protected by a lock.
inner: RwLock<InMemoryLayerInner>,
/// Predecessor layer might be needed?
incremental: bool,
}
pub struct InMemoryLayerInner {
@@ -69,98 +49,25 @@ pub struct InMemoryLayerInner {
/// Writes are only allowed when this is None
end_lsn: Option<Lsn>,
/// If this relation was dropped, remember when that happened.
/// The drop LSN is recorded in [`end_lsn`].
dropped: bool,
///
/// All versions of all values in the layer are kept here. Indexed
/// by key and LSN. The map entry is a BlobRef giving the offset (and
/// length) in the ephemeral file where the serialized value is stored.
///
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
/// The PageVersion structs are stored in a serialized format in this file.
/// Each serialized PageVersion is preceded by a 'u32' length field.
/// 'page_versions' map stores offsets into this file.
/// The values are stored in a serialized format in this file.
/// The 'index' map above stores each value's offset and length
/// (as a BlobRef) into this file.
file: EphemeralFile,
/// Metadata about all versions of all pages in the layer is kept
/// here. Indexed by block number and LSN. The value is an offset
/// into the ephemeral file where the page version is stored.
page_versions: HashMap<SegmentBlk, VecMap<Lsn, u64>>,
///
/// `seg_sizes` tracks the size of the segment at different points in time.
///
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
/// so that determining the size never depends on the predecessor layer. For
/// a non-blocky rel, 'seg_sizes' is not used and is always empty.
///
seg_sizes: VecMap<Lsn, SegmentBlk>,
///
/// LSN of the newest page version stored in this layer.
///
/// The difference between 'end_lsn' and 'latest_lsn' is the same as between
/// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
///
latest_lsn: Lsn,
end_offset: u64,
}
impl InMemoryLayerInner {
fn assert_writeable(&self) {
assert!(self.end_lsn.is_none());
}
fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk {
// Scan the BTreeMap backwards, starting from the given entry.
let slice = self.seg_sizes.slice_range(..=lsn);
// We make sure there is always at least one entry
if let Some((_entry_lsn, entry)) = slice.last() {
*entry
} else {
panic!("could not find seg size in in-memory layer");
}
}
///
/// Read a page version from the ephemeral file.
///
fn read_pv(&self, off: u64) -> Result<PageVersion> {
let mut buf = Vec::new();
self.read_pv_bytes(off, &mut buf)?;
Ok(PageVersion::des(&buf)?)
}
///
/// Read a page version from the ephemeral file, as raw bytes, at
/// the given offset. The bytes are read into 'buf', which is
/// expanded if necessary. Returns the size of the page version.
///
fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
// read length
let mut lenbuf = [0u8; 4];
self.file.read_exact_at(&mut lenbuf, off)?;
let len = u32::from_ne_bytes(lenbuf) as usize;
if buf.len() < len {
buf.resize(len, 0);
}
self.file.read_exact_at(&mut buf[0..len], off + 4)?;
Ok(len)
}
fn write_pv(&mut self, pv: &PageVersion) -> Result<u64> {
// remember starting position
let pos = self.file.stream_position()?;
// make room for the 'length' field by writing zeros as a placeholder.
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
pv.ser_into(&mut self.file).unwrap();
// write the 'length' field.
let len = self.file.stream_position()? - pos - 4;
let lenbuf = u32::to_ne_bytes(len as u32);
self.file.write_all_at(&lenbuf, pos)?;
Ok(pos)
}
}
impl Layer for InMemoryLayer {
@@ -170,21 +77,12 @@ impl Layer for InMemoryLayer {
fn filename(&self) -> PathBuf {
let inner = self.inner.read().unwrap();
let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
drop_lsn
} else {
Lsn(u64::MAX)
};
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn,
dropped: inner.dropped,
}
.to_string();
PathBuf::from(format!("inmem-{}", delta_filename))
PathBuf::from(format!(
"inmem-{:016X}-{:016X}",
self.start_lsn.0, end_lsn.0
))
}
fn get_tenant_id(&self) -> ZTenantId {
@@ -195,132 +93,78 @@ impl Layer for InMemoryLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
}
fn get_start_lsn(&self) -> Lsn {
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
fn get_lsn_range(&self) -> Range<Lsn> {
let inner = self.inner.read().unwrap();
if let Some(end_lsn) = inner.end_lsn {
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
end_lsn
} else {
Lsn(u64::MAX)
}
};
self.start_lsn..end_lsn
}
fn is_dropped(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.dropped
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
/// Look up given value in the layer.
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
assert!(lsn_range.start <= self.start_lsn);
let mut need_image = true;
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
let inner = self.inner.read().unwrap();
{
let inner = self.inner.read().unwrap();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.page_versions.get(&blknum) {
let slice = vec_map.slice_range(..=lsn);
for (entry_lsn, pos) in slice.iter().rev() {
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
}
_ => {}
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, blob_ref) in slice.iter().rev() {
match &reconstruct_state.img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
return Ok(ValueReconstructResult::Complete)
}
_ => {}
}
let pv = inner.read_pv(*pos)?;
match pv {
PageVersion::Page(img) => {
reconstruct_data.page_img = Some((*entry_lsn, img));
let mut buf = vec![0u8; blob_ref.size()];
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
PageVersion::Wal(rec) => {
reconstruct_data.records.push((*entry_lsn, rec.clone()));
if rec.will_init() {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// If we didn't find any records for this, check if the request is beyond EOF
if need_image
&& reconstruct_data.records.is_empty()
&& self.seg.rel.is_blocky()
&& blknum >= self.get_seg_size(lsn)?
{
return Ok(PageReconstructResult::Missing(self.start_lsn));
}
// release lock on 'inner'
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know
// caller know.
if need_image {
if self.incremental {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
} else {
Ok(PageReconstructResult::Missing(self.start_lsn))
}
Ok(ValueReconstructResult::Continue)
} else {
Ok(PageReconstructResult::Complete)
Ok(ValueReconstructResult::Complete)
}
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
let inner = self.inner.read().unwrap();
Ok(inner.get_seg_size(lsn))
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.read().unwrap();
// If the segment was created after the requested LSN,
// it doesn't exist in the layer. But we shouldn't
// have requested it in the first place.
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the segment was dropped?
if inner.dropped {
if let Some(end_lsn) = inner.end_lsn {
if lsn >= end_lsn {
return Ok(false);
}
} else {
panic!("dropped in-memory layer with no end LSN");
}
}
// Otherwise, it exists
Ok(true)
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
/// Cannot unload anything in an in-memory layer, since there's no backing
@@ -337,7 +181,8 @@ impl Layer for InMemoryLayer {
}
fn is_incremental(&self) -> bool {
self.incremental
// in-memory layer is always considered incremental.
true
}
fn is_in_memory(&self) -> bool {
@@ -355,29 +200,36 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
"----- in-memory layer for tli {} LSNs {}-{} ----",
self.timelineid, self.start_lsn, end_str,
);
for (k, v) in inner.seg_sizes.as_slice() {
println!("seg_sizes {}: {}", k, v);
}
// List the blocks in order
let mut page_versions: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> =
inner.page_versions.iter().collect();
page_versions.sort_by_key(|k| k.0);
for (blknum, versions) in page_versions {
for (lsn, off) in versions.as_slice() {
let pv = inner.read_pv(*off);
let pv_description = match pv {
Ok(PageVersion::Page(_img)) => "page",
Ok(PageVersion::Wal(_rec)) => "wal",
Err(_err) => "INVALID",
};
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
for (lsn, blob_ref) in vec_map.as_slice() {
let mut desc = String::new();
buf.resize(blob_ref.size(), 0);
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
}
@@ -385,23 +237,7 @@ impl Layer for InMemoryLayer {
}
}
/// The result of an in-memory layer's data being written to disk.
pub struct LayersOnDisk {
pub delta_layers: Vec<DeltaLayer>,
pub image_layers: Vec<ImageLayer>,
}
impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_lsn(&self) -> Lsn {
self.oldest_lsn
}
pub fn get_latest_lsn(&self) -> Lsn {
let inner = self.inner.read().unwrap();
inner.latest_lsn
}
///
/// Create a new, empty, in-memory layer
///
@@ -409,268 +245,89 @@ impl InMemoryLayer {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
timelineid,
start_lsn
);
// The segment is initially empty, so initialize 'seg_sizes' with 0.
let mut seg_sizes = VecMap::default();
if seg.rel.is_blocky() {
seg_sizes.append(start_lsn, 0).unwrap();
}
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
oldest_lsn,
incremental: false,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
index: HashMap::new(),
file,
page_versions: HashMap::new(),
seg_sizes,
latest_lsn: oldest_lsn,
end_offset: 0,
}),
})
}
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(
&self,
lsn: Lsn,
blknum: SegmentBlk,
rec: ZenithWalRecord,
) -> Result<u32> {
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result<u32> {
self.put_page_version(blknum, lsn, PageVersion::Page(img))
}
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result<u32> {
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
trace!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
self.timelineid,
lsn
);
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
assert!(lsn >= inner.latest_lsn);
inner.latest_lsn = lsn;
// Write the page version to the file, and remember its offset in 'page_versions'
{
let off = inner.write_pv(&pv)?;
let vec_map = inner.page_versions.entry(blknum).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
}
// Also update the relation size, if this extended the relation.
if self.seg.rel.is_blocky() {
let newsize = blknum + 1;
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
// which we've just acquired above
let oldsize = inner.get_seg_size(lsn);
if newsize > oldsize {
trace!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
newsize,
lsn
);
// If we are extending the relation by more than one page, initialize the "gap"
// with zeros
//
// XXX: What if the caller initializes the gap with a subsequent call at the same LSN?
// I don't think that can happen currently, but that is highly dependent on how
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
// happen, we would hit the "page version already exists" warning above on the
// subsequent call to initialize the gap page.
for gapblknum in oldsize..blknum {
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
trace!(
"filling gap blk {} with zeros for write of {}",
gapblknum,
blknum
);
// Write the page version to the file, and remember its offset in
// 'page_versions'
{
let off = inner.write_pv(&zeropv)?;
let vec_map = inner.page_versions.entry(gapblknum).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
warn!(
"Page version of seg {} blk {} at {} already exists",
self.seg, gapblknum, lsn
);
}
}
}
inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap();
return Ok(newsize - oldsize);
}
}
Ok(0)
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) {
assert!(
self.seg.rel.is_blocky(),
"put_truncation() called on a non-blocky rel"
);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
// check that this we truncate to a smaller size than segment was before the truncation
let old_size = inner.get_seg_size(lsn);
assert!(new_size < old_size);
let (old, _delta_size) = inner
.seg_sizes
.append_or_update_last(lsn, new_size)
.unwrap();
let off = inner.end_offset;
let buf = Value::ser(&val)?;
let len = buf.len();
inner.file.write_all(&buf)?;
inner.end_offset += len as u64;
let vec_map = inner.index.entry(key).or_default();
let blob_ref = BlobRef::new(off, len, val.will_init());
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
}
/// Remember that the segment was dropped at given LSN
pub fn drop_segment(&self, lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
assert!(inner.end_lsn.is_none());
assert!(!inner.dropped);
inner.dropped = true;
assert!(self.start_lsn < lsn);
inner.end_lsn = Some(lsn);
trace!("dropped segment {} at {}", self.seg, lsn);
}
///
/// Initialize a new InMemoryLayer by copying the state at the given
/// point in time from the given existing layer.
///
pub fn create_successor_layer(
conf: &'static PageServerConf,
src: Arc<dyn Layer>,
timelineid: ZTimelineId,
tenantid: ZTenantId,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();
assert!(oldest_lsn.is_aligned());
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
start_lsn,
);
// Copy the segment size at the start LSN from the predecessor layer.
let mut seg_sizes = VecMap::default();
if seg.rel.is_blocky() {
let size = src.get_seg_size(start_lsn)?;
seg_sizes.append(start_lsn, size).unwrap();
warn!("Key {} at {} already exists", key, lsn);
}
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
oldest_lsn,
incremental: true,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
file,
page_versions: HashMap::new(),
seg_sizes,
latest_lsn: oldest_lsn,
}),
})
Ok(())
}
pub fn is_writeable(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.end_lsn.is_none()
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())
}
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is inclusive
/// `end_lsn` is exclusive
pub fn freeze(&self, end_lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
if inner.end_lsn.is_some() {
assert!(inner.dropped);
} else {
assert!(!inner.dropped);
assert!(self.start_lsn < end_lsn + 1);
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
assert!(self.start_lsn < end_lsn);
inner.end_lsn = Some(end_lsn);
if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() {
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
// FIXME
/*
for perseg in inner.segs.values() {
if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() {
assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
for (_blk, vec_map) in inner.page_versions.iter() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn <= end_lsn);
for (_blk, vec_map) in perseg.page_versions.iter() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
}
}
*/
}
/// Write the this frozen in-memory layer to disk.
/// Write this frozen in-memory layer to disk.
///
/// Returns new layers that replace this one.
/// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
@@ -678,17 +335,7 @@ impl InMemoryLayer {
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
pub fn write_to_disk(
&self,
timeline: &LayeredTimeline,
reconstruct_pages: bool,
) -> Result<LayersOnDisk> {
trace!(
"write_to_disk {} get_end_lsn is {}",
self.filename().display(),
self.get_end_lsn()
);
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -700,105 +347,32 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
// Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN
// that is included.
let end_lsn_exclusive = inner.end_lsn.unwrap();
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
Key::MIN,
self.start_lsn..inner.end_lsn.unwrap(),
)?;
// Figure out if we should create a delta layer, image layer, or both.
let image_lsn: Option<Lsn>;
let delta_end_lsn: Option<Lsn>;
if self.is_dropped() || !reconstruct_pages {
// The segment was dropped. Create just a delta layer containing all the
// changes up to and including the drop.
delta_end_lsn = Some(end_lsn_exclusive);
image_lsn = None;
} else if self.start_lsn == end_lsn_inclusive {
// The layer contains exactly one LSN. It's enough to write an image
// layer at that LSN.
delta_end_lsn = None;
image_lsn = Some(end_lsn_inclusive);
} else {
// Create a delta layer with all the changes up to the end LSN,
// and an image layer at the end LSN.
//
// Note that the delta layer does *not* include the page versions
// at the end LSN. They are included in the image layer, and there's
// no need to store them twice.
delta_end_lsn = Some(end_lsn_inclusive);
image_lsn = Some(end_lsn_inclusive);
}
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
if let Some(delta_end_lsn) = delta_end_lsn {
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
delta_end_lsn,
self.is_dropped(),
)?;
// Write all page versions, in block + LSN order
let mut buf: Vec<u8> = Vec::new();
let pv_iter = inner.page_versions.iter();
let mut pages: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> = pv_iter.collect();
pages.sort_by_key(|(blknum, _vec_map)| *blknum);
for (blknum, vec_map) in pages {
for (lsn, pos) in vec_map.as_slice() {
if *lsn < delta_end_lsn {
let len = inner.read_pv_bytes(*pos, &mut buf)?;
delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?;
}
let mut do_steps = || -> Result<()> {
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, blob_ref) in vec_map.as_slice() {
let mut buf = vec![0u8; blob_ref.size()];
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
let val = Value::des(&buf)?;
delta_layer_writer.put_value(*key, *lsn, val)?;
}
}
// Create seg_sizes
let seg_sizes = if delta_end_lsn == end_lsn_exclusive {
inner.seg_sizes.clone()
} else {
inner.seg_sizes.split_at(&end_lsn_exclusive).0
};
let delta_layer = delta_layer_writer.finish(seg_sizes)?;
delta_layers.push(delta_layer);
Ok(())
};
if let Err(err) = do_steps() {
delta_layer_writer.abort();
return Err(err);
}
drop(inner);
// Write a new base image layer at the cutoff point
if let Some(image_lsn) = image_lsn {
let size = if self.seg.rel.is_blocky() {
self.get_seg_size(image_lsn)?
} else {
1
};
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
image_lsn,
size,
)?;
for blknum in 0..size {
let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?;
image_layer_writer.put_page_image(&img)?;
}
let image_layer = image_layer_writer.finish()?;
image_layers.push(image_layer);
}
Ok(LayersOnDisk {
delta_layers,
image_layers,
})
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
Ok(delta_layer)
}
}
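The branching in the removed write_to_disk can be summarized as a small pure function. This is a sketch of the delta/image decision only, not the real writer machinery; u64 stands in for Lsn:

// Sketch of the delta/image decision from the old write_to_disk.
// Returns (delta_end_lsn, image_lsn); either may be absent.
fn plan_layers(
    dropped: bool,
    reconstruct_pages: bool,
    start_lsn: u64,
    end_lsn_inclusive: u64,
) -> (Option<u64>, Option<u64>) {
    if dropped || !reconstruct_pages {
        // Dropped segment: just a delta layer with everything up to the drop.
        (Some(end_lsn_inclusive + 1), None)
    } else if start_lsn == end_lsn_inclusive {
        // Exactly one LSN: an image layer at that LSN suffices.
        (None, Some(end_lsn_inclusive))
    } else {
        // Both: a delta layer up to (but excluding) the end LSN, plus an
        // image layer at the end LSN, so nothing is stored twice.
        (Some(end_lsn_inclusive), Some(end_lsn_inclusive))
    }
}

fn main() {
    assert_eq!(plan_layers(true, true, 10, 20), (Some(21), None));
    assert_eq!(plan_layers(false, true, 20, 20), (None, Some(20)));
    assert_eq!(plan_layers(false, true, 10, 20), (Some(20), Some(20)));
}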

View File

@@ -1,468 +0,0 @@
///
/// IntervalTree is a data structure for holding intervals. It is generic
/// to make unit testing possible, but the only real user of it is the layer map.
///
/// It's inspired by the "segment tree" or a "statistic tree" as described in
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
/// the points instead of a binary tree. This is called an "interval tree" instead
/// of "segment tree" because the term "segment" is already using Zenith to mean
/// something else. To add to the confusion, there is another data structure known
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
/// for storing intervals, but this isn't that.
///
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
/// there is a list of intervals that contain the point. The Points are formed
/// from the start bounds of each interval; there is a Point for each distinct
/// start bound.
///
/// Operations:
///
/// To find intervals that contain a given point, you search the b-tree to find
/// the nearest Point <= search key. Then you just return the list of intervals.
///
/// To insert an interval, find the Point with start key equal to the inserted item.
/// If the Point doesn't exist yet, create it, by copying all the items from the
/// previous Point that cover the new Point. Then walk right, inserting the new
/// interval to all the Points that are contained by the new interval (including the
/// newly created Point).
///
/// To remove an interval, you scan the tree for all the Points that are contained by
/// the removed interval, and remove it from the list in each Point.
///
/// Requirements and assumptions:
///
/// - Can store overlapping items
/// - But there are not many overlapping items
/// - The interval bounds don't change after it is added to the tree
/// - Intervals are uniquely identified by pointer equality. You must not insert the
/// same interval object twice, and `remove` uses pointer equality to remove the right
/// interval. It is OK to have two intervals with the same bounds, however.
///
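A concrete example of the Point invariant described above, with the two intervals [10, 20) and [15, 25): the tree holds a Point at 10 containing only the first interval, and a Point at 15 containing both, because the first interval still covers key 15. A search for key 17 lands on Point 15 and returns the candidate with the highest end bound. A toy model of that structure, assuming nothing beyond std:

use std::collections::BTreeMap;

// Toy model of the Point structure: map from start key to the intervals
// (start, end) that contain that point. Hypothetical, for illustration only.
fn main() {
    let mut points: BTreeMap<u32, Vec<(u32, u32)>> = BTreeMap::new();
    points.insert(10, vec![(10, 20)]);
    points.insert(15, vec![(10, 20), (15, 25)]); // (10, 20) still covers 15
    // Search for key 17: nearest Point <= 17 is 15; pick the longest reach.
    let (_, elems) = points.range(..=17u32).next_back().unwrap();
    let best = elems.iter().max_by_key(|(_, end)| *end).unwrap();
    assert_eq!(*best, (15, 25));
}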
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::ops::Range;
use std::sync::Arc;
pub struct IntervalTree<I: ?Sized>
where
I: IntervalItem,
{
points: BTreeMap<I::Key, Point<I>>,
}
struct Point<I: ?Sized> {
/// All intervals that contain this point, in no particular order.
///
/// We assume that there aren't a lot of overlapping intervals, so that this vector
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
/// a linear search is OK.
elements: Vec<Arc<I>>,
}
/// Abstraction for an interval that can be stored in the tree
///
/// The start bound is inclusive and the end bound is exclusive. End must be greater
/// than start.
pub trait IntervalItem {
type Key: Ord + Copy + Debug + Sized;
fn start_key(&self) -> Self::Key;
fn end_key(&self) -> Self::Key;
fn bounds(&self) -> Range<Self::Key> {
self.start_key()..self.end_key()
}
}
impl<I: ?Sized> IntervalTree<I>
where
I: IntervalItem,
{
/// Return an element that contains 'key', or precedes it.
///
/// If there are multiple candidates, returns the one with the highest 'end' key.
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
// Find the greatest point that precedes or is equal to the search key. If there is
// none, returns None.
let (_, p) = self.points.range(..=key).next_back()?;
// Find the element with the highest end key at this point
let highest_item = p
.elements
.iter()
.reduce(|a, b| {
// starting with Rust 1.53, could use `std::cmp::max_by_key` here
if a.end_key() > b.end_key() {
a
} else {
b
}
})
.unwrap();
Some(Arc::clone(highest_item))
}
/// Iterate over all items with start bound >= 'key'
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(key..),
elem_iter: None,
}
}
/// Iterate over all items
pub fn iter(&self) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(..),
elem_iter: None,
}
}
pub fn insert(&mut self, item: Arc<I>) {
let start_key = item.start_key();
let end_key = item.end_key();
assert!(start_key < end_key);
let bounds = start_key..end_key;
// Find the starting point and walk forward from there
let mut found_start_point = false;
let iter = self.points.range_mut(bounds);
for (point_key, point) in iter {
if *point_key == start_key {
found_start_point = true;
// It is an error to insert the same item to the tree twice.
assert!(
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
"interval is already in the tree"
);
}
point.elements.push(Arc::clone(&item));
}
if !found_start_point {
// Create a new Point for the starting point
// Look at the previous point, and copy over elements that overlap with this
// new point
let mut new_elements: Vec<Arc<I>> = Vec::new();
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
let overlapping_prev_elements = prev_point
.elements
.iter()
.filter(|x| x.bounds().contains(&start_key))
.cloned();
new_elements.extend(overlapping_prev_elements);
}
new_elements.push(item);
let new_point = Point {
elements: new_elements,
};
self.points.insert(start_key, new_point);
}
}
pub fn remove(&mut self, item: &Arc<I>) {
// range search points
let start_key = item.start_key();
let end_key = item.end_key();
let bounds = start_key..end_key;
let mut points_to_remove: Vec<I::Key> = Vec::new();
let mut found_start_point = false;
for (point_key, point) in self.points.range_mut(bounds) {
if *point_key == start_key {
found_start_point = true;
}
let len_before = point.elements.len();
point.elements.retain(|other| !Arc::ptr_eq(other, item));
let len_after = point.elements.len();
assert_eq!(len_after + 1, len_before);
if len_after == 0 {
points_to_remove.push(*point_key);
}
}
assert!(found_start_point);
for k in points_to_remove {
self.points.remove(&k).unwrap();
}
}
}
pub struct IntervalIter<'a, I: ?Sized>
where
I: IntervalItem,
{
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
}
impl<'a, I> Iterator for IntervalIter<'a, I>
where
I: IntervalItem + ?Sized,
{
type Item = Arc<I>;
fn next(&mut self) -> Option<Self::Item> {
// Iterate over all elements in all the points in 'point_iter'. To avoid
// returning the same element twice, we only return each element at its
// starting point.
loop {
// Return next remaining element from the current point
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
for elem in elem_iter {
if elem.start_key() == *point_key {
return Some(Arc::clone(elem));
}
}
}
// No more elements at this point. Move to next point.
if let Some((point_key, point)) = self.point_iter.next() {
self.elem_iter = Some((*point_key, point.elements.iter()));
continue;
} else {
// No more points, all done
return None;
}
}
}
}
impl<I: ?Sized> Default for IntervalTree<I>
where
I: IntervalItem,
{
fn default() -> Self {
IntervalTree {
points: BTreeMap::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fmt;
#[derive(Debug)]
struct MockItem {
start_key: u32,
end_key: u32,
val: String,
}
impl IntervalItem for MockItem {
type Key = u32;
fn start_key(&self) -> u32 {
self.start_key
}
fn end_key(&self) -> u32 {
self.end_key
}
}
impl MockItem {
fn new(start_key: u32, end_key: u32) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}", start_key, end_key),
}
}
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}: {}", start_key, end_key, val),
}
}
}
impl fmt::Display for MockItem {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.val)
}
}
#[rustfmt::skip]
fn assert_search(
tree: &IntervalTree<MockItem>,
key: u32,
expected: &[&str],
) -> Option<Arc<MockItem>> {
if let Some(v) = tree.search(key) {
let vstr = v.to_string();
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
assert!(
expected.contains(&vstr.as_str()),
"search with {} returned {}, expected one of: {:?}",
key, v, expected,
);
Some(v)
} else {
assert!(
expected.is_empty(),
"search with {} returned None, expected one of {:?}",
key, expected
);
None
}
}
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
contents.sort();
assert_eq!(contents, expected);
}
fn dump_tree(tree: &IntervalTree<MockItem>) {
for (point_key, point) in tree.points.iter() {
print!("{}:", point_key);
for e in point.elements.iter() {
print!(" {}", e);
}
println!();
}
}
#[test]
fn test_interval_tree_simple() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Simple, non-overlapping ranges.
tree.insert(Arc::new(MockItem::new(10, 11)));
tree.insert(Arc::new(MockItem::new(11, 12)));
tree.insert(Arc::new(MockItem::new(12, 13)));
tree.insert(Arc::new(MockItem::new(18, 19)));
tree.insert(Arc::new(MockItem::new(17, 18)));
tree.insert(Arc::new(MockItem::new(15, 16)));
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &["10-11"]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["12-13"]);
assert_search(&tree, 13, &["12-13"]);
assert_search(&tree, 14, &["12-13"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 16, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["18-19"]);
assert_search(&tree, 19, &["18-19"]);
assert_search(&tree, 20, &["18-19"]);
// remove a few entries and search around them again
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &[]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["11-12"]);
assert_search(&tree, 14, &["11-12"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["17-18"]);
}
#[test]
fn test_interval_tree_overlap() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Overlapping items
tree.insert(Arc::new(MockItem::new(22, 24)));
tree.insert(Arc::new(MockItem::new(23, 25)));
let x24_26 = Arc::new(MockItem::new(24, 26));
tree.insert(Arc::clone(&x24_26));
let x26_28 = Arc::new(MockItem::new(26, 28));
tree.insert(Arc::clone(&x26_28));
tree.insert(Arc::new(MockItem::new(25, 27)));
assert_search(&tree, 22, &["22-24"]);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25", "24-26"]);
assert_search(&tree, 25, &["24-26", "25-27"]);
assert_search(&tree, 26, &["25-27", "26-28"]);
assert_search(&tree, 27, &["26-28"]);
assert_search(&tree, 28, &["26-28"]);
assert_search(&tree, 29, &["26-28"]);
tree.remove(&x24_26);
tree.remove(&x26_28);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25"]);
assert_search(&tree, 25, &["25-27"]);
assert_search(&tree, 26, &["25-27"]);
assert_search(&tree, 27, &["25-27"]);
assert_search(&tree, 28, &["25-27"]);
assert_search(&tree, 29, &["25-27"]);
}
#[test]
fn test_interval_tree_nested() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Items containing other items
tree.insert(Arc::new(MockItem::new(31, 39)));
tree.insert(Arc::new(MockItem::new(32, 34)));
tree.insert(Arc::new(MockItem::new(33, 35)));
tree.insert(Arc::new(MockItem::new(30, 40)));
assert_search(&tree, 30, &["30-40"]);
assert_search(&tree, 31, &["30-40", "31-39"]);
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
assert_search(&tree, 35, &["30-40", "31-39"]);
assert_search(&tree, 36, &["30-40", "31-39"]);
assert_search(&tree, 37, &["30-40", "31-39"]);
assert_search(&tree, 38, &["30-40", "31-39"]);
assert_search(&tree, 39, &["30-40"]);
assert_search(&tree, 40, &["30-40"]);
assert_search(&tree, 41, &["30-40"]);
}
#[test]
fn test_interval_tree_duplicates() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Duplicate keys
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
tree.insert(Arc::clone(&item_a));
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
tree.insert(Arc::clone(&item_b));
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
tree.insert(Arc::clone(&item_c));
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
tree.insert(Arc::clone(&item_d));
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
tree.insert(Arc::clone(&item_e));
dump_tree(&tree);
assert_search(
&tree,
55,
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
);
tree.remove(&item_b);
dump_tree(&tree);
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
tree.remove(&item_d);
dump_tree(&tree);
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
}
#[test]
#[should_panic]
fn test_interval_tree_insert_twice() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Inserting the same item twice is not cool
let item = Arc::new(MockItem::new(1, 2));
tree.insert(Arc::clone(&item));
tree.insert(Arc::clone(&item)); // fails assertion
}
}

View File

@@ -3,30 +3,27 @@
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL
//! is received, we create InMemoryLayers to hold the incoming records. Now and
//! then, in the checkpoint() function, the in-memory layers are frozen, forming
//! new image and delta layers and corresponding files are written to disk.
//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
//! new WAL record is received, we create an InMemoryLayer to hold the incoming
//! records. Now and then, in the checkpoint() function, the in-memory layer is
//! frozen, and it is split up into new image and delta layers and the
//! corresponding files are written to disk.
//!
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
use crate::layered_repository::InMemoryLayer;
use crate::relish::*;
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
use zenith_metrics::{register_int_gauge, IntGauge};
use zenith_utils::lsn::Lsn;
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
lazy_static! {
static ref NUM_INMEMORY_LAYERS: IntGauge =
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
.expect("failed to define a metric");
static ref NUM_ONDISK_LAYERS: IntGauge =
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric");
@@ -37,98 +34,135 @@ lazy_static! {
///
#[derive(Default)]
pub struct LayerMap {
/// All the layers keyed by segment tag
segs: HashMap<SegmentTag, SegEntry>,
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
// the start LSN at which the next InMemoryLayer is to be created.
//
pub open_layer: Option<Arc<InMemoryLayer>>,
pub next_open_layer_at: Option<Lsn>,
/// All in-memory layers, ordered by 'oldest_lsn' and generation
/// of each layer. This allows easy access to the in-memory layer that
/// contains the oldest WAL record.
open_layers: BinaryHeap<OpenLayerEntry>,
///
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
/// layer exists during checkpointing, when an InMemoryLayer is being written out
/// to disk.
///
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
/// All the historic layers are kept here
/// TODO: This is a placeholder implementation of a data structure
/// to hold information about all the layer files on disk and in
/// S3. Currently, it's just a vector and all operations perform a
/// linear scan over it. That obviously becomes slow as the
/// number of layers grows. I'm imagining that an R-tree or some
/// other 2D data structure would be the long-term solution here.
historic_layers: Vec<Arc<dyn Layer>>,
}
pub struct SearchResult {
pub layer: Arc<dyn Layer>,
pub lsn_floor: Lsn,
}
impl LayerMap {
///
/// Look up a layer using the given segment tag and LSN. This differs from a
/// plain key-value lookup in that if there is any layer that covers the
/// given LSN, or precedes the given LSN, it is returned. In other words,
/// you don't need to know the exact start LSN of the layer.
///
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let segentry = self.segs.get(tag)?;
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
// linear search
// Find the latest image layer that covers the given key
let mut latest_img: Option<Arc<dyn Layer>> = None;
let mut latest_img_lsn: Option<Lsn> = None;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
let img_lsn = l.get_lsn_range().start;
segentry.get(lsn)
}
if img_lsn >= end_lsn {
// too new
continue;
}
if Lsn(img_lsn.0 + 1) == end_lsn {
// found exact match
return Ok(Some(SearchResult {
layer: Arc::clone(l),
lsn_floor: img_lsn,
}));
}
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
latest_img = Some(Arc::clone(l));
latest_img_lsn = Some(img_lsn);
}
}
///
/// Get the open layer for given segment for writing. Or None if no open
/// layer exists.
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
segentry
.open_layer_id
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
let layer_id = segentry.update_open(Arc::clone(&layer));
let oldest_lsn = layer.get_oldest_lsn();
// After a crash and restart, 'oldest_lsn' of the oldest in-memory
// layer becomes the WAL streaming starting point, so it better not point
// in the middle of a WAL record.
assert!(oldest_lsn.is_aligned());
// Also add it to the binary heap
let open_layer_entry = OpenLayerEntry {
oldest_lsn: layer.get_oldest_lsn(),
layer_id,
generation: self.current_generation,
};
self.open_layers.push(open_layer_entry);
NUM_INMEMORY_LAYERS.inc();
}
/// Remove an open in-memory layer
pub fn remove_open(&mut self, layer_id: LayerId) {
// Note: we don't try to remove the entry from the binary heap.
// It will be removed lazily by peek_oldest_open() when it's made it to
// the top of the heap.
let layer_opt = {
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
let layer_opt = global_map.get(&layer_id);
global_map.remove(&layer_id);
// TODO it's bad that a ref can still exist after being evicted from cache
layer_opt
};
if let Some(layer) = layer_opt {
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
if segentry.open_layer_id == Some(layer_id) {
// Also remove it from the SegEntry of this segment
segentry.open_layer_id = None;
} else {
// We could have already updated segentry.open for
// dropped (non-writeable) layer. This is fine.
assert!(!layer.is_writeable());
assert!(layer.is_dropped());
// Search the delta layers
let mut latest_delta: Option<Arc<dyn Layer>> = None;
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
NUM_INMEMORY_LAYERS.dec();
if l.get_lsn_range().start >= end_lsn {
// too new
continue;
}
if l.get_lsn_range().end >= end_lsn {
// this layer contains the requested point in the key/lsn space.
// No need to search any further
trace!(
"found layer {} for request on {} at {}",
l.filename().display(),
key,
end_lsn
);
latest_delta.replace(Arc::clone(l));
break;
}
// this layer's end LSN is smaller than the requested point. If there's
// nothing newer, this is what we need to return. Remember this.
if let Some(ref old_candidate) = latest_delta {
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
latest_delta.replace(Arc::clone(l));
}
} else {
latest_delta.replace(Arc::clone(l));
}
}
if let Some(l) = latest_delta {
trace!(
"found (old) layer {} for request on {} at {}",
l.filename().display(),
key,
end_lsn
);
let lsn_floor = std::cmp::max(
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
l.get_lsn_range().start,
);
Ok(Some(SearchResult {
lsn_floor,
layer: l,
}))
} else if let Some(l) = latest_img {
trace!(
"found img layer and no deltas for request on {} at {}",
key,
end_lsn
);
Ok(Some(SearchResult {
lsn_floor: latest_img_lsn.unwrap(),
layer: l,
}))
} else {
trace!("no layer found for request on {} at {}", key, end_lsn);
Ok(None)
}
}
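The lsn_floor returned above tells the caller how far back it still has to search: for a delta layer it is the later of the delta's own start LSN and the LSN just above the newest image found. A tiny sketch of that computation, with hypothetical u64 values in place of Lsn:

// lsn_floor = max(latest_img_lsn + 1, delta_start_lsn); u64 stands in for Lsn.
fn lsn_floor(latest_img_lsn: Option<u64>, delta_start_lsn: u64) -> u64 {
    std::cmp::max(latest_img_lsn.map_or(0, |l| l + 1), delta_start_lsn)
}

fn main() {
    // Image at LSN 100, delta covering 80..200: no need to look below 101.
    assert_eq!(lsn_floor(Some(100), 80), 101);
    // No image at all: the delta's own start LSN is the floor.
    assert_eq!(lsn_floor(None, 80), 80);
}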
@@ -136,9 +170,7 @@ impl LayerMap {
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
segentry.insert_historic(layer);
self.historic_layers.push(layer);
NUM_ONDISK_LAYERS.inc();
}
@@ -147,61 +179,62 @@ impl LayerMap {
///
/// This should be called when the corresponding file on disk has been deleted.
///
#[allow(dead_code)]
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
let tag = layer.get_seg_tag();
let len_before = self.historic_layers.len();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&layer);
}
// FIXME: ptr_eq might fail to return true for 'dyn'
// references. Clippy complains about this. In practice it
// seems to work, the assertion below would be triggered
// otherwise but this ought to be fixed.
#[allow(clippy::vtable_address_comparisons)]
self.historic_layers
.retain(|other| !Arc::ptr_eq(other, &layer));
assert_eq!(self.historic_layers.len(), len_before - 1);
NUM_ONDISK_LAYERS.dec();
}
// List relations along with a flag that marks if they exist at the given lsn.
// spcnode 0 and dbnode 0 have special meanings and mean all tablespaces/databases.
// Pass Tag if we're only interested in some relations.
pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
let mut rels: HashMap<RelishTag, bool> = HashMap::new();
for (seg, segentry) in self.segs.iter() {
match seg.rel {
RelishTag::Relation(reltag) => {
if let Some(request_rel) = tag {
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
{
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
}
}
_ => {
if tag == None {
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
}
}
}
Ok(rels)
}
/// Is there a newer image layer for given segment?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
/// We ignore segments newer than disk_consistent_lsn because they will be removed at restart
/// We also only look at historic layers
//#[allow(dead_code)]
pub fn newer_image_layer_exists(
&self,
seg: SegmentTag,
key_range: &Range<Key>,
lsn: Lsn,
disk_consistent_lsn: Lsn,
) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
} else {
false
) -> Result<bool> {
let mut range_remain = key_range.clone();
loop {
let mut made_progress = false;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
let img_lsn = l.get_lsn_range().start;
if !l.is_incremental()
&& l.get_key_range().contains(&range_remain.start)
&& img_lsn > lsn
&& img_lsn < disk_consistent_lsn
{
made_progress = true;
let img_key_end = l.get_key_range().end;
if img_key_end >= range_remain.end {
return Ok(true);
}
range_remain.start = img_key_end;
}
}
if !made_progress {
return Ok(false);
}
}
}
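The loop above is a greedy coverage check: keep advancing the start of the remaining key range past any qualifying image layer that contains it, and succeed once the whole range is covered. The same idea in isolation, over plain half-open u32 ranges rather than the real layer types:

use std::ops::Range;

// Does some subset of `covers` jointly cover all of `target`? Greedy version:
// repeatedly advance past any range containing the current start point.
fn is_covered(target: Range<u32>, covers: &[Range<u32>]) -> bool {
    let mut start = target.start;
    loop {
        let mut made_progress = false;
        for c in covers {
            if c.contains(&start) {
                if c.end >= target.end {
                    return true;
                }
                start = c.end;
                made_progress = true;
            }
        }
        if !made_progress {
            return false;
        }
    }
}

fn main() {
    assert!(is_covered(0..10, &[0..4, 4..12]));
    assert!(!is_covered(0..10, &[0..4, 5..12])); // gap at 4..5
}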
@@ -211,284 +244,144 @@ impl LayerMap {
/// used for garbage collection, to determine if some alive layer
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
/// to avoid incorrectly making it visible.
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.segs.get(&seg) {
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
} else {
false
})
/*
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
} else {
false
})
}
*/
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
self.historic_layers.iter()
}
/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
// Find the last image layer that covers the key
let mut candidate_lsn = Lsn(0);
let mut candidate = None;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
while let Some(oldest_entry) = self.open_layers.peek() {
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
} else {
self.open_layers.pop();
if !l.get_key_range().contains(&key) {
continue;
}
let this_lsn = l.get_lsn_range().start;
if this_lsn > lsn {
continue;
}
if this_lsn < candidate_lsn {
// our previous candidate was better
continue;
}
candidate_lsn = this_lsn;
candidate = Some(Arc::clone(l));
}
candidate
}
///
/// Divide the whole given range of keys into sub-ranges based on the latest
/// image layer that covers each range. (This is used when creating new
/// image layers)
///
// FIXME: clippy complains that the result type is very complex. She's probably
// right...
#[allow(clippy::type_complexity)]
pub fn image_coverage(
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
let mut points: Vec<Key>;
points = vec![key_range.start];
for l in self.historic_layers.iter() {
if l.get_lsn_range().start > lsn {
continue;
}
let range = l.get_key_range();
if key_range.contains(&range.start) {
points.push(l.get_key_range().start);
}
if key_range.contains(&range.end) {
points.push(l.get_key_range().end);
}
}
None
}
points.push(key_range.end);
/// Increment the generation number used to stamp open in-memory layers. Layers
/// added with `insert_open` after this call will be associated with the new
/// generation. Returns the new generation number.
pub fn increment_generation(&mut self) -> u64 {
self.current_generation += 1;
self.current_generation
}
points.sort();
points.dedup();
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
seg_iter: self.segs.iter(),
iter: None,
// Ok, we now have a list of "interesting" points in the key space
// For each range between the points, find the latest image
let mut start = *points.first().unwrap();
let mut ranges = Vec::new();
for end in points[1..].iter() {
let img = self.find_latest_image(start, lsn);
ranges.push((start..*end, img));
start = *end;
}
Ok(ranges)
}
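image_coverage is thus a sweep over "interesting" key points: every layer boundary that falls inside the queried range becomes a partition point, and each resulting sub-range is paired with its latest covering image. A standalone sketch of the point-collection step, with plain u32 keys and hypothetical layer ranges:

use std::ops::Range;

// Partition `key_range` at every layer boundary that falls inside it.
// Returns the sorted, deduplicated boundary points, as in image_coverage.
fn partition_points(key_range: Range<u32>, layers: &[Range<u32>]) -> Vec<u32> {
    let mut points = vec![key_range.start];
    for l in layers {
        if key_range.contains(&l.start) {
            points.push(l.start);
        }
        if key_range.contains(&l.end) {
            points.push(l.end);
        }
    }
    points.push(key_range.end);
    points.sort();
    points.dedup();
    points
}

fn main() {
    // Layers 5..15 and 10..30 partition 0..20 into 0..5, 5..10, 10..15, 15..20.
    assert_eq!(partition_points(0..20, &[5..15, 10..30]), vec![0, 5, 10, 15, 20]);
}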
pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
let mut result = 0;
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
continue;
}
if !range_overlaps(&l.get_key_range(), key_range) {
continue;
}
// We ignore level0 delta layers. Unless the whole keyspace fits
// into one partition
if !range_eq(key_range, &(Key::MIN..Key::MAX))
&& range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX))
{
continue;
}
result += 1;
}
Ok(result)
}
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
let mut deltas = Vec::new();
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if l.get_key_range() != (Key::MIN..Key::MAX) {
continue;
}
deltas.push(Arc::clone(l));
}
Ok(deltas)
}
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
println!("Begin dump LayerMap");
for (seg, segentry) in self.segs.iter() {
if let Some(open) = &segentry.open_layer_id {
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
layer.dump()?;
} else {
println!("layer not found in global map");
}
}
for layer in segentry.historic.iter() {
layer.dump()?;
}
for layer in self.historic_layers.iter() {
layer.dump()?;
}
println!("End dump LayerMap");
Ok(())
}
}
impl IntervalItem for dyn Layer {
type Key = Lsn;
fn start_key(&self) -> Lsn {
self.get_start_lsn()
}
fn end_key(&self) -> Lsn {
self.get_end_lsn()
}
}
///
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
/// associated with the segment.
///
/// The last layer that is open for writes is always an InMemoryLayer,
/// and is kept in a separate field, because there can be only one for
/// each segment. The older layers, stored on disk, are kept in an
/// IntervalTree.
#[derive(Default)]
struct SegEntry {
open_layer_id: Option<LayerId>,
historic: IntervalTree<dyn Layer>,
}
impl SegEntry {
/// Does the segment exist at given LSN?
/// Return None if object is not found in this SegEntry.
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
if let Some(layer) = self.get(lsn) {
Ok(Some(layer.get_seg_exists(lsn)?))
} else {
Ok(None)
}
}
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
if let Some(open_layer_id) = &self.open_layer_id {
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
if open_layer.get_start_lsn() <= lsn {
return Some(open_layer);
}
}
self.historic.search(lsn)
}
pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
// We only check on-disk layers, because
// in-memory layers are not durable
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
self.historic
.iter_newer(lsn)
.any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
}
// Set new open layer for a SegEntry.
// It's OK to rewrite the previous open layer,
// but only if it is not writeable anymore.
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
if let Some(prev_open_layer_id) = &self.open_layer_id {
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
{
assert!(!prev_open_layer.is_writeable());
}
}
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
self.open_layer_id = Some(open_layer_id);
open_layer_id
}
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
self.historic.insert(layer);
}
}
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
/// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
///
/// The generation number associated with each entry can be used to distinguish
/// recently-added entries (i.e after last call to increment_generation()) from older
/// entries with the same 'oldest_lsn'.
struct OpenLayerEntry {
oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
generation: u64,
layer_id: LayerId,
}
impl Ord for OpenLayerEntry {
fn cmp(&self, other: &Self) -> Ordering {
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that. Entries with identical oldest_lsn are ordered by generation
other
.oldest_lsn
.cmp(&self.oldest_lsn)
.then_with(|| other.generation.cmp(&self.generation))
}
}
impl PartialOrd for OpenLayerEntry {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OpenLayerEntry {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for OpenLayerEntry {}
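The Ord implementation above flips the comparison so that Rust's max-heap BinaryHeap behaves as a min-heap on (oldest_lsn, generation). The same trick in miniature, assuming nothing beyond the standard library:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Entry ordered so that the *smallest* (lsn, generation) surfaces at the top
// of a BinaryHeap, mirroring OpenLayerEntry's reversed Ord.
#[derive(PartialEq, Eq)]
struct Entry {
    lsn: u64,
    generation: u64,
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare other-to-self to reverse the default max-heap order.
        other
            .lsn
            .cmp(&self.lsn)
            .then_with(|| other.generation.cmp(&self.generation))
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Entry { lsn: 200, generation: 1 });
    heap.push(Entry { lsn: 100, generation: 2 });
    heap.push(Entry { lsn: 100, generation: 1 });
    let top = heap.pop().unwrap();
    assert_eq!((top.lsn, top.generation), (100, 1)); // min lsn, then min gen
}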
/// Iterator returned by LayerMap::iter_historic_layers()
pub struct HistoricLayerIter<'a> {
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<IntervalIter<'a, dyn Layer>>,
}
impl<'a> Iterator for HistoricLayerIter<'a> {
type Item = Arc<dyn Layer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&x));
}
}
if let Some((_tag, segentry)) = self.seg_iter.next() {
self.iter = Some(segentry.historic.iter());
continue;
} else {
return None;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config::PageServerConf;
use std::str::FromStr;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
spcnode: 0,
dbnode: 111,
relnode: 1000,
forknum: 0,
});
lazy_static! {
static ref DUMMY_TIMELINEID: ZTimelineId =
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
static ref DUMMY_TENANTID: ZTenantId =
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
}
/// Construct a dummy InMemoryLayer for testing
fn dummy_inmem_layer(
conf: &'static PageServerConf,
segno: u32,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Arc<InMemoryLayer> {
Arc::new(
InMemoryLayer::create(
conf,
*DUMMY_TIMELINEID,
*DUMMY_TENANTID,
SegmentTag {
rel: TESTREL_A,
segno,
},
start_lsn,
oldest_lsn,
)
.unwrap(),
)
}
#[test]
fn test_open_layers() -> Result<()> {
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
let conf = Box::leak(Box::new(conf));
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
let mut layers = LayerMap::default();
let gen1 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
let gen2 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
// A helper function (closure) to pop the next oldest open entry from the layer map,
// and assert that it is what we'd expect
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
assert!(l.get_seg_tag().segno == expected_segno);
assert!(generation == expected_generation);
layers.remove_open(layer_id);
};
assert_pop_layer(0, gen1); // 0x100
assert_pop_layer(5, gen2); // 0x100
assert_pop_layer(3, gen1); // 0x110
assert_pop_layer(4, gen2); // 0x110
assert_pop_layer(2, gen1); // 0x120
assert_pop_layer(1, gen1); // 0x200
Ok(())
}
}

View File

@@ -2,117 +2,85 @@
//! Common traits and structs for layers
//!
use crate::relish::RelishTag;
use crate::repository::{BlockNumber, ZenithWalRecord};
use crate::repository::{Key, Value};
use crate::walrecord::ZenithWalRecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::ops::Range;
use std::path::PathBuf;
use zenith_utils::lsn::Lsn;
// Size of one segment in pages (10 MB)
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
///
/// Each relish stored in the repository is divided into fixed-sized "segments",
/// with 10 MB of key-space, or 1280 8k pages each.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct SegmentTag {
pub rel: RelishTag,
pub segno: u32,
}
/// SegmentBlk represents a block number within a segment, or the size of segment.
///
/// This is separate from BlockNumber, which is used for block number within the
/// whole relish. Since this is just a type alias, the compiler will let you mix
/// them freely, but we use the type alias as documentation to make it clear
/// which one we're dealing with.
///
/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally
/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes
/// operations more verbose).
pub type SegmentBlk = u32;
impl fmt::Display for SegmentTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}.{}", self.rel, self.segno)
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
{
if a.start < b.start {
a.end > b.start
} else {
b.end > a.start
}
}
impl SegmentTag {
/// Given a relish and block number, calculate the corresponding segment and
/// block number within the segment.
pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) {
(
SegmentTag {
rel,
segno: blknum / RELISH_SEG_SIZE,
},
blknum % RELISH_SEG_SIZE,
)
}
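With RELISH_SEG_SIZE = 10 MB / 8 KB = 1280 pages, the mapping above is plain div/mod arithmetic. A quick worked example, with the constant inlined:

const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; // 1280 pages per segment

fn main() {
    let blknum: u32 = 3000;
    // Block 3000 of a relish lands in segment 2, at block 440 within it.
    assert_eq!(blknum / RELISH_SEG_SIZE, 2);
    assert_eq!(blknum % RELISH_SEG_SIZE, 440);
}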
pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialEq<T>,
{
a.start == b.start && a.end == b.end
}
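range_overlaps relies on the ranges being half-open: whichever range starts first must extend past the other's start. A few assertions to pin down the edge cases, using the same comparison as above:

use std::ops::Range;

// Same logic as range_overlaps above: half-open ranges overlap iff the
// earlier-starting one reaches past the other's start.
fn overlaps(a: &Range<u32>, b: &Range<u32>) -> bool {
    if a.start < b.start {
        a.end > b.start
    } else {
        b.end > a.start
    }
}

fn main() {
    assert!(overlaps(&(0..10), &(5..15)));
    assert!(!overlaps(&(0..10), &(10..20))); // touching ends don't overlap
    assert!(overlaps(&(5..6), &(0..100))); // containment counts
}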
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
/// Before first call, you can fill in 'page_img' if you have an older cached
/// version of the page available. That can save work in
/// 'get_value_reconstruct_data', as it can stop searching for page versions
/// when all the WAL records going back to the cached image have been collected.
///
/// A page version can be stored as a full page image, or as WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PageVersion {
Page(Bytes),
Wal(ZenithWalRecord),
}
///
/// Struct used to communicate across calls to 'get_page_reconstruct_data'.
///
/// Before first call to get_page_reconstruct_data, you can fill in 'page_img'
/// if you have an older cached version of the page available. That can save
/// work in 'get_page_reconstruct_data', as it can stop searching for page
/// versions when all the WAL records going back to the cached image have been
/// collected.
///
/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an
/// image of the page, or the oldest WAL record in 'records' is a will_init-type
/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
/// of the page, or the oldest WAL record in 'records' is a will_init-type
/// record that initializes the page without requiring a previous image.
///
/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have
/// been collected, but there are more records outside the current layer. Pass
/// the same PageReconstructData struct in the next 'get_page_reconstruct_data'
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
pub struct PageReconstructData {
#[derive(Debug)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, ZenithWalRecord)>,
pub page_img: Option<(Lsn, Bytes)>,
pub img: Option<(Lsn, Bytes)>,
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
Missing,
}
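The Complete/Continue/Missing contract implies a caller-side loop: start from the newest layer, accumulate records into the shared state, and on Continue move on to the predecessor layer. A hedged sketch of that loop shape, where each "layer" is just a closure and the types are simplified stand-ins, not the real API:

// Simplified stand-ins for the real types, for illustration only.
enum ValueReconstructResult { Complete, Continue, Missing }

struct ValueReconstructState { records: Vec<u64>, img: Option<u64> }

fn reconstruct(
    layers_newest_first: Vec<Box<dyn Fn(&mut ValueReconstructState) -> ValueReconstructResult>>,
) -> Result<ValueReconstructState, String> {
    let mut state = ValueReconstructState { records: Vec::new(), img: None };
    for layer in &layers_newest_first {
        match layer(&mut state) {
            // An image (or will_init record) was found; apply records on top.
            ValueReconstructResult::Complete => return Ok(state),
            // This layer ran out of data; keep going in the predecessor layer.
            ValueReconstructResult::Continue => continue,
            ValueReconstructResult::Missing => {
                return Err("no data for the requested page version".to_string())
            }
        }
    }
    Err("ran out of layers before reconstruction completed".to_string())
}

fn main() {
    let delta: Box<dyn Fn(&mut ValueReconstructState) -> ValueReconstructResult> =
        Box::new(|s| { s.records.push(42); ValueReconstructResult::Continue });
    let image: Box<dyn Fn(&mut ValueReconstructState) -> ValueReconstructResult> =
        Box::new(|s| { s.img = Some(7); ValueReconstructResult::Complete });
    let state = reconstruct(vec![delta, image]).unwrap();
    assert_eq!((state.records.len(), state.img), (1, Some(7)));
}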
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. On-disk layers are stored as files on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and on-disk layers.
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions. On-disk layers are stored as files on disk, and are
/// immutable. This trait presents the common functionality of in-memory and
/// on-disk layers.
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN
///
pub trait Layer: Send + Sync {
fn get_tenant_id(&self) -> ZTenantId;
@@ -120,21 +88,16 @@ pub trait Layer: Send + Sync {
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
/// Inclusive start bound of the LSN range that this layer holds
fn get_start_lsn(&self) -> Lsn;
/// Exclusive end bound of the LSN range that this layer holds.
///
/// - For an open in-memory layer, this is MAX_LSN.
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
fn get_lsn_range(&self) -> Range<Lsn>;
/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
@@ -153,18 +116,12 @@ pub trait Layer: Send + Sync {
/// is available. If this returns PageReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult>;
/// Return size of the segment at given LSN. (Only for blocky relations.)
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk>;
/// Does the segment exist at given LSN? Or was it dropped before it.
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
) -> Result<ValueReconstructResult>;
/// Does this layer only contain some data for the segment (incremental),
/// or does it contain a version of every page? This is important to know
@@ -175,6 +132,9 @@ pub trait Layer: Send + Sync {
/// Returns true for layers that are represented in memory.
fn is_in_memory(&self) -> bool;
/// Iterate through all keys and values stored in the layer
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
/// Release memory used by this layer. There is no corresponding 'load'
/// function, that's done implicitly when you call one of the get-functions.
fn unload(&self) -> Result<()>;
@@ -185,3 +145,36 @@ pub trait Layer: Send + Sync {
/// Dump summary of the contents of the layer to stdout
fn dump(&self) -> Result<()>;
}
// Flag indicating that this version initializes the page
const WILL_INIT: u64 = 1;
///
/// Struct representing a reference to a BLOB in a layer. The reference contains the BLOB's offset and size.
/// For WAL records (delta layer) it also carries a `will_init` flag, which helps to determine the range of
/// records that needs to be applied without reading/deserializing the records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
pub struct BlobRef(u64);
impl BlobRef {
pub fn will_init(&self) -> bool {
(self.0 & WILL_INIT) != 0
}
pub fn pos(&self) -> u64 {
self.0 >> 32
}
pub fn size(&self) -> usize {
((self.0 & 0xFFFFFFFF) >> 1) as usize
}
pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef {
let mut blob_ref = (pos << 32) | ((size as u64) << 1);
if will_init {
blob_ref |= WILL_INIT;
}
BlobRef(blob_ref)
}
}
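The packing above puts the byte offset in the high 32 bits, the size in bits 1..32, and the will_init flag in bit 0, so the offset and size must fit in 32 and 31 bits respectively. A quick round-trip check of that layout:

// Same bit layout as BlobRef above: pos in the high 32 bits, size shifted
// left by one, will_init in the lowest bit.
fn pack(pos: u64, size: u64, will_init: bool) -> u64 {
    (pos << 32) | (size << 1) | (will_init as u64)
}

fn main() {
    let r = pack(4096, 100, true);
    assert_eq!(r >> 32, 4096); // pos()
    assert_eq!((r & 0xFFFF_FFFF) >> 1, 100); // size()
    assert_eq!(r & 1, 1); // will_init()
}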

View File

@@ -2,9 +2,11 @@ pub mod basebackup;
pub mod config;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub mod layered_repository;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod relish;
pub mod remote_storage;
pub mod repository;
@@ -22,6 +24,9 @@ use lazy_static::lazy_static;
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use layered_repository::LayeredRepository;
use pgdatadir_mapping::DatadirTimeline;
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"pageserver_live_connections_count",
@@ -36,10 +41,12 @@ pub const LOG_FILE_NAME: &str = "pageserver.log";
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush in-memory data that is older than this
Distance(u64),
// Flush all in-memory data
Flush,
// Flush all in-memory data and reconstruct all page images
Forced,
}
pub type RepositoryImpl = LayeredRepository;
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;

View File

@@ -52,8 +52,9 @@ use zenith_utils::{
zid::{ZTenantId, ZTimelineId},
};
use crate::config::PageServerConf;
use crate::layered_repository::writeback_ephemeral_file;
use crate::{config::PageServerConf, relish::RelTag};
use crate::repository::Key;
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 10;
@@ -108,8 +109,7 @@ enum CacheKey {
struct MaterializedPageHashKey {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
key: Key,
}
#[derive(Clone)]
@@ -294,16 +294,14 @@ impl PageCache {
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
key: &Key,
lsn: Lsn,
) -> Option<(Lsn, PageReadGuard)> {
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
rel_tag,
blknum,
key: *key,
},
lsn,
};
@@ -326,8 +324,7 @@ impl PageCache {
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
key: Key,
lsn: Lsn,
img: &[u8],
) {
@@ -335,8 +332,7 @@ impl PageCache {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
rel_tag,
blknum,
key,
},
lsn,
};

View File

@@ -32,7 +32,9 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::basebackup;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::relish::*;
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::tenant_mgr;
use crate::thread_mgr;
@@ -298,7 +300,7 @@ lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
"pageserver_smgr_query_time",
"Time spent on smgr query handling",
&["smgr_query_type"],
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()
)
.expect("failed to define a metric");
@@ -340,20 +342,22 @@ impl PageServerHandler {
};
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
let tenant_id = tenantid.to_string();
let timeline_id = timelineid.to_string();
let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_nblocks_request(timeline.as_ref(), &req)
}),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
}),
@@ -395,8 +399,8 @@ impl PageServerHandler {
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
fn wait_or_get_last_lsn(
timeline: &dyn Timeline,
fn wait_or_get_last_lsn<R: Repository>(
timeline: &DatadirTimeline<R>,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
@@ -423,7 +427,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline.wait_lsn(lsn)?;
timeline.tline.wait_lsn(lsn)?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -433,7 +437,7 @@ impl PageServerHandler {
if lsn == Lsn(0) {
bail!("invalid LSN(0) in request");
}
timeline.wait_lsn(lsn)?;
timeline.tline.wait_lsn(lsn)?;
}
ensure!(
lsn >= **latest_gc_cutoff_lsn,
@@ -443,54 +447,47 @@ impl PageServerHandler {
Ok(lsn)
}
fn handle_get_rel_exists_request(
fn handle_get_rel_exists_request<R: Repository>(
&self,
timeline: &dyn Timeline,
timeline: &DatadirTimeline<R>,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let exists = timeline.get_rel_exists(tag, lsn)?;
let exists = timeline.get_rel_exists(req.rel, lsn)?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
exists,
}))
}
fn handle_get_nblocks_request(
fn handle_get_nblocks_request<R: Repository>(
&self,
timeline: &dyn Timeline,
timeline: &DatadirTimeline<R>,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let n_blocks = timeline.get_relish_size(tag, lsn)?;
// Return 0 if relation is not found.
// This is what postgres smgr expects.
let n_blocks = n_blocks.unwrap_or(0);
let n_blocks = timeline.get_rel_size(req.rel, lsn)?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
}))
}
fn handle_get_page_at_lsn_request(
fn handle_get_page_at_lsn_request<R: Repository>(
&self,
timeline: &dyn Timeline,
timeline: &DatadirTimeline<R>,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
/*
// Add a 1s delay to some requests. The delay causes the requests to
@@ -500,7 +497,7 @@ impl PageServerHandler {
std::thread::sleep(std::time::Duration::from_millis(1000));
}
*/
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -520,7 +517,7 @@ impl PageServerHandler {
// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Cannot handle basebackup request for a remote timeline")?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
@@ -699,67 +696,19 @@ impl postgres_backend::Handler for PageServerHandler {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layers_total"),
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
RowDescriptor::int8_col(b"layers_needed_by_branches"),
RowDescriptor::int8_col(b"layers_not_updated"),
RowDescriptor::int8_col(b"layers_removed"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.layers_total.to_string().as_bytes()),
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
Some(result.layers_needed_by_branches.to_string().as_bytes()),
Some(result.layers_not_updated.to_string().as_bytes()),
Some(result.layers_removed.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -779,7 +728,14 @@ impl postgres_backend::Handler for PageServerHandler {
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Failed to fetch local timeline for checkpoint request")?;
timeline.checkpoint(CheckpointConfig::Forced)?;
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
// Also compact it.
//
// FIXME: This probably shouldn't be part of a "checkpoint" command, but a
// separate operation. Update the tests if you change this.
timeline.tline.compact()?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {

File diff suppressed because it is too large.


@@ -1,133 +1,9 @@
//!
//! Zenith stores PostgreSQL relations, and some other files, in the
//! repository. The relations (i.e. tables and indexes) take up most
//! of the space in a typical installation, while the other files are
//! small. We call each relation and other file that is stored in the
//! repository a "relish". It comes from "rel"-ish, as in "kind of a
//! rel", because it covers relations as well as other things that are
//! not relations, but are treated similarly for the purposes of the
//! storage layer.
//!
//! This source file contains the definition of the RelishTag struct,
//! which uniquely identifies a relish.
//!
//! Relishes come in two flavors: blocky and non-blocky. Relations and
//! SLRUs are blocky, that is, they are divided into 8k blocks, and
//! the repository tracks their size. Other relishes are non-blocky:
//! the content of the whole relish is stored as one blob. Block
//! number must be passed as 0 for all operations on a non-blocky
//! relish. The one "block" that you store in a non-blocky relish can
//! have arbitrary size, but it is expected to be small; storing large
//! blobs there will cause performance issues.
//!
//! All relishes are versioned by LSN in the repository.
//!
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::fmt;
use postgres_ffi::relfile_utils::forknumber_to_name;
use postgres_ffi::{Oid, TransactionId};
///
/// RelishTag identifies one relish.
///
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum RelishTag {
// Relations correspond to PostgreSQL relation forks. Each
// PostgreSQL relation fork is considered a separate relish.
Relation(RelTag),
// SLRUs include pg_clog, pg_multixact/members, and
// pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
// they don't need to be stored permanently (e.g. pg_subtrans),
// or we do not support them in zenith yet (pg_commit_ts).
//
// These are currently never requested directly by the compute
// nodes, although in principle that would be possible. However,
// when a new compute node is created, these are included in the
// tarball that we send to the compute node to initialize the
// PostgreSQL data directory.
//
// Each SLRU segment in PostgreSQL is considered a separate
// relish. For example, pg_clog/0000, pg_clog/0001, and so forth.
//
// SLRU segments are divided into blocks, like relations.
Slru { slru: SlruKind, segno: u32 },
// Miscellaneous other files that need to be included in the
// tarball at compute node creation. These are non-blocky, and are
// expected to be small.
//
// FileNodeMap represents PostgreSQL's 'pg_filenode.map'
// files. They are needed to map catalog table OIDs to filenode
// numbers. Usually the mapping is done by looking up a relation's
// 'relfilenode' field in the 'pg_class' system table, but that
// doesn't work for 'pg_class' itself and a few other such system
// relations. See PostgreSQL relmapper.c for details.
//
// Each database has a map file for its local mapped catalogs,
// and there is a separate map file for shared catalogs.
//
// These files are always 512 bytes long (although we don't check
// or care about that in the page server).
//
FileNodeMap { spcnode: Oid, dbnode: Oid },
//
// State files for prepared transactions (e.g pg_twophase/1234)
//
TwoPhase { xid: TransactionId },
// The control file, stored in global/pg_control
ControlFile,
// Special entry that represents PostgreSQL checkpoint. It doesn't
// correspond to any physical file in PostgreSQL, but we use it
// to track fields needed to restore the checkpoint data in the
// control file, when a compute node is created.
Checkpoint,
}
impl RelishTag {
pub const fn is_blocky(&self) -> bool {
match self {
// These relishes work with blocks
RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true,
// and these don't
RelishTag::FileNodeMap {
spcnode: _,
dbnode: _,
}
| RelishTag::TwoPhase { xid: _ }
| RelishTag::ControlFile
| RelishTag::Checkpoint => false,
}
}
// Physical relishes represent files and use
// RelationSizeEntry to track existing and dropped files.
// They can be either blocky or non-blocky.
pub const fn is_physical(&self) -> bool {
match self {
// These relishes represent physical files
RelishTag::Relation(_)
| RelishTag::Slru { .. }
| RelishTag::FileNodeMap { .. }
| RelishTag::TwoPhase { .. } => true,
// and these don't
RelishTag::ControlFile | RelishTag::Checkpoint => false,
}
}
// convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
matches!(self, RelishTag::Relation(_))
}
}
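Illustration only (nothing like this is added by the commit): a sketch of how a caller would honor the blocky/non-blocky contract described in the module comment above, with `read_relish` being a hypothetical helper.

fn read_relish(tag: RelishTag, blkno: u32) -> anyhow::Result<bytes::Bytes> {
    // Non-blocky relishes store their entire content as the single "block" 0.
    if !tag.is_blocky() {
        assert_eq!(blkno, 0, "non-blocky relish addressed with blkno {}", blkno);
    }
    // ... look up the latest version of (tag, blkno) in the repository ...
    todo!()
}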
use postgres_ffi::Oid;
///
/// Relation data file segment id throughout the Postgres cluster.
@@ -144,7 +20,10 @@ impl RelishTag {
/// are used for the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
// Then we could replace the custom Ord and PartialOrd implementations below with
// deriving them.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
pub struct RelTag {
pub forknum: u8,
pub spcnode: Oid,
@@ -152,6 +31,34 @@ pub struct RelTag {
pub relnode: Oid,
}
impl PartialOrd for RelTag {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for RelTag {
fn cmp(&self, other: &Self) -> Ordering {
self.spcnode
    .cmp(&other.spcnode)
    .then_with(|| self.dbnode.cmp(&other.dbnode))
    .then_with(|| self.relnode.cmp(&other.relnode))
    .then_with(|| self.forknum.cmp(&other.forknum))
}
}
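A quick check of the ordering defined above (illustrative, not part of this commit): forks of the same relation sort next to each other, and forknum only breaks ties.

#[test]
fn reltag_orders_by_rel_then_fork() {
    let main_fork = RelTag { forknum: 0, spcnode: 1663, dbnode: 13008, relnode: 16384 };
    let vm_fork = RelTag { forknum: 2, spcnode: 1663, dbnode: 13008, relnode: 16384 };
    // spcnode, dbnode and relnode compare equal, so forknum decides.
    assert!(main_fork < vm_fork);
}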
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
///
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
@@ -170,34 +77,6 @@ impl fmt::Display for RelTag {
}
}
/// Display RelishTag in the same format that's used in most PostgreSQL debug messages:
///
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
///
impl fmt::Display for RelishTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
RelishTag::Relation(rel) => rel.fmt(f),
RelishTag::Slru { slru, segno } => {
// e.g. pg_clog/0001
write!(f, "{}/{:04X}", slru.to_str(), segno)
}
RelishTag::FileNodeMap { spcnode, dbnode } => {
write!(f, "relmapper file for spc {} db {}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => {
write!(f, "pg_twophase/{:08X}", xid)
}
RelishTag::ControlFile => {
write!(f, "control file")
}
RelishTag::Checkpoint => {
write!(f, "checkpoint")
}
}
}
}
///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,


@@ -641,7 +641,7 @@ mod fs_tests {
}
async fn upload_dummy_file(
harness: &RepoHarness,
harness: &RepoHarness<'_>,
storage: &LocalFs,
name: &str,
) -> anyhow::Result<PathBuf> {


@@ -849,7 +849,7 @@ mod test_utils {
#[track_caller]
pub async fn ensure_correct_timeline_upload(
harness: &RepoHarness,
harness: &RepoHarness<'_>,
remote_assets: Arc<(LocalFs, RwLock<RemoteTimelineIndex>)>,
timeline_id: ZTimelineId,
new_upload: NewCheckpoint,

File diff suppressed because it is too large.


@@ -3,12 +3,13 @@
use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline, TimelineSyncState};
use crate::repository::Repository;
use crate::repository::TimelineSyncState;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::timelines;
use crate::walredo::PostgresRedoManager;
use crate::CheckpointConfig;
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use log::*;
@@ -24,7 +25,9 @@ lazy_static! {
struct Tenant {
state: TenantState,
repo: Arc<dyn Repository>,
repo: Arc<RepositoryImpl>,
timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -78,15 +81,17 @@ pub fn set_timeline_states(
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
let repo = LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
conf.remote_storage_config.is_some(),
));
);
Tenant {
state: TenantState::Idle,
repo,
repo: Arc::new(repo),
timelines: HashMap::new(),
}
});
if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
@@ -146,7 +151,7 @@ pub fn shutdown_all_tenants() {
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
// Ok, no background threads running anymore. Flush any remaining data in
// memory to disk.
@@ -160,7 +165,7 @@ pub fn shutdown_all_tenants() {
debug!("shutdown tenant {}", tenantid);
match get_repository_for_tenant(tenantid) {
Ok(repo) => {
if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) {
if let Err(err) = repo.checkpoint() {
error!(
"Could not checkpoint tenant {} during shutdown: {:?}",
tenantid, err
@@ -190,6 +195,7 @@ pub fn create_tenant_repository(
.or_insert_with(|| Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
Ok(Some(new_tenant_id))
}
@@ -205,7 +211,7 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
}
///
/// Change the state of a tenant to Active and launch its checkpointer and GC
/// Change the state of a tenant to Active and launch its compactor and GC
/// threads. If the tenant was already in Active state or Stopping, does nothing.
///
pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> {
@@ -220,18 +226,18 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re
// If the tenant is already active, nothing to do.
TenantState::Active => {}
// If it's Idle, launch the checkpointer and GC threads
// If it's Idle, launch the compactor and GC threads
TenantState::Idle => {
thread_mgr::spawn(
ThreadKind::Checkpointer,
ThreadKind::Compactor,
Some(tenantid),
None,
"Checkpointer thread",
move || crate::tenant_threads::checkpoint_loop(tenantid, conf),
"Compactor thread",
move || crate::tenant_threads::compact_loop(tenantid, conf),
)?;
// FIXME: if we fail to launch the GC thread, but already launched the
// checkpointer, we're in a strange state.
// compactor, we're in a strange state.
thread_mgr::spawn(
ThreadKind::GarbageCollector,
@@ -251,7 +257,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re
Ok(())
}
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryImpl>> {
let m = access_tenants();
let tenant = m
.get(&tenantid)
@@ -263,11 +269,28 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Reposito
pub fn get_timeline_for_tenant(
tenantid: ZTenantId,
timelineid: ZTimelineId,
) -> Result<Arc<dyn Timeline>> {
get_repository_for_tenant(tenantid)?
) -> Result<Arc<DatadirTimelineImpl>> {
let mut m = access_tenants();
let tenant = m
.get_mut(&tenantid)
.with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
return Ok(Arc::clone(page_tline));
}
// First access to this timeline. Create a DatadirTimeline wrapper for it
let tline = tenant
.repo
.get_timeline(timelineid)?
.local_timeline()
.with_context(|| format!("cannot fetch timeline {}", timelineid))
.with_context(|| format!("cannot fetch timeline {}", timelineid))?;
let repartition_distance = tenant.repo.conf.checkpoint_distance / 10;
let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance));
page_tline.init_logical_size()?;
tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
Ok(page_tline)
}
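Usage sketch (hypothetical IDs, assuming `Arc` and `Result` are in scope as elsewhere in this module): repeated lookups for the same timeline return clones of one cached Arc.

fn timeline_lookup_is_cached(tenantid: ZTenantId, timelineid: ZTimelineId) -> Result<()> {
    let a = get_timeline_for_tenant(tenantid, timelineid)?;
    let b = get_timeline_for_tenant(tenantid, timelineid)?;
    // The second call hits the `tenant.timelines` cache and only clones the
    // Arc; the DatadirTimeline wrapper and init_logical_size() run once.
    assert!(Arc::ptr_eq(&a, &b));
    Ok(())
}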
#[derive(Serialize, Deserialize, Clone)]


@@ -1,34 +1,42 @@
//! This module contains functions to serve per-tenant background processes,
//! such as checkpointer and GC
//! such as compaction and GC
use crate::config::PageServerConf;
use crate::repository::Repository;
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use crate::CheckpointConfig;
use anyhow::Result;
use std::time::Duration;
use tracing::*;
use zenith_utils::zid::ZTenantId;
///
/// Checkpointer thread's main loop
/// Compaction thread's main loop
///
pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
if let Err(err) = compact_loop_ext(tenantid, conf) {
error!("compact loop terminated with error: {:?}", err);
Err(err)
} else {
Ok(())
}
}
fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;
}
std::thread::sleep(conf.checkpoint_period);
trace!("checkpointer thread for tenant {} waking up", tenantid);
std::thread::sleep(conf.compaction_period);
trace!("compaction thread for tenant {} waking up", tenantid);
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
// Compact timelines
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
repo.compaction_iteration()?;
}
trace!(
"checkpointer thread stopped for tenant {} state is {:?}",
"compaction thread stopped for tenant {} state is {:?}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);


@@ -92,12 +92,15 @@ pub enum ThreadKind {
// Thread that connects to a safekeeper to fetch WAL for one timeline.
WalReceiver,
// Thread that handles checkpointing of all timelines for a tenant.
Checkpointer,
// Thread that handles compaction of all timelines for a tenant.
Compactor,
// Thread that handles GC of a tenant
GarbageCollector,
// Thread that flushes frozen in-memory layers to disk
LayerFlushThread,
// Thread for synchronizing pageserver relish data with the remote storage.
// Shared by all tenants.
StorageSync,


@@ -16,6 +16,8 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::DatadirTimeline;
use crate::RepositoryImpl;
use crate::{config::PageServerConf, repository::Repository};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
@@ -43,12 +45,13 @@ pub enum TimelineInfo {
}
impl TimelineInfo {
pub fn from_repo_timeline(
pub fn from_ids(
tenant_id: ZTenantId,
repo_timeline: RepositoryTimeline,
timeline_id: ZTimelineId,
include_non_incremental_logical_size: bool,
) -> Self {
match repo_timeline {
) -> Result<Self> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let result = match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local { id, timeline } => {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
@@ -57,6 +60,13 @@ impl TimelineInfo {
None
};
let tline = tenant_mgr::get_timeline_for_tenant(tenant_id, timeline_id)?;
let current_logical_size = tline.get_current_logical_size();
let current_logical_size_non_incremental = get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
tline.as_ref(),
);
Self::Local {
timeline_id: id,
tenant_id,
@@ -65,11 +75,8 @@ impl TimelineInfo {
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline.as_ref(),
),
current_logical_size,
current_logical_size_non_incremental,
}
}
RepositoryTimeline::Remote {
@@ -80,36 +87,8 @@ impl TimelineInfo {
tenant_id,
disk_consistent_lsn,
},
}
}
pub fn from_dyn_timeline(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
timeline: &dyn Timeline,
include_non_incremental_logical_size: bool,
) -> Self {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
Self::Local {
timeline_id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline,
),
}
Ok(result)
}
pub fn timeline_id(&self) -> ZTimelineId {
@@ -127,9 +106,9 @@ impl TimelineInfo {
}
}
fn get_current_logical_size_non_incremental(
fn get_current_logical_size_non_incremental<R: Repository>(
include_non_incremental_logical_size: bool,
timeline: &dyn Timeline,
timeline: &DatadirTimeline<R>,
) -> Option<usize> {
if !include_non_incremental_logical_size {
return None;
@@ -193,7 +172,7 @@ pub fn create_repo(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Option<Arc<dyn Repository>>> {
) -> Result<Option<Arc<RepositoryImpl>>> {
let repo_dir = conf.tenant_path(&tenant_id);
if repo_dir.exists() {
debug!("repo for {} already exists", tenant_id);
@@ -259,12 +238,12 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
// - run initdb to init temporary instance and get bootstrap data
// - after initialization completes, remove the temp dir.
//
fn bootstrap_timeline(
fn bootstrap_timeline<R: Repository>(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<Arc<dyn Timeline>> {
repo: &R,
) -> Result<()> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
@@ -280,23 +259,20 @@ fn bootstrap_timeline(
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
tli,
timeline.get_last_record_lsn()
page_tline.tline.get_last_record_lsn()
);
// Remove temp dir. We don't need it anymore
fs::remove_dir_all(pgdata_path)?;
Ok(timeline)
Ok(())
}
pub(crate) fn get_timelines(
@@ -306,23 +282,26 @@ pub(crate) fn get_timelines(
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
Ok(repo
let mut result = Vec::new();
for timeline in repo
.list_timelines()
.with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?
.into_iter()
.filter_map(|timeline| match timeline {
RepositoryTimeline::Local { timeline, id } => Some((id, timeline)),
RepositoryTimeline::Remote { .. } => None,
})
.map(|(timeline_id, timeline)| {
TimelineInfo::from_dyn_timeline(
tenant_id,
timeline_id,
timeline.as_ref(),
include_non_incremental_logical_size,
)
})
.collect())
{
match timeline {
RepositoryTimeline::Local {
timeline: _,
id: timeline_id,
} => {
result.push(TimelineInfo::from_ids(
tenant_id,
timeline_id,
include_non_incremental_logical_size,
)?);
}
RepositoryTimeline::Remote { .. } => continue,
}
}
Ok(result)
}
pub(crate) fn create_timeline(
@@ -350,7 +329,7 @@ pub(crate) fn create_timeline(
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
let new_timeline_info = match ancestor_timeline_id {
match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline(ancestor_timeline_id)
@@ -390,19 +369,13 @@ pub(crate) fn create_timeline(
);
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
// load the timeline into memory
let loaded_timeline = repo.get_timeline(new_timeline_id)?;
TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false)
}
None => {
let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
TimelineInfo::from_dyn_timeline(
tenant_id,
new_timeline_id,
new_timeline.as_ref(),
false,
)
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
}
};
}
let new_timeline_info = TimelineInfo::from_ids(tenant_id, new_timeline_id, false)?;
Ok(Some(new_timeline_info))
}

File diff suppressed because it is too large.


@@ -6,6 +6,7 @@
//! We keep one WAL receiver active per timeline.
use crate::config::PageServerConf;
use crate::repository::{Repository, Timeline};
use crate::tenant_mgr;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
@@ -250,11 +251,10 @@ fn walreceiver_main(
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hittind a deadlock.
// at risk of hitting a deadlock.
assert!(lsn.is_aligned());
let writer = timeline.writer();
walingest.ingest_record(writer.as_ref(), recdata, lsn)?;
walingest.ingest_record(&timeline, recdata, lsn)?;
fail_point!("walreceiver-after-ingest");
@@ -266,6 +266,8 @@ fn walreceiver_main(
caught_up = true;
}
timeline.tline.check_checkpoint_distance()?;
Some(endlsn)
}
@@ -301,7 +303,7 @@ fn walreceiver_main(
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let write_lsn = u64::from(last_lsn);
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn());
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn);


@@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti
use serde::{Deserialize, Serialize};
use tracing::*;
use crate::repository::ZenithWalRecord;
/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum ZenithWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted { xids: Vec<TransactionId> },
/// Mark transaction IDs as aborted on a CLOG page
ClogSetAborted { xids: Vec<TransactionId> },
/// Extend multixact offsets SLRU
MultixactOffsetCreate {
mid: MultiXactId,
moff: MultiXactOffset,
},
/// Extend multixact members SLRU.
MultixactMembersCreate {
moff: MultiXactOffset,
members: Vec<MultiXactMember>,
},
}
impl ZenithWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
match self {
ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init,
// None of the special zenith record types currently initialize the page
_ => false,
}
}
}
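Illustrative sketch (not from this commit) of how `will_init` is typically consulted when assembling a redo request: a base page image is only needed if the first record does not initialize the page from scratch.

// Assumes zenith_utils::lsn::Lsn is in scope, as in the rest of the crate.
fn needs_base_image(records: &[(Lsn, ZenithWalRecord)]) -> bool {
    match records.first() {
        Some((_lsn, rec)) => !rec.will_init(),
        None => true,
    }
}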
/// DecodedBkpBlock represents per-page data contained in a WAL record.
#[derive(Default)]
@@ -87,6 +127,28 @@ impl XlRelmapUpdate {
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrCreate {
pub rnode: RelFileNode,
// FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
// well-defined size?
pub forknum: u8,
}
impl XlSmgrCreate {
pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
XlSmgrCreate {
rnode: RelFileNode {
spcnode: buf.get_u32_le(), /* tablespace */
dbnode: buf.get_u32_le(), /* database */
relnode: buf.get_u32_le(), /* relation */
},
forknum: buf.get_u32_le() as u8,
}
}
}
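A decoding sketch with hypothetical little-endian payload bytes, matching the field order read above (spcnode, dbnode, relnode, forknum):

#[test]
fn decode_smgr_create() {
    let mut buf = bytes::Bytes::from_static(&[
        0x7f, 0x06, 0x00, 0x00, // spcnode = 1663
        0xd0, 0x32, 0x00, 0x00, // dbnode  = 13008
        0x00, 0x40, 0x00, 0x00, // relnode = 16384
        0x00, 0x00, 0x00, 0x00, // forknum = 0 (main fork)
    ]);
    let rec = XlSmgrCreate::decode(&mut buf);
    assert_eq!(rec.rnode.relnode, 16384);
    assert_eq!(rec.forknum, 0);
}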
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrTruncate {


@@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock;
use zenith_utils::zid::ZTenantId;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::relish::*;
use crate::repository::ZenithWalRecord;
use crate::repository::Key;
use crate::walrecord::ZenithWalRecord;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
@@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync {
/// the records.
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
@@ -92,8 +93,7 @@ pub struct DummyRedoManager {}
impl crate::walredo::WalRedoManager for DummyRedoManager {
fn request_redo(
&self,
_rel: RelishTag,
_blknum: u32,
_key: Key,
_lsn: Lsn,
_base_img: Option<Bytes>,
_records: Vec<(Lsn, ZenithWalRecord)>,
@@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool {
}
}
fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool {
if let RelishTag::Relation(RelTag {
forknum,
spcnode: _,
dbnode: _,
relnode: _,
}) = rel
{
*forknum == expected_forknum
} else {
false
}
}
fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool {
if let RelishTag::Slru { slru, segno } = rel {
*slru == expected_slru && *segno == expected_segno
} else {
false
}
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -184,6 +162,8 @@ pub enum WalRedoError {
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
#[error("cannot perform WAL redo for this record")]
InvalidRecord,
}
///
@@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
///
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
@@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager {
if rec_zenith != batch_zenith {
let result = if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
self.apply_batch_zenith(key, lsn, img, &records[batch_start..i])
} else {
self.apply_batch_postgres(
rel,
blknum,
key,
lsn,
img,
&records[batch_start..i],
@@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager {
}
// last batch
if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
self.apply_batch_zenith(key, lsn, img, &records[batch_start..])
} else {
self.apply_batch_postgres(
rel,
blknum,
key,
lsn,
img,
&records[batch_start..],
@@ -268,16 +245,15 @@ impl PostgresRedoManager {
///
fn apply_batch_postgres(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
wal_redo_timeout: Duration,
) -> Result<Bytes, WalRedoError> {
let start_time = Instant::now();
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
let apply_result: Result<Bytes, Error>;
let start_time = Instant::now();
let mut process_guard = self.process.lock().unwrap();
let lock_time = Instant::now();
@@ -291,16 +267,11 @@ impl PostgresRedoManager {
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
let result = if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);
apply_result.map_err(WalRedoError::IoError)
} else {
error!("unexpected non-relation relish: {:?}", rel);
Err(WalRedoError::InvalidRequest)
};
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
let result = process
.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout)
.map_err(WalRedoError::IoError);
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
@@ -326,8 +297,7 @@ impl PostgresRedoManager {
///
fn apply_batch_zenith(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
@@ -346,7 +316,7 @@ impl PostgresRedoManager {
// Apply all the WAL records in the batch
for (record_lsn, record) in records.iter() {
self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?;
self.apply_record_zenith(key, &mut page, *record_lsn, record)?;
}
// Success!
let end_time = Instant::now();
@@ -365,8 +335,7 @@ impl PostgresRedoManager {
fn apply_record_zenith(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
page: &mut BytesMut,
_record_lsn: Lsn,
record: &ZenithWalRecord,
@@ -382,9 +351,10 @@ impl PostgresRedoManager {
flags,
} => {
// sanity check that this is modifying the correct relish
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert!(
check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
"ClearVisibilityMapFlags record on unexpected rel {:?}",
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
"ClearVisibilityMapFlags record on unexpected rel {}",
rel
);
if let Some(heap_blkno) = *new_heap_blkno {
@@ -418,6 +388,14 @@ impl PostgresRedoManager {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
ZenithWalRecord::ClogSetCommitted { xids } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetCommitted record with unexpected key {}",
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -425,12 +403,17 @@ impl PostgresRedoManager {
// Check that we're modifying the correct CLOG block.
assert!(
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
segno == expected_segno,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
rel
key
);
assert!(
blknum == expected_blknum,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
key
);
assert!(blknum == expected_blknum);
transaction_id_set_status(
xid,
@@ -440,6 +423,14 @@ impl PostgresRedoManager {
}
}
ZenithWalRecord::ClogSetAborted { xids } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetAborted record with unexpected key {}",
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -447,17 +438,30 @@ impl PostgresRedoManager {
// Check that we're modifying the correct CLOG block.
assert!(
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
segno == expected_segno,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
rel
key
);
assert!(
blknum == expected_blknum,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
key
);
assert!(blknum == expected_blknum);
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
}
}
ZenithWalRecord::MultixactOffsetCreate { mid, moff } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::MultiXactOffsets,
"MultixactOffsetCreate record with unexpected key {}",
key
);
// Compute the block and offset to modify.
// See RecordNewMultiXact in PostgreSQL sources.
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
@@ -468,16 +472,29 @@ impl PostgresRedoManager {
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno),
"MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}",
segno == expected_segno,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
rel
key
);
assert!(
blknum == expected_blknum,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
key
);
assert!(blknum == expected_blknum);
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
}
ZenithWalRecord::MultixactMembersCreate { moff, members } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::MultiXactMembers,
"MultixactMembersCreate record with unexpected key {}",
key
);
for (i, member) in members.iter().enumerate() {
let offset = moff + i as u32;
@@ -492,12 +509,17 @@ impl PostgresRedoManager {
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno),
"MultiXactMembersCreate record at offset {} with unexpected rel {:?}",
segno == expected_segno,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
rel
key
);
assert!(
blknum == expected_blknum,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
key
);
assert!(blknum == expected_blknum);
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);


@@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2;
pub const INIT_FORKNUM: u8 = 3;
// From storage_xlog.h
pub const XLOG_SMGR_CREATE: u8 = 0x10;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
@@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
// From pg_control.h and rmgrlist.h
pub const XLOG_NEXTOID: u8 = 0x30;
pub const XLOG_SWITCH: u8 = 0x40;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
pub const XLOG_FPI: u8 = 0xB0;
pub const DB_SHUTDOWNED: u32 = 1;


@@ -132,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz {
}
}
/// Return the offset of the last valid record in segment `segno`, starting
/// the scan at `start_offset`. Returns `start_offset` if no records are found.
fn find_end_of_wal_segment(
data_dir: &Path,
segno: XLogSegNo,
@@ -147,7 +149,7 @@ fn find_end_of_wal_segment(
let mut rec_offs: usize = 0;
let mut buf = [0u8; XLOG_BLCKSZ];
let file_name = XLogFileName(tli, segno, wal_seg_size);
let mut last_valid_rec_pos: usize = 0;
let mut last_valid_rec_pos: usize = start_offset; // assume a new record begins at start_offset
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
file.seek(SeekFrom::Start(offs as u64))?;
let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];


@@ -7,6 +7,7 @@ edition = "2021"
anyhow = "1.0"
bytes = { version = "1.0.1", features = ['serde'] }
clap = "3.0"
fail = "0.5.0"
futures = "0.3.13"
hashbrown = "0.11.2"
hex = "0.4.3"
@@ -21,6 +22,7 @@ rustls = "0.19.1"
scopeguard = "1.1.0"
serde = "1"
serde_json = "1"
thiserror = "1.0"
tokio = { version = "1.11", features = ["macros"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
tokio-rustls = "0.22.0"


@@ -1,11 +1,79 @@
use crate::compute::DatabaseInfo;
use crate::config::ProxyConfig;
use crate::cplane_api::{self, CPlaneApi};
use crate::error::UserFacingError;
use crate::stream::PqStream;
use anyhow::{anyhow, bail, Context};
use crate::waiters;
use std::collections::HashMap;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe};
use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
/// Common authentication error.
#[derive(Debug, Error)]
pub enum AuthErrorImpl {
/// Authentication error reported by the console.
#[error(transparent)]
Console(#[from] cplane_api::AuthError),
/// For passwords that couldn't be processed by [`parse_password`].
#[error("Malformed password message")]
MalformedPassword,
/// Errors produced by [`PqStream`].
#[error(transparent)]
Io(#[from] std::io::Error),
}
impl AuthErrorImpl {
pub fn auth_failed(msg: impl Into<String>) -> Self {
AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg))
}
}
impl From<waiters::RegisterError> for AuthErrorImpl {
fn from(e: waiters::RegisterError) -> Self {
AuthErrorImpl::Console(cplane_api::AuthError::from(e))
}
}
impl From<waiters::WaitError> for AuthErrorImpl {
fn from(e: waiters::WaitError) -> Self {
AuthErrorImpl::Console(cplane_api::AuthError::from(e))
}
}
#[derive(Debug, Error)]
#[error(transparent)]
pub struct AuthError(Box<AuthErrorImpl>);
impl<T> From<T> for AuthError
where
AuthErrorImpl: From<T>,
{
fn from(e: T) -> Self {
AuthError(Box::new(e.into()))
}
}
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
use AuthErrorImpl::*;
match self.0.as_ref() {
Console(e) => e.to_string_client(),
MalformedPassword => self.to_string(),
_ => "Internal error".to_string(),
}
}
}
#[derive(Debug, Error)]
pub enum ClientCredsParseError {
#[error("Parameter `{0}` is missing in startup packet")]
MissingKey(&'static str),
}
impl UserFacingError for ClientCredsParseError {}
/// Various client credentials which we use for authentication.
#[derive(Debug, PartialEq, Eq)]
@@ -15,13 +83,13 @@ pub struct ClientCredentials {
}
impl TryFrom<HashMap<String, String>> for ClientCredentials {
type Error = anyhow::Error;
type Error = ClientCredsParseError;
fn try_from(mut value: HashMap<String, String>) -> Result<Self, Self::Error> {
let mut get_param = |key| {
value
.remove(key)
.with_context(|| format!("{} is missing in startup packet", key))
.ok_or(ClientCredsParseError::MissingKey(key))
};
let user = get_param("user")?;
@@ -37,10 +105,14 @@ impl ClientCredentials {
self,
config: &ProxyConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> anyhow::Result<DatabaseInfo> {
) -> Result<DatabaseInfo, AuthError> {
fail::fail_point!("proxy-authenticate", |_| {
Err(AuthError::auth_failed("failpoint triggered"))
});
use crate::config::ClientAuthMethod::*;
use crate::config::RouterConfig::*;
let db_info = match &config.router_config {
match &config.router_config {
Static { host, port } => handle_static(host.clone(), *port, client, self).await,
Dynamic(Mixed) => {
if self.user.ends_with("@zenith") {
@@ -51,9 +123,7 @@ impl ClientCredentials {
}
Dynamic(Password) => handle_existing_user(config, client, self).await,
Dynamic(Link) => handle_new_user(config, client).await,
};
db_info.context("failed to authenticate client")
}
}
}
@@ -66,18 +136,14 @@ async fn handle_static(
port: u16,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
creds: ClientCredentials,
) -> anyhow::Result<DatabaseInfo> {
) -> Result<DatabaseInfo, AuthError> {
client
.write_message(&Be::AuthenticationCleartextPassword)
.await?;
// Read client's password bytes
let msg = match client.read_message().await? {
Fe::PasswordMessage(msg) => msg,
bad => bail!("unexpected message type: {:?}", bad),
};
let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap();
let msg = client.read_password_message().await?;
let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
let db_info = DatabaseInfo {
host,
@@ -98,7 +164,7 @@ async fn handle_existing_user(
config: &ProxyConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
creds: ClientCredentials,
) -> anyhow::Result<DatabaseInfo> {
) -> Result<DatabaseInfo, AuthError> {
let psql_session_id = new_psql_session_id();
let md5_salt = rand::random();
@@ -107,18 +173,12 @@ async fn handle_existing_user(
.await?;
// Read client's password hash
let msg = match client.read_message().await? {
Fe::PasswordMessage(msg) => msg,
bad => bail!("unexpected message type: {:?}", bad),
};
let msg = client.read_password_message().await?;
let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
let (_trailing_null, md5_response) = msg
.split_last()
.ok_or_else(|| anyhow!("unexpected password message"))?;
let cplane = CPlaneApi::new(&config.auth_endpoint);
let cplane = CPlaneApi::new(config.auth_endpoint.clone());
let db_info = cplane
.authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id)
.authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id)
.await?;
client
@@ -131,7 +191,7 @@ async fn handle_existing_user(
async fn handle_new_user(
config: &ProxyConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> anyhow::Result<DatabaseInfo> {
) -> Result<DatabaseInfo, AuthError> {
let psql_session_id = new_psql_session_id();
let greeting = hello_message(&config.redirect_uri, &psql_session_id);
@@ -140,19 +200,23 @@ async fn handle_new_user(
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message(&Be::NoticeResponse(greeting))
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Wait for web console response
waiter.await?.map_err(|e| anyhow!(e))
// Wait for web console response (see `mgmt`)
waiter.await?.map_err(AuthErrorImpl::auth_failed)
})
.await?;
client.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
Ok(db_info)
}
fn parse_password(bytes: &[u8]) -> Option<&str> {
std::str::from_utf8(bytes).ok()?.strip_suffix('\0')
}
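A sanity sketch for `parse_password` above (illustrative test, not part of the commit): the libpq password message is a NUL-terminated string, and anything else is rejected.

#[test]
fn parse_password_requires_nul_terminator() {
    assert_eq!(parse_password(b"secret\0"), Some("secret"));
    assert_eq!(parse_password(b"secret"), None); // missing terminator
}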
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
format!(
concat![


@@ -6,7 +6,7 @@ use tokio::net::TcpStream;
use tokio_postgres::{CancelToken, NoTls};
use zenith_utils::pq_proto::CancelKeyData;
/// Enables serving CancelRequests.
/// Enables serving `CancelRequest`s.
#[derive(Default)]
pub struct CancelMap(Mutex<HashMap<CancelKeyData, Option<CancelClosure>>>);


@@ -1,6 +1,27 @@
use anyhow::Context;
use crate::cancellation::CancelClosure;
use crate::error::UserFacingError;
use serde::{Deserialize, Serialize};
use std::net::{SocketAddr, ToSocketAddrs};
use std::io;
use std::net::SocketAddr;
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::NoTls;
#[derive(Debug, Error)]
pub enum ConnectionError {
/// This error doesn't seem to reveal any secrets; for instance,
/// [`tokio_postgres::error::Kind`] doesn't contain IP addresses and such.
#[error("Failed to connect to the compute node: {0}")]
Postgres(#[from] tokio_postgres::Error),
#[error("Failed to connect to the compute node")]
FailedToConnectToCompute,
#[error("Failed to fetch compute node version")]
FailedToFetchPgVersion,
}
impl UserFacingError for ConnectionError {}
/// Compute node connection params.
#[derive(Serialize, Deserialize, Debug, Default)]
@@ -12,14 +33,38 @@ pub struct DatabaseInfo {
pub password: Option<String>,
}
/// PostgreSQL version as [`String`].
pub type Version = String;
impl DatabaseInfo {
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
let host_port = format!("{}:{}", self.host, self.port);
host_port
.to_socket_addrs()
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
.next()
.context("cannot resolve at least one SocketAddr")
let socket = TcpStream::connect(host_port).await?;
let socket_addr = socket.peer_addr()?;
Ok((socket_addr, socket))
}
/// Connect to a corresponding compute node.
pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> {
let (socket_addr, mut socket) = self
.connect_raw()
.await
.map_err(|_| ConnectionError::FailedToConnectToCompute)?;
// TODO: establish a secure connection to the DB
let (client, conn) = tokio_postgres::Config::from(self)
.connect_raw(&mut socket, NoTls)
.await?;
let version = conn
.parameter("server_version")
.ok_or(ConnectionError::FailedToFetchPgVersion)?
.into();
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
Ok((socket, version, cancel_closure))
}
}
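Call-site sketch (illustrative, with a hypothetical `open_compute` wrapper), matching the new `connect` signature above:

async fn open_compute(db_info: DatabaseInfo) -> Result<(), ConnectionError> {
    let (_socket, version, _cancel_closure) = db_info.connect().await?;
    // `version` is whatever the compute node reported via server_version;
    // the cancel closure would normally be registered with the CancelMap.
    println!("compute node runs PostgreSQL {}", version);
    Ok(())
}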


@@ -1,4 +1,4 @@
use anyhow::{anyhow, ensure, Context};
use anyhow::{anyhow, bail, ensure, Context};
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
use std::net::SocketAddr;
use std::str::FromStr;
@@ -29,7 +29,7 @@ impl FromStr for ClientAuthMethod {
"password" => Ok(Password),
"link" => Ok(Link),
"mixed" => Ok(Mixed),
_ => Err(anyhow::anyhow!("Invlid option for router")),
_ => bail!("Invalid option for router: `{}`", s),
}
}
}
@@ -53,7 +53,7 @@ pub struct ProxyConfig {
pub redirect_uri: String,
/// control plane address where we would check auth.
pub auth_endpoint: String,
pub auth_endpoint: reqwest::Url,
pub tls_config: Option<TlsConfig>,
}


@@ -1,52 +1,113 @@
use crate::auth::ClientCredentials;
use crate::compute::DatabaseInfo;
use crate::waiters::{Waiter, Waiters};
use anyhow::{anyhow, bail};
use crate::error::UserFacingError;
use crate::mgmt;
use crate::waiters::{self, Waiter, Waiters};
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use thiserror::Error;
lazy_static! {
static ref CPLANE_WAITERS: Waiters<Result<DatabaseInfo, String>> = Default::default();
static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
}
/// Give caller an opportunity to wait for cplane's reply.
pub async fn with_waiter<F, R, T>(psql_session_id: impl Into<String>, f: F) -> anyhow::Result<T>
pub async fn with_waiter<R, T, E>(
psql_session_id: impl Into<String>,
action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R,
) -> Result<T, E>
where
F: FnOnce(Waiter<'static, Result<DatabaseInfo, String>>) -> R,
R: std::future::Future<Output = anyhow::Result<T>>,
R: std::future::Future<Output = Result<T, E>>,
E: From<waiters::RegisterError>,
{
let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
f(waiter).await
action(waiter).await
}
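Call-site sketch mirroring how the auth code drives this helper (hypothetical `wait_for_compute` wrapper; the error plumbing relies on the `AuthError` conversions defined below):

async fn wait_for_compute(psql_session_id: String) -> Result<DatabaseInfo, AuthError> {
    with_waiter(psql_session_id, |waiter| async {
        // ... ask the console to wake a compute node for this session, then
        //     block on the reply delivered through the mgmt channel ...
        Ok(waiter.await?.map_err(AuthErrorImpl::AuthFailed)?)
    })
    .await
}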
pub fn notify(psql_session_id: &str, msg: Result<DatabaseInfo, String>) -> anyhow::Result<()> {
pub fn notify(
psql_session_id: &str,
msg: Result<DatabaseInfo, String>,
) -> Result<(), waiters::NotifyError> {
CPLANE_WAITERS.notify(psql_session_id, msg)
}
/// Zenith console API wrapper.
pub struct CPlaneApi<'a> {
auth_endpoint: &'a str,
pub struct CPlaneApi {
auth_endpoint: reqwest::Url,
}
impl<'a> CPlaneApi<'a> {
pub fn new(auth_endpoint: &'a str) -> Self {
impl CPlaneApi {
pub fn new(auth_endpoint: reqwest::Url) -> Self {
Self { auth_endpoint }
}
}
impl CPlaneApi<'_> {
pub async fn authenticate_proxy_request(
#[derive(Debug, Error)]
pub enum AuthErrorImpl {
/// Authentication error reported by the console.
#[error("Authentication failed: {0}")]
AuthFailed(String),
/// HTTP status (other than 200) returned by the console.
#[error("Console responded with an HTTP status: {0}")]
HttpStatus(reqwest::StatusCode),
#[error("Console responded with a malformed JSON: {0}")]
MalformedResponse(#[from] serde_json::Error),
#[error(transparent)]
Transport(#[from] reqwest::Error),
#[error(transparent)]
WaiterRegister(#[from] waiters::RegisterError),
#[error(transparent)]
WaiterWait(#[from] waiters::WaitError),
}
#[derive(Debug, Error)]
#[error(transparent)]
pub struct AuthError(Box<AuthErrorImpl>);
impl AuthError {
/// Smart constructor for authentication error reported by `mgmt`.
pub fn auth_failed(msg: impl Into<String>) -> Self {
AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into())))
}
}
impl<T> From<T> for AuthError
where
AuthErrorImpl: From<T>,
{
fn from(e: T) -> Self {
AuthError(Box::new(e.into()))
}
}
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
use AuthErrorImpl::*;
match self.0.as_ref() {
AuthFailed(_) | HttpStatus(_) => self.to_string(),
_ => "Internal error".to_string(),
}
}
}
impl CPlaneApi {
pub async fn authenticate_proxy_client(
&self,
creds: ClientCredentials,
md5_response: &[u8],
md5_response: &str,
salt: &[u8; 4],
psql_session_id: &str,
) -> anyhow::Result<DatabaseInfo> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
) -> Result<DatabaseInfo, AuthError> {
let mut url = self.auth_endpoint.clone();
url.query_pairs_mut()
.append_pair("login", &creds.user)
.append_pair("database", &creds.dbname)
.append_pair("md5response", std::str::from_utf8(md5_response)?)
.append_pair("md5response", md5_response)
.append_pair("salt", &hex::encode(salt))
.append_pair("psql_session_id", psql_session_id);
@@ -55,18 +116,20 @@ impl CPlaneApi<'_> {
// TODO: leverage `reqwest::Client` to reuse connections
let resp = reqwest::get(url).await?;
if !resp.status().is_success() {
bail!("Auth failed: {}", resp.status())
return Err(AuthErrorImpl::HttpStatus(resp.status()).into());
}
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
println!("got auth info: #{:?}", auth_info);
use ProxyAuthResponse::*;
match auth_info {
Ready { conn_info } => Ok(conn_info),
Error { error } => bail!(error),
NotReady { .. } => waiter.await?.map_err(|e| anyhow!(e)),
}
let db_info = match auth_info {
Ready { conn_info } => conn_info,
Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()),
NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?,
};
Ok(db_info)
})
.await
}

proxy/src/error.rs (new file)

@@ -0,0 +1,17 @@
/// Marks errors that may be safely shown to a client.
/// This trait can be seen as a specialized version of [`ToString`].
///
/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
/// is way too convenient and tends to proliferate all across the codebase,
/// ultimately leading to accidental leaks of sensitive data.
pub trait UserFacingError: ToString {
/// Format the error for client, stripping all sensitive info.
///
/// Although this might be a no-op for many types, it's highly
/// recommended to override the default impl in case error type
/// contains anything sensitive: various IDs, IP addresses etc.
#[inline(always)]
fn to_string_client(&self) -> String {
self.to_string()
}
}
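A hypothetical implementor (illustration only), overriding the default so that sensitive details never reach the client:

use std::fmt;

struct DbUnreachable {
    addr: std::net::SocketAddr, // sensitive: must not leak to the client
}

impl fmt::Display for DbUnreachable {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "could not reach database at {}", self.addr)
    }
}

impl UserFacingError for DbUnreachable {
    fn to_string_client(&self) -> String {
        "could not reach the database".to_string()
    }
}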


@@ -7,7 +7,7 @@ use zenith_utils::http::json::json_response;
use zenith_utils::http::{RouterBuilder, RouterService};
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(json_response(StatusCode::OK, "")?)
json_response(StatusCode::OK, "")
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {


@@ -20,13 +20,14 @@ mod cancellation;
mod compute;
mod config;
mod cplane_api;
mod error;
mod http;
mod mgmt;
mod proxy;
mod stream;
mod waiters;
/// Flattens Result<Result<T>> into Result<T>.
/// Flattens `Result<Result<T>>` into `Result<T>`.
async fn flatten_err(
f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
) -> anyhow::Result<()> {


@@ -79,6 +79,18 @@ enum PsqlSessionResult {
Failure(String),
}
/// A message received by `mgmt` when a compute node is ready.
pub type ComputeReady = Result<DatabaseInfo, String>;
impl PsqlSessionResult {
fn into_compute_ready(self) -> ComputeReady {
match self {
Self::Success(db_info) => Ok(db_info),
Self::Failure(message) => Err(message),
}
}
}
impl postgres_backend::Handler for MgmtHandler {
fn process_query(
&mut self,
@@ -99,13 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R
let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;
use PsqlSessionResult::*;
let msg = match resp.result {
Success(db_info) => Ok(db_info),
Failure(message) => Err(message),
};
match cplane_api::notify(&resp.session_id, msg) {
match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) {
Ok(()) => {
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?


@@ -1,17 +1,18 @@
use crate::auth;
use crate::cancellation::{self, CancelClosure, CancelMap};
use crate::compute::DatabaseInfo;
use crate::cancellation::{self, CancelMap};
use crate::config::{ProxyConfig, TlsConfig};
use crate::stream::{MetricsStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use lazy_static::lazy_static;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::TcpStream;
use tokio_postgres::NoTls;
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
use zenith_utils::pq_proto::{BeMessage as Be, *};
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
lazy_static! {
static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_connections_accepted"),
@@ -30,6 +31,7 @@ lazy_static! {
.unwrap();
}
/// A small combinator for pluggable error logging.
async fn log_error<R, F>(future: F) -> F::Output
where
F: std::future::Future<Output = anyhow::Result<R>>,
@@ -76,20 +78,21 @@ async fn handle_client(
}
let tls = config.tls_config.clone();
if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? {
cancel_map
.with_session(|session| async {
connect_client_to_db(config, session, client, creds).await
})
.await?;
}
let (stream, creds) = match handshake(stream, tls, cancel_map).await? {
Some(x) => x,
None => return Ok(()), // it's a cancellation request
};
Ok(())
let client = Client::new(stream, creds);
cancel_map
.with_session(|session| client.connect_to_db(config, session))
.await
}
/// Handle a connection from one client.
/// For better testing experience, `stream` can be
/// any object satisfying the traits.
/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with an owned `stream` here, as we need to upgrade it to TLS;
/// we also take extra care to propagate only select handshake errors to the client.
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mut tls: Option<TlsConfig>,
@@ -119,7 +122,7 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
}
}
_ => bail!("protocol violation"),
_ => bail!(ERR_PROTO_VIOLATION),
},
GssEncRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_gss => {
@@ -128,18 +131,21 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// Currently, we don't support GSSAPI
stream.write_message(&Be::EncryptionResponse(false)).await?;
}
_ => bail!("protocol violation"),
_ => bail!(ERR_PROTO_VIOLATION),
},
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
let msg = "connection is insecure (try using `sslmode=require`)";
stream.write_message(&Be::ErrorResponse(msg)).await?;
bail!(msg);
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
}
break Ok(Some((stream, params.try_into()?)));
// Here and below: `or_else` demands that we use a future here
let creds = async { params.try_into() }
.or_else(|e| stream.throw_error(e))
.await?;
break Ok(Some((stream, creds)));
}
CancelRequest(cancel_key_data) => {
cancel_map.cancel_session(cancel_key_data).await?;
@@ -150,58 +156,60 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
}
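The loop above dispatches on the magic request codes that PostgreSQL clients put in their very first packet (SSLRequest, GSSENCRequest, CancelRequest, or a plain StartupMessage). A standalone sketch of how a client encodes the SSLRequest probe, using the request code from the PostgreSQL wire protocol:

// SSLRequest is a fixed 8-byte packet: a big-endian length (which includes
// itself) followed by the magic code 80877103.
fn ssl_request() -> [u8; 8] {
    let mut buf = [0u8; 8];
    buf[..4].copy_from_slice(&8i32.to_be_bytes());
    buf[4..].copy_from_slice(&80877103i32.to_be_bytes());
    buf
}

fn main() {
    assert_eq!(ssl_request(), [0, 0, 0, 8, 4, 210, 22, 47]);
}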
async fn connect_client_to_db(
config: &ProxyConfig,
session: cancellation::Session<'_>,
mut client: PqStream<impl AsyncRead + AsyncWrite + Unpin>,
/// Thin connection context.
struct Client<S> {
/// The underlying libpq protocol stream.
stream: PqStream<S>,
/// Client credentials that we care about.
creds: auth::ClientCredentials,
) -> anyhow::Result<()> {
let db_info = creds.authenticate(config, &mut client).await?;
let (db, version, cancel_closure) = connect_to_db(db_info).await?;
let cancel_key_data = session.enable_cancellation(cancel_closure);
client
.write_message_noflush(&BeMessage::ParameterStatus(
BeParameterStatusMessage::ServerVersion(&version),
))?
.write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
.write_message(&BeMessage::ReadyForQuery)
.await?;
// This function will be called for writes in either direction.
fn inc_proxied(cnt: usize) {
// Consider inventing something more sophisticated
// if this ever becomes a bottleneck (cacheline bouncing).
NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64);
}
let mut db = MetricsStream::new(db, inc_proxied);
let mut client = MetricsStream::new(client.into_inner(), inc_proxied);
let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
Ok(())
}
/// Connect to a corresponding compute node.
async fn connect_to_db(
db_info: DatabaseInfo,
) -> anyhow::Result<(TcpStream, String, CancelClosure)> {
// TODO: establish a secure connection to the DB
let socket_addr = db_info.socket_addr()?;
let mut socket = TcpStream::connect(socket_addr).await?;
impl<S> Client<S> {
/// Construct a new connection context.
fn new(stream: PqStream<S>, creds: auth::ClientCredentials) -> Self {
Self { stream, creds }
}
}
let (client, conn) = tokio_postgres::Config::from(db_info)
.connect_raw(&mut socket, NoTls)
.await?;
impl<S: AsyncRead + AsyncWrite + Unpin> Client<S> {
/// Let the client authenticate and connect to the designated compute node.
async fn connect_to_db(
self,
config: &ProxyConfig,
session: cancellation::Session<'_>,
) -> anyhow::Result<()> {
let Self { mut stream, creds } = self;
let version = conn
.parameter("server_version")
.context("failed to fetch postgres server version")?
.into();
// Authenticate and connect to a compute node.
let auth = creds.authenticate(config, &mut stream).await;
let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?;
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
let (db, version, cancel_closure) =
db_info.connect().or_else(|e| stream.throw_error(e)).await?;
let cancel_key_data = session.enable_cancellation(cancel_closure);
Ok((socket, version, cancel_closure))
stream
.write_message_noflush(&BeMessage::ParameterStatus(
BeParameterStatusMessage::ServerVersion(&version),
))?
.write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
.write_message(&BeMessage::ReadyForQuery)
.await?;
/// This function will be called for writes in either direction.
fn inc_proxied(cnt: usize) {
// Consider inventing something more sophisticated
// if this ever becomes a bottleneck (cacheline bouncing).
NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64);
}
// Starting from here we only proxy the client's traffic.
let mut db = MetricsStream::new(db, inc_proxied);
let mut client = MetricsStream::new(stream.into_inner(), inc_proxied);
let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
Ok(())
}
}
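A self-contained sketch of that final proxying phase, with in-memory duplex pipes standing in for the client and compute sockets and the metrics wrappers omitted (assumes tokio 1.x with the "io-util", "macros" and "rt-multi-thread" features):

use tokio::io::{AsyncReadExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let (mut client, client_end) = tokio::io::duplex(64);
    let (server_end, mut server) = tokio::io::duplex(64);

    // The proxy task: pump bytes both ways until both sides reach EOF.
    let proxy = tokio::spawn(async move {
        let (mut a, mut b) = (client_end, server_end);
        tokio::io::copy_bidirectional(&mut a, &mut b).await
    });

    client.write_all(b"hello").await?;
    client.shutdown().await?; // EOF in the client -> server direction

    let mut buf = Vec::new();
    server.read_to_end(&mut buf).await?;
    assert_eq!(buf, b"hello");

    server.shutdown().await?; // EOF in the server -> client direction
    proxy.await.unwrap()?;
    Ok(())
}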
#[cfg(test)]
@@ -210,7 +218,7 @@ mod tests {
use tokio::io::DuplexStream;
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres::tls::{MakeTlsConnect, NoTls};
use tokio_postgres_rustls::MakeRustlsConnect;
async fn dummy_proxy(
@@ -264,7 +272,7 @@ mod tests {
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into())));
tokio_postgres::Config::new()
let client_err = tokio_postgres::Config::new()
.user("john_doe")
.dbname("earth")
.ssl_mode(SslMode::Disable)
@@ -273,11 +281,15 @@ mod tests {
.err() // -> Option<E>
.context("client shouldn't be able to connect")?;
proxy
assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION));
let server_err = proxy
.await?
.err() // -> Option<E>
.context("server shouldn't accept client")?;
assert!(client_err.to_string().contains(&server_err.to_string()));
Ok(())
}
@@ -329,4 +341,30 @@ mod tests {
proxy.await?
}
#[tokio::test]
async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let proxy = tokio::spawn(dummy_proxy(client, None));
let client_err = tokio_postgres::Config::new()
.ssl_mode(SslMode::Disable)
.connect_raw(server, NoTls)
.await
.err() // -> Option<E>
.context("client shouldn't be able to connect")?;
// TODO: this is ugly, but `format!` won't let us extract the format string
assert!(client_err.to_string().contains("missing in startup packet"));
let server_err = proxy
.await?
.err() // -> Option<E>
.context("server shouldn't accept client")?;
assert!(client_err.to_string().contains(&server_err.to_string()));
Ok(())
}
}
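The tests above never open a real socket: `tokio::io::duplex` hands back two connected in-memory stream halves, one driving the proxy and one pretending to be the client. A standalone illustration of just that transport trick:

use tokio::io::{AsyncReadExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let (mut client, mut server) = tokio::io::duplex(1024);
    client.write_all(b"startup").await?;
    let mut buf = [0u8; 7];
    server.read_exact(&mut buf).await?;
    assert_eq!(&buf, b"startup");
    Ok(())
}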

View File

@@ -1,10 +1,12 @@
use anyhow::Context;
use crate::error::UserFacingError;
use anyhow::bail;
use bytes::BytesMut;
use pin_project_lite::pin_project;
use rustls::ServerConfig;
use std::pin::Pin;
use std::sync::Arc;
use std::{io, task};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf};
use tokio_rustls::server::TlsStream;
use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket};
@@ -35,38 +37,63 @@ impl<S> PqStream<S> {
self.stream
}
/// Get a reference to the underlying stream.
/// Get a shared reference to the underlying stream.
pub fn get_ref(&self) -> &S {
&self.stream
}
}
fn err_connection() -> io::Error {
io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
}
// TODO: change error type of `FeMessage::read_fut`
fn from_anyhow(e: anyhow::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e.to_string())
}
impl<S: AsyncRead + Unpin> PqStream<S> {
/// Receive [`FeStartupPacket`], which is the first packet sent by a client.
pub async fn read_startup_packet(&mut self) -> anyhow::Result<FeStartupPacket> {
match FeStartupPacket::read_fut(&mut self.stream).await? {
Some(FeMessage::StartupPacket(packet)) => Ok(packet),
None => anyhow::bail!("connection is lost"),
other => anyhow::bail!("bad message type: {:?}", other),
pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
// TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
let msg = FeStartupPacket::read_fut(&mut self.stream)
.await
.map_err(from_anyhow)?
.ok_or_else(err_connection)?;
match msg {
FeMessage::StartupPacket(packet) => Ok(packet),
_ => panic!("unreachable state"),
}
}
pub async fn read_message(&mut self) -> anyhow::Result<FeMessage> {
pub async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
match self.read_message().await? {
FeMessage::PasswordMessage(msg) => Ok(msg),
bad => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("unexpected message type: {:?}", bad),
)),
}
}
async fn read_message(&mut self) -> io::Result<FeMessage> {
FeMessage::read_fut(&mut self.stream)
.await?
.context("connection is lost")
.await
.map_err(from_anyhow)?
.ok_or_else(err_connection)
}
}
impl<S: AsyncWrite + Unpin> PqStream<S> {
/// Write the message into an internal buffer, but don't flush the underlying stream.
pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> {
pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
BeMessage::write(&mut self.buffer, message)?;
Ok(self)
}
/// Write the message into an internal buffer and flush it.
pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> {
pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
self.write_message_noflush(message)?;
self.flush().await?;
Ok(self)
@@ -79,6 +106,25 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
self.stream.flush().await?;
Ok(self)
}
/// Write the error message using [`Self::write_message`], then re-throw it.
/// Allowing string literals is safe under the assumption that they don't contain any runtime info.
pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
// This method exists due to `&str` not implementing `Into<anyhow::Error>`
self.write_message(&BeMessage::ErrorResponse(error)).await?;
bail!(error)
}
/// Write the error message using [`Self::write_message`], then re-throw it.
/// Trait [`UserFacingError`] acts as an allowlist for error types.
pub async fn throw_error<T, E>(&mut self, error: E) -> anyhow::Result<T>
where
E: UserFacingError + Into<anyhow::Error>,
{
let msg = error.to_string_client();
self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
bail!(error)
}
}
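Outside the proxy, the same "report to the peer, then re-throw" shape fits in a few lines. A sketch under the assumption that the peer is a plain async writer (the real method frames the text as a libpq `ErrorResponse`):

use anyhow::bail;
use tokio::io::{AsyncWrite, AsyncWriteExt};

// Mirrors `PqStream::throw_error`: surface the error to the peer first,
// then propagate it to the caller.
async fn throw_error<T, W, E>(peer: &mut W, error: E) -> anyhow::Result<T>
where
    W: AsyncWrite + Unpin,
    E: std::fmt::Display + Into<anyhow::Error>,
{
    peer.write_all(error.to_string().as_bytes()).await?;
    bail!(error)
}

#[tokio::main]
async fn main() {
    let mut peer = tokio::io::sink(); // a real peer would read the message
    let res: anyhow::Result<()> = throw_error(&mut peer, std::fmt::Error).await;
    assert!(res.is_err());
}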
pin_project! {
@@ -101,15 +147,25 @@ impl<S> Stream<S> {
}
}
#[derive(Debug, Error)]
#[error("Can't upgrade TLS stream")]
pub enum StreamUpgradeError {
#[error("Bad state reached: can't upgrade TLS stream")]
AlreadyTls,
#[error("Can't upgrade stream: IO error: {0}")]
Io(#[from] io::Error),
}
impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
/// If possible, upgrade raw stream into a secure TLS-based stream.
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> anyhow::Result<Self> {
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<Self, StreamUpgradeError> {
match self {
Stream::Raw { raw } => {
let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?);
Ok(Stream::Tls { tls })
}
Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"),
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
}
}
}

View File

@@ -1,11 +1,32 @@
use anyhow::{anyhow, Context};
use hashbrown::HashMap;
use parking_lot::Mutex;
use pin_project_lite::pin_project;
use std::pin::Pin;
use std::task;
use thiserror::Error;
use tokio::sync::oneshot;
#[derive(Debug, Error)]
pub enum RegisterError {
#[error("Waiter `{0}` already registered")]
Occupied(String),
}
#[derive(Debug, Error)]
pub enum NotifyError {
#[error("Notify failed: waiter `{0}` not registered")]
NotFound(String),
#[error("Notify failed: channel hangup")]
Hangup,
}
#[derive(Debug, Error)]
pub enum WaitError {
#[error("Wait failed: channel hangup")]
Hangup,
}
pub struct Waiters<T>(pub(self) Mutex<HashMap<String, oneshot::Sender<T>>>);
impl<T> Default for Waiters<T> {
@@ -15,13 +36,13 @@ impl<T> Default for Waiters<T> {
}
impl<T> Waiters<T> {
pub fn register(&self, key: String) -> anyhow::Result<Waiter<T>> {
pub fn register(&self, key: String) -> Result<Waiter<T>, RegisterError> {
let (tx, rx) = oneshot::channel();
self.0
.lock()
.try_insert(key.clone(), tx)
.map_err(|_| anyhow!("waiter already registered"))?;
.map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?;
Ok(Waiter {
receiver: rx,
@@ -32,7 +53,7 @@ impl<T> Waiters<T> {
})
}
pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()>
pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError>
where
T: Send + Sync,
{
@@ -40,9 +61,9 @@ impl<T> Waiters<T> {
.0
.lock()
.remove(key)
.with_context(|| format!("key {} not found", key))?;
.ok_or_else(|| NotifyError::NotFound(key.to_string()))?;
tx.send(value).map_err(|_| anyhow!("waiter channel hangup"))
tx.send(value).map_err(|_| NotifyError::Hangup)
}
}
@@ -66,13 +87,13 @@ pin_project! {
}
impl<T> std::future::Future for Waiter<'_, T> {
type Output = anyhow::Result<T>;
type Output = Result<T, WaitError>;
fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll<Self::Output> {
self.project()
.receiver
.poll(cx)
.map_err(|_| anyhow!("channel hangup"))
.map_err(|_| WaitError::Hangup)
}
}
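The register, notify, await flow reads well in isolation. A simplified, runnable analogue of the type above (error enums collapsed to strings, and without the drop-time cleanup the real `Waiter` performs):

use std::collections::HashMap;
use std::sync::Mutex;
use tokio::sync::oneshot;

struct Waiters<T>(Mutex<HashMap<String, oneshot::Sender<T>>>);

impl<T> Waiters<T> {
    fn new() -> Self {
        Waiters(Mutex::new(HashMap::new()))
    }

    // Hand out a receiver; the sender stays behind under the key.
    fn register(&self, key: String) -> oneshot::Receiver<T> {
        let (tx, rx) = oneshot::channel();
        self.0.lock().unwrap().insert(key, tx);
        rx
    }

    // Wake up whoever registered under `key` with `value`.
    fn notify(&self, key: &str, value: T) -> Result<(), &'static str> {
        let tx = self.0.lock().unwrap().remove(key).ok_or("not registered")?;
        tx.send(value).map_err(|_| "hangup")
    }
}

#[tokio::main]
async fn main() {
    let waiters = Waiters::new();
    let rx = waiters.register("session-1".into());
    waiters.notify("session-1", 42).unwrap();
    assert_eq!(rx.await.unwrap(), 42);
}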

View File

@@ -1,131 +0,0 @@
from contextlib import closing
import psycopg2.extras
import time
from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
#
# Test Garbage Collection of old layer files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_layerfiles_gc", "empty")
pg = env.postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
cur.execute("INSERT INTO foo VALUES (1)")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
row = cur.fetchone()
log.info(f"relfilenode is {row[0]}")
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
#
# Also run vacuum first to make it less likely that autovacuum or pruning
# kicks in and confuses our numbers.
cur.execute("VACUUM")
# Delete the row to update the Visibility Map. We don't want the VM
# update to confuse our numbers either.
cur.execute("DELETE FROM foo")
log.info("Running GC before test")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
# remember the number of files
layer_relfiles_remain = (row['layer_relfiles_total'] -
row['layer_relfiles_removed'])
assert layer_relfiles_remain > 0
# Insert a row and run GC. Checkpoint should freeze the layer
# so that there is only the most recent image layer left for the rel,
# removing the old image and delta layer.
log.info("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create new image and delta layer file with the new contents, and
# then remove the old image and the just-created delta layer.
log.info("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Do it again. Should again create two new layer files and remove old ones.
log.info("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
log.info("Run GC again, with nothing to do")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain
assert row['layer_relfiles_removed'] == 0
assert row['layer_relfiles_dropped'] == 0
#
# Test DROP TABLE: check that relation data and metadata were deleted from object storage by GC
#
log.info("Drop table and run GC again")
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
# We still cannot remove the latest layers
# because they serve as tombstones for earlier layers.
assert row['layer_relfiles_dropped'] == 0
# Each relation fork is counted separately, hence 3.
assert row['layer_relfiles_needed_as_tombstone'] == 3
# The catalog updates also create new layer files of the catalogs, which
# are counted as 'removed'
assert row['layer_relfiles_removed'] > 0
# TODO Change the test to check actual GC of dropped layers.
# Each relation fork is counted separately, hence 3.
#assert row['layer_relfiles_dropped'] == 3
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int:
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
.format_map(row))
log.info(
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
.format_map(row))

View File

@@ -19,10 +19,10 @@ def test_get_page(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker)
cur.execute('create table t (i integer);')
cur.execute('insert into t values (0);')
for i in range(1000):
for i in range(100000):
cur.execute(f'update t set i = {i};')
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
pscur.execute(f"checkpoint {env.initial_tenant.hex} {timeline} 0")
cur.execute("select * from t;")
res = cur.fetchall()

View File

@@ -1,8 +1,5 @@
# Test sequential scan speed
#
# The test table is large enough (3-4 MB) that it doesn't fit in the compute node
# cache, so the seqscans go to the page server. But small enough that it fits
# into memory in the page server.
from contextlib import closing
from dataclasses import dataclass
from fixtures.zenith_fixtures import ZenithEnv
@@ -12,11 +9,18 @@ from fixtures.compare_fixtures import PgCompare
import pytest
@pytest.mark.parametrize('rows', [
pytest.param(100000),
pytest.param(1000000, marks=pytest.mark.slow),
])
def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int):
@pytest.mark.parametrize(
'rows,iters,workers',
[
# The test table is large enough (3-4 MB) that it doesn't fit in the compute node
# cache, so the seqscans go to the page server. But small enough that it fits
# into memory in the page server.
pytest.param(100000, 100, 0),
# Also test with a larger table, with and without parallelism
pytest.param(10000000, 1, 0, marks=pytest.mark.slow),
pytest.param(10000000, 1, 4, marks=pytest.mark.slow)
])
def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int):
env = zenith_with_baseline
with closing(env.pg.connect()) as conn:
@@ -36,6 +40,8 @@ def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int):
assert int(shared_buffers) < int(table_size)
env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM)
cur.execute(f"set max_parallel_workers_per_gather = {workers}")
with env.record_duration('run'):
for i in range(1000):
for i in range(iters):
cur.execute('select count(*) from t;')

View File

@@ -37,3 +37,8 @@ bytes = "1.0.1"
hex-literal = "0.3"
tempfile = "3.2"
webpki = "0.21"
criterion = "0.3"
[[bench]]
name = "benchmarks"
harness = false

View File

@@ -0,0 +1,22 @@
#![allow(unused)]
use criterion::{criterion_group, criterion_main, Criterion};
use zenith_utils::zid;
pub fn bench_zid_stringify(c: &mut Criterion) {
// Can only use public methods.
let ztl = zid::ZTenantTimelineId::generate();
c.bench_function("zid.to_string", |b| {
b.iter(|| {
// FIXME measurement overhead?
//for _ in 0..1000 {
// ztl.tenant_id.to_string();
//}
ztl.tenant_id.to_string();
})
});
}
criterion_group!(benches, bench_zid_stringify);
criterion_main!(benches);
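With the `[[bench]]` section above in place, the suite should run under criterion via `cargo bench` in the crate directory (or, presumably, `cargo bench -p zenith_utils` from the workspace root), printing per-call timings for `zid.to_string`.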

View File

@@ -425,7 +425,7 @@ pub enum BeMessage<'a> {
ReadyForQuery,
RowDescription(&'a [RowDescriptor<'a>]),
XLogData(XLogDataBody<'a>),
NoticeResponse(String),
NoticeResponse(&'a str),
KeepAlive(WalSndKeepAlive),
}

View File

@@ -112,6 +112,17 @@ impl ZId {
rand::thread_rng().fill(&mut tli_buf);
ZId::from(tli_buf)
}
fn hex_encode(&self) -> String {
static HEX: &[u8] = b"0123456789abcdef";
let mut buf = vec![0u8; self.0.len() * 2];
for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) {
chunk[0] = HEX[((b >> 4) & 0xf) as usize];
chunk[1] = HEX[(b & 0xf) as usize];
}
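// SAFETY: `buf` holds only ASCII hex digits, so it is valid UTF-8.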
unsafe { String::from_utf8_unchecked(buf) }
}
}
impl FromStr for ZId {
@@ -147,13 +158,13 @@ impl From<[u8; 16]> for ZId {
impl fmt::Display for ZId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&hex::encode(self.0))
f.write_str(&self.hex_encode())
}
}
impl fmt::Debug for ZId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&hex::encode(self.0))
f.write_str(&self.hex_encode())
}
}
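A quick standalone check that the manual encoder above agrees with the `hex` crate it replaces on the `Display` path (byte values illustrative; assumes `hex` as a dev-dependency):

fn main() {
    let id: [u8; 16] = [
        0xde, 0xad, 0xbe, 0xef, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0xff,
    ];
    static HEX: &[u8] = b"0123456789abcdef";
    let mut buf = vec![0u8; id.len() * 2];
    for (&b, chunk) in id.iter().zip(buf.chunks_exact_mut(2)) {
        chunk[0] = HEX[(b >> 4) as usize];
        chunk[1] = HEX[(b & 0xf) as usize];
    }
    assert_eq!(String::from_utf8(buf).unwrap(), hex::encode(id));
}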