Add assertion

Try im (benchmarks panic)
Migrate update version to management APIv2 (#3430 )
2026-03-04 00:40:38 +00:00 · 2023-01-24 15:23:27 -05:00 · 2023-01-24 14:36:22 -05:00 · 2023-01-24 17:18:16 +01:00 · 2023-01-24 18:03:33 +02:00 · 2023-01-24 17:23:33 +02:00
41 changed files with 2354 additions and 939 deletions
--- a/.github/ansible/deploy.yaml
+++ b/.github/ansible/deploy.yaml
@@ -117,7 +117,8 @@
      shell:
        cmd: |
          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
      tags:
      - pageserver

@@ -186,6 +187,7 @@
      shell:
        cmd: |
          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
      tags:
      - safekeeper
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -29,6 +29,8 @@ storage:
          ansible_host: i-0565a8b4008aa3f40
        pageserver-2.us-east-2.aws.neon.build:
          ansible_host: i-01e31cdf7e970586a
+        pageserver-3.us-east-2.aws.neon.build:
+          ansible_host: i-0602a0291365ef7cc

    safekeepers:
      hosts:
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 /pg_install
 /target
+/tmp_check
+/tmp_check_cli
 __pycache__/
 test_output/
 .vscode
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -37,11 +37,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "amplify_num"
-version = "0.4.1"
-source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -66,6 +61,15 @@ dependencies = [
 "backtrace",
 ]

+[[package]]
+name = "archery"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02"
+dependencies = [
+ "static_assertions",
+]
+
 [[package]]
 name = "asn1-rs"
 version = "0.5.1"
@@ -137,15 +141,6 @@ dependencies = [
 "syn",
 ]

-[[package]]
-name = "atomic-polyfill"
-version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28"
-dependencies = [
- "critical-section",
-]
-
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -606,6 +601,15 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

+[[package]]
+name = "bitmaps"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "031043d04099746d8db04daf1fa424b2bc8bd69d92b25962dcde24da39ab64a2"
+dependencies = [
+ "typenum",
+]
+
 [[package]]
 name = "block-buffer"
 version = "0.10.3"
@@ -629,9 +633,9 @@ dependencies = [

 [[package]]
 name = "bumpalo"
-version = "3.11.1"
+version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
+checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"

 [[package]]
 name = "byteorder"
@@ -750,13 +754,13 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "4.0.32"
+version = "4.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39"
+checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2"
 dependencies = [
 "bitflags",
 "clap_derive",
- "clap_lex 0.3.0",
+ "clap_lex 0.3.1",
 "is-terminal",
 "once_cell",
 "strsim",
@@ -765,9 +769,9 @@ dependencies = [

 [[package]]
 name = "clap_derive"
-version = "4.0.21"
+version = "4.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014"
+checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
 dependencies = [
 "heck",
 "proc-macro-error",
@@ -787,9 +791,9 @@ dependencies = [

 [[package]]
 name = "clap_lex"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8"
+checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
 dependencies = [
 "os_str_bytes",
 ]
@@ -832,7 +836,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
 "futures",
 "hyper",
 "notify",
@@ -887,7 +891,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 4.0.32",
+ "clap 4.1.1",
 "comfy-table",
 "git-version",
 "nix",
@@ -988,12 +992,6 @@ dependencies = [
 "itertools",
 ]

-[[package]]
-name = "critical-section"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
-
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.6"
@@ -1030,12 +1028,11 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
+checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
 dependencies = [
 "cfg-if",
- "once_cell",
 ]

 [[package]]
@@ -1506,15 +1503,6 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"

-[[package]]
-name = "hash32"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
-dependencies = [
- "byteorder",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -1530,19 +1518,6 @@ dependencies = [
 "ahash",
 ]

-[[package]]
-name = "heapless"
-version = "0.7.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743"
-dependencies = [
- "atomic-polyfill",
- "hash32",
- "rustc_version",
- "spin 0.9.4",
- "stable_deref_trait",
-]
-
 [[package]]
 name = "heck"
 version = "0.4.0"
@@ -1762,6 +1737,20 @@ dependencies = [
 "unicode-normalization",
 ]

+[[package]]
+name = "im"
+version = "15.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0acd33ff0285af998aaf9b57342af478078f53492322fafc47450e09397e0e9"
+dependencies = [
+ "bitmaps",
+ "rand_core",
+ "rand_xoshiro",
+ "sized-chunks",
+ "typenum",
+ "version_check",
+]
+
 [[package]]
 name = "indexmap"
 version = "1.9.2"
@@ -1804,9 +1793,9 @@ dependencies = [

 [[package]]
 name = "io-lifetimes"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c"
+checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
 dependencies = [
 "libc",
 "windows-sys",
@@ -1916,12 +1905,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "libm"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
-
 [[package]]
 name = "link-cplusplus"
 version = "1.0.8"
@@ -2067,9 +2050,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

 [[package]]
 name = "nix"
-version = "0.26.1"
+version = "0.26.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694"
+checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
 dependencies = [
 "bitflags",
 "cfg-if",
@@ -2081,9 +2064,9 @@ dependencies = [

 [[package]]
 name = "nom"
-version = "7.1.2"
+version = "7.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
 dependencies = [
 "memchr",
 "minimal-lexical",
@@ -2154,7 +2137,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
 "autocfg",
- "libm",
 ]

 [[package]]
@@ -2230,14 +2212,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 name = "pageserver"
 version = "0.1.0"
 dependencies = [
- "amplify_num",
 "anyhow",
 "async-stream",
 "async-trait",
 "byteorder",
 "bytes",
 "chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
 "close_fds",
 "const_format",
 "consumption_metrics",
@@ -2252,6 +2233,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper",
+ "im",
 "itertools",
 "metrics",
 "nix",
@@ -2269,7 +2251,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest",
- "rstar",
+ "rpds",
 "scopeguard",
 "serde",
 "serde_json",
@@ -2581,9 +2563,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.49"
+version = "1.0.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
+checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
 dependencies = [
 "unicode-ident",
 ]
@@ -2683,7 +2665,7 @@ dependencies = [
 "bstr",
 "bytes",
 "chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
 "consumption_metrics",
 "futures",
 "git-version",
@@ -2742,14 +2724,13 @@ dependencies = [

 [[package]]
 name = "rand"
-version = "0.8.4"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
 dependencies = [
 "libc",
 "rand_chacha",
 "rand_core",
- "rand_hc",
 ]

 [[package]]
@@ -2772,10 +2753,10 @@ dependencies = [
 ]

 [[package]]
-name = "rand_hc"
-version = "0.3.1"
+name = "rand_xoshiro"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
+checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
 dependencies = [
 "rand_core",
 ]
@@ -2930,7 +2911,7 @@ dependencies = [
 "cc",
 "libc",
 "once_cell",
- "spin 0.5.2",
+ "spin",
 "untrusted",
 "web-sys",
 "winapi",
@@ -2950,14 +2931,12 @@ dependencies = [
 ]

 [[package]]
-name = "rstar"
-version = "0.9.3"
+name = "rpds"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa"
+checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000"
 dependencies = [
- "heapless",
- "num-traits",
- "smallvec",
+ "archery",
 ]

 [[package]]
@@ -3018,9 +2997,9 @@ dependencies = [

 [[package]]
 name = "rustix"
-version = "0.36.6"
+version = "0.36.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549"
+checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
 dependencies = [
 "bitflags",
 "errno",
@@ -3093,7 +3072,7 @@ dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
- "clap 4.0.32",
+ "clap 4.1.1",
 "const_format",
 "crc32c",
 "fs2",
@@ -3448,6 +3427,16 @@ version = "0.3.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"

+[[package]]
+name = "sized-chunks"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16d69225bde7a69b235da73377861095455d298f2b970996eec25ddbb42b3d1e"
+dependencies = [
+ "bitmaps",
+ "typenum",
+]
+
 [[package]]
 name = "slab"
 version = "0.4.7"
@@ -3479,21 +3468,6 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"

-[[package]]
-name = "spin"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09"
-dependencies = [
- "lock_api",
-]
-
-[[package]]
-name = "stable_deref_trait"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
-
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -3507,7 +3481,7 @@ dependencies = [
 "anyhow",
 "async-stream",
 "bytes",
- "clap 4.0.32",
+ "clap 4.1.1",
 "const_format",
 "futures",
 "futures-core",
@@ -3639,9 +3613,9 @@ dependencies = [

 [[package]]
 name = "termcolor"
-version = "1.1.3"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
 dependencies = [
 "winapi-util",
 ]
@@ -3749,9 +3723,9 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.24.1"
+version = "1.24.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
+checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb"
 dependencies = [
 "autocfg",
 "bytes",
@@ -4183,9 +4157,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"

 [[package]]
 name = "ureq"
-version = "2.6.1"
+version = "2.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566"
+checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d"
 dependencies = [
 "base64 0.13.1",
 "log",
@@ -4226,6 +4200,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "atty",
 "bincode",
 "byteorder",
 "bytes",
@@ -4287,7 +4262,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 4.0.32",
+ "clap 4.1.1",
 "env_logger",
 "log",
 "once_cell",
@@ -4534,7 +4509,7 @@ dependencies = [
 "anyhow",
 "bytes",
 "chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
 "crossbeam-utils",
 "either",
 "fail",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,7 +69,7 @@ rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 routerify = "3"
-rstar = "0.9.3"
+rpds = "0.12.0"
 rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
@@ -107,9 +107,6 @@ x509-parser = "0.14"
 env_logger = "0.10"
 log = "0.4"

-## TODO switch when the new release is made
-amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" }
-
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ impl ComputeNode {
        // If connection fails,
        // it may be the old node with `zenith_admin` superuser.
        //
-        // In this case we need to connect with old `zenith_admin`name
+        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
@@ -278,6 +278,7 @@ impl ComputeNode {
            Ok(client) => client,
        };

+        // Proceed with post-startup configuration. Note, that order of operations is important.
        handle_roles(&self.spec, &mut client)?;
        handle_databases(&self.spec, &mut client)?;
        handle_role_deletions(self, &mut client)?;
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -1,3 +1,9 @@
 pub const DEFAULT_LOG_LEVEL: &str = "info";
-pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
+// From Postgres docs:
+//   To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
+//   as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
+//   (see below), then SCRAM-based authentication will automatically be chosen instead.
+//   https://www.postgresql.org/docs/15/auth-password.html
+//
+// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
 pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -130,8 +130,8 @@ impl Role {
    /// Serialize a list of role parameters into a Postgres-acceptable
    /// string of arguments.
    pub fn to_pg_options(&self) -> String {
-        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
-        // For now we do not use generic `options` for roles. Once used, add
+        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
+        // For now, we do not use generic `options` for roles. Once used, add
        // `self.options.as_pg_options()` somewhere here.
        let mut params: String = "LOGIN".to_string();

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -152,8 +152,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
-                // Check whether password changed or not (trim 'md5:' prefix first)
-                if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() {
+                // Check whether password changed or not (trim 'md5' prefix first if any)
+                //
+                // This is a backward compatibility hack, which comes from the times when we were using
+                // md5 for everyone and hashes were stored in the console db without md5 prefix. So when
+                // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
+                // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
+                // Here is the only place so far where we compare hashes, so it seems to be the best candidate
+                // to place this compatibility layer.
+                let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
+                    stripped
+                } else {
+                    pg_pwd
+                };
+                if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
                    RoleAction::Update
                } else {
                    RoleAction::None
@@ -213,8 +225,20 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
    if let Some(ops) = &node.spec.delta_operations {
        // First, reassign all dependent objects to db owners.
        info!("reassigning dependent objects of to-be-deleted roles");
+
+        // Fetch existing roles. We could've exported and used `existing_roles` from
+        // `handle_roles()`, but we only make this list there before creating new roles.
+        // Which is probably fine as we never create to-be-deleted roles, but that'd
+        // just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
+        // buffers already, so this shouldn't be a big deal.
+        let mut xact = client.transaction()?;
+        let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
+        xact.commit()?;
+
        for op in ops {
-            if op.action == "delete_role" {
+            // Check that role is still present in Postgres, as this could be a
+            // restart with the same spec after role deletion.
+            if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
                reassign_owned_objects(node, &op.name)?;
            }
        }
--- a/control_plane/.gitignore
+++ b/control_plane/.gitignore
@@ -0,0 +1 @@
+tmp_check/
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -44,18 +44,17 @@ impl TenantState {
 /// A state of a timeline in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
-    /// Timeline is fully operational. If the containing Tenant is Active, the timeline's
-    /// background jobs are running otherwise they will be launched when the tenant is activated.
+    /// The timeline is recognized by the pageserver but is not yet operational.
+    /// In particular, the walreceiver connection loop is not running for this timeline.
+    /// It will eventually transition to state Active or Broken.
+    Loading,
+    /// The timeline is fully operational.
+    /// It can be queried, and the walreceiver connection loop is running.
    Active,
-    /// A timeline is recognized by pageserver, but not yet ready to operate.
-    /// The status indicates, that the timeline could eventually go back to Active automatically:
-    /// for example, if the owning tenant goes back to Active again.
-    Suspended,
-    /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
-    /// automatically become Active after certain events: only a management call can change this status.
+    /// The timeline was previously Loading or Active but is shutting down.
+    /// It cannot transition back into any other state.
    Stopping,
-    /// A timeline is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The timeline is broken and not operational (previous states: Loading or Active).
    Broken,
 }

--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -134,22 +134,25 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        op: Cow<'static, str>,
        lsn: u64,
        size: Option<u64>,
-    ) where
+    ) -> anyhow::Result<()>
+    where
        K: std::borrow::Borrow<Q>,
        Q: std::hash::Hash + Eq + std::fmt::Debug,
    {
-        let lastseg_id = *self.branches.get(branch).unwrap();
+        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
        let newseg_id = self.segments.len();
        let lastseg = &mut self.segments[lastseg_id];

        assert!(lsn > lastseg.end_lsn);

+        let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
+
        let newseg = Segment {
            op,
            parent: Some(lastseg_id),
            start_lsn: lastseg.end_lsn,
            end_lsn: lsn,
-            start_size: lastseg.end_size.unwrap(),
+            start_size,
            end_size: size,
            children_after: Vec::new(),
            needed: false,
@@ -158,6 +161,8 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {

        self.segments.push(newseg);
        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+
+        Ok(())
    }

    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
@@ -167,21 +172,24 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        op: Cow<'static, str>,
        lsn_bytes: u64,
        size_bytes: i64,
-    ) where
+    ) -> anyhow::Result<()>
+    where
        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
+        Q: std::hash::Hash + Eq + std::fmt::Debug,
    {
-        let lastseg_id = *self.branches.get(branch).unwrap();
+        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
        let newseg_id = self.segments.len();
        let lastseg = &mut self.segments[lastseg_id];

+        let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
+
        let newseg = Segment {
            op,
            parent: Some(lastseg_id),
            start_lsn: lastseg.end_lsn,
            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: lastseg.end_size.unwrap(),
-            end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
+            start_size: last_end_size,
+            end_size: Some((last_end_size as i64 + size_bytes) as u64),
            children_after: Vec::new(),
            needed: false,
        };
@@ -189,33 +197,33 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {

        self.segments.push(newseg);
        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+        Ok(())
    }

-    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
    where
        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
+        Q: std::hash::Hash + Eq + std::fmt::Debug,
    {
-        self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
+        self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
    }

-    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
    where
        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
+        Q: std::hash::Hash + Eq + std::fmt::Debug,
    {
-        self.modify_branch(branch, "update".into(), bytes, 0i64);
+        self.modify_branch(branch, "update".into(), bytes, 0i64)
    }

-    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
    where
        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
+        Q: std::hash::Hash + Eq + std::fmt::Debug,
    {
-        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
+        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
    }

-    /// Panics if the parent branch cannot be found.
    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
    where
        K: std::borrow::Borrow<Q> + std::fmt::Debug,
@@ -236,7 +244,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        Ok(())
    }

-    pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
+    pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
        // Phase 1: Mark all the segments that need to be retained
        for (_branch, &last_seg_id) in self.branches.iter() {
            let last_seg = &self.segments[last_seg_id];
@@ -261,7 +269,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        self.size_from_snapshot_later(0)
    }

-    fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
+    fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
        let seg = &self.segments[seg_id];

        let this_size = seg.end_lsn - seg.start_lsn;
@@ -272,10 +280,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        for &child_id in seg.children_after.iter() {
            // try each child both ways
            let child = &self.segments[child_id];
-            let p1 = self.size_from_wal(child_id);
+            let p1 = self.size_from_wal(child_id)?;

            let p = if !child.needed {
-                let p2 = self.size_from_snapshot_later(child_id);
+                let p2 = self.size_from_snapshot_later(child_id)?;
                if p1.total() < p2.total() {
                    p1
                } else {
@@ -286,15 +294,15 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            };
            children.push(p);
        }
-        SegmentSize {
+        Ok(SegmentSize {
            seg_id,
            method: if seg.needed { WalNeeded } else { Wal },
            this_size,
            children,
-        }
+        })
    }

-    fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
+    fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
        // If this is needed, then it's time to do the snapshot and continue
        // with wal method.
        let seg = &self.segments[seg_id];
@@ -305,10 +313,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            for &child_id in seg.children_after.iter() {
                // try each child both ways
                let child = &self.segments[child_id];
-                let p1 = self.size_from_wal(child_id);
+                let p1 = self.size_from_wal(child_id)?;

                let p = if !child.needed {
-                    let p2 = self.size_from_snapshot_later(child_id);
+                    let p2 = self.size_from_snapshot_later(child_id)?;
                    if p1.total() < p2.total() {
                        p1
                    } else {
@@ -319,12 +327,12 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
                };
                children.push(p);
            }
-            SegmentSize {
+            Ok(SegmentSize {
                seg_id,
                method: WalNeeded,
                this_size: seg.start_size,
                children,
-            }
+            })
        } else {
            // If any of the direct children are "needed", need to be able to reconstruct here
            let mut children_needed = false;
@@ -339,7 +347,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            let method1 = if !children_needed {
                let mut children = Vec::new();
                for child in seg.children_after.iter() {
-                    children.push(self.size_from_snapshot_later(*child));
+                    children.push(self.size_from_snapshot_later(*child)?);
                }
                Some(SegmentSize {
                    seg_id,
@@ -355,20 +363,25 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            let method2 = if children_needed || seg.children_after.len() >= 2 {
                let mut children = Vec::new();
                for child in seg.children_after.iter() {
-                    children.push(self.size_from_wal(*child));
+                    children.push(self.size_from_wal(*child)?);
                }
+                let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
                Some(SegmentSize {
                    seg_id,
                    method: SnapshotAfter,
-                    this_size: seg.end_size.unwrap(),
+                    this_size,
                    children,
                })
            } else {
                None
            };

-            match (method1, method2) {
-                (None, None) => panic!(),
+            Ok(match (method1, method2) {
+                (None, None) => anyhow::bail!(
+                    "neither method was applicable: children_after={}, children_needed={}",
+                    seg.children_after.len(),
+                    children_needed
+                ),
                (Some(method), None) => method,
                (None, Some(method)) => method,
                (Some(method1), Some(method2)) => {
@@ -378,7 +391,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
                        method2
                    }
                }
-            }
+            })
        }
    }

--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -7,118 +7,118 @@
 use tenant_size_model::{Segment, SegmentSize, Storage};

 // Main branch only. Some updates on it.
-fn scenario_1() -> (Vec<Segment>, SegmentSize) {
+fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    // Create main branch
    let mut storage = Storage::new("main");

    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
+    storage.insert("main", 5_000)?;

    // Stream of updates
    for _ in 0..5 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

-    let size = storage.calculate(1000);
+    let size = storage.calculate(1000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

 // Main branch only. Some updates on it.
-fn scenario_2() -> (Vec<Segment>, SegmentSize) {
+fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    // Create main branch
    let mut storage = Storage::new("main");

    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
+    storage.insert("main", 5_000)?;

    // Stream of updates
    for _ in 0..5 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

    // Branch
-    storage.branch("main", "child").unwrap();
-    storage.update("child", 1_000);
+    storage.branch("main", "child")?;
+    storage.update("child", 1_000)?;

    // More updates on parent
-    storage.update("main", 1_000);
+    storage.update("main", 1_000)?;

-    let size = storage.calculate(1000);
+    let size = storage.calculate(1000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

 // Like 2, but more updates on main
-fn scenario_3() -> (Vec<Segment>, SegmentSize) {
+fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    // Create main branch
    let mut storage = Storage::new("main");

    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
+    storage.insert("main", 5_000)?;

    // Stream of updates
    for _ in 0..5 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

    // Branch
-    storage.branch("main", "child").unwrap();
-    storage.update("child", 1_000);
+    storage.branch("main", "child")?;
+    storage.update("child", 1_000)?;

    // More updates on parent
    for _ in 0..5 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

-    let size = storage.calculate(1000);
+    let size = storage.calculate(1000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

 // Diverged branches
-fn scenario_4() -> (Vec<Segment>, SegmentSize) {
+fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    // Create main branch
    let mut storage = Storage::new("main");

    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
+    storage.insert("main", 5_000)?;

    // Stream of updates
    for _ in 0..5 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

    // Branch
-    storage.branch("main", "child").unwrap();
-    storage.update("child", 1_000);
+    storage.branch("main", "child")?;
+    storage.update("child", 1_000)?;

    // More updates on parent
    for _ in 0..8 {
-        storage.update("main", 1_000);
+        storage.update("main", 1_000)?;
    }

-    let size = storage.calculate(1000);
+    let size = storage.calculate(1000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

-fn scenario_5() -> (Vec<Segment>, SegmentSize) {
+fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    let mut storage = Storage::new("a");
-    storage.insert("a", 5000);
-    storage.branch("a", "b").unwrap();
-    storage.update("b", 4000);
-    storage.update("a", 2000);
-    storage.branch("a", "c").unwrap();
-    storage.insert("c", 4000);
-    storage.insert("a", 2000);
+    storage.insert("a", 5000)?;
+    storage.branch("a", "b")?;
+    storage.update("b", 4000)?;
+    storage.update("a", 2000)?;
+    storage.branch("a", "c")?;
+    storage.insert("c", 4000)?;
+    storage.insert("a", 2000)?;

-    let size = storage.calculate(5000);
+    let size = storage.calculate(5000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

-fn scenario_6() -> (Vec<Segment>, SegmentSize) {
+fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
    use std::borrow::Cow;

    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
@@ -133,18 +133,18 @@ fn scenario_6() -> (Vec<Segment>, SegmentSize) {

    let mut storage = Storage::new(None);

-    storage.branch(&None, branches[0]).unwrap(); // at 0
-    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
-    storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064
-    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
-    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
-    storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424
-    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
-    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
+    storage.branch(&None, branches[0])?; // at 0
+    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
+    storage.branch(&branches[0], branches[1])?; // at 108951064
+    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
+    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
+    storage.branch(&branches[0], branches[2])?; // at 283415424
+    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
+    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400

-    let size = storage.calculate(100_000);
+    let size = storage.calculate(100_000)?;

-    (storage.into_segments(), size)
+    Ok((storage.into_segments(), size))
 }

 fn main() {
@@ -163,7 +163,8 @@ fn main() {
            eprintln!("invalid scenario {}", other);
            std::process::exit(1);
        }
-    };
+    }
+    .unwrap();

    graphviz_tree(&segments, &size);
 }
@@ -251,7 +252,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {

 #[test]
 fn scenarios_return_same_size() {
-    type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
+    type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
    let truths: &[(u32, ScenarioFn, _)] = &[
        (line!(), scenario_1, 8000),
        (line!(), scenario_2, 9000),
@@ -262,7 +263,7 @@ fn scenarios_return_same_size() {
    ];

    for (line, scenario, expected) in truths {
-        let (_, size) = scenario();
+        let (_, size) = scenario().unwrap();
        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
    }
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+atty.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
    let base_logger = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .with_target(false)
-        .with_ansi(false)
+        .with_ansi(atty::is(atty::Stream::Stdout))
        .with_writer(std::io::stdout);

    match log_format {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -11,7 +11,6 @@ default = []
 testing = ["fail/failpoints"]

 [dependencies]
-amplify_num.workspace = true
 anyhow.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
@@ -41,7 +40,6 @@ postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
 regex.workspace = true
-rstar.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
@@ -68,6 +66,8 @@ tenant_size_model.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 reqwest.workspace = true
+rpds.workspace = true
+im = "15.1.0"

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,13 +1,12 @@
-use anyhow::Result;
+use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
-use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
+use pageserver::tenant::storage_layer::Layer;
+use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
 use std::io::{BufRead, BufReader};
-use std::ops::Range;
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -17,102 +16,35 @@ use utils::lsn::Lsn;

 use criterion::{criterion_group, criterion_main, Criterion};

-struct DummyDelta {
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-}
-
-impl Layer for DummyDelta {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-    fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_data: &mut ValueReconstructState,
-    ) -> Result<ValueReconstructResult> {
-        panic!()
-    }
-
-    fn is_incremental(&self) -> bool {
-        true
-    }
-
-    fn dump(&self, _verbose: bool) -> Result<()> {
-        unimplemented!()
-    }
-
-    fn short_id(&self) -> String {
-        unimplemented!()
-    }
-}
-
-struct DummyImage {
-    key_range: Range<Key>,
-    lsn: Lsn,
-}
-
-impl Layer for DummyImage {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        // End-bound is exclusive
-        self.lsn..(self.lsn + 1)
-    }
-
-    fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_data: &mut ValueReconstructState,
-    ) -> Result<ValueReconstructResult> {
-        panic!()
-    }
-
-    fn is_incremental(&self) -> bool {
-        false
-    }
-
-    fn dump(&self, _verbose: bool) -> Result<()> {
-        unimplemented!()
-    }
-
-    fn short_id(&self) -> String {
-        unimplemented!()
-    }
-}
-
-fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
-    let mut layer_map = LayerMap::<dyn Layer>::default();
+fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
+    let mut layer_map = LayerMap::<LayerDescriptor>::default();

    let mut min_lsn = Lsn(u64::MAX);
    let mut max_lsn = Lsn(0);

    let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();

+    let mut updates = layer_map.batch_update();
    for fname in filenames {
        let fname = &fname.unwrap();
        if let Some(imgfilename) = ImageFileName::parse_str(fname) {
-            let layer = DummyImage {
-                key_range: imgfilename.key_range,
-                lsn: imgfilename.lsn,
+            let layer = LayerDescriptor {
+                key: imgfilename.key_range,
+                lsn: imgfilename.lsn..(imgfilename.lsn + 1),
+                is_incremental: false,
+                short_id: fname.to_string(),
            };
-            layer_map.insert_historic(Arc::new(layer));
+            updates.insert_historic(Arc::new(layer));
            min_lsn = min(min_lsn, imgfilename.lsn);
            max_lsn = max(max_lsn, imgfilename.lsn);
        } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
-            let layer = DummyDelta {
-                key_range: deltafilename.key_range,
-                lsn_range: deltafilename.lsn_range.clone(),
+            let layer = LayerDescriptor {
+                key: deltafilename.key_range.clone(),
+                lsn: deltafilename.lsn_range.clone(),
+                is_incremental: true,
+                short_id: fname.to_string(),
            };
-            layer_map.insert_historic(Arc::new(layer));
+            updates.insert_historic(Arc::new(layer));
            min_lsn = min(min_lsn, deltafilename.lsn_range.start);
            max_lsn = max(max_lsn, deltafilename.lsn_range.end);
        } else {
@@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {

    println!("min: {min_lsn}, max: {max_lsn}");

+    updates.flush();
    layer_map
 }

 /// Construct a layer map query pattern for benchmarks
-fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
+fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
    // For each image layer we query one of the pages contained, at LSN right
    // before the image layer was created. This gives us a somewhat uniform
    // coverage of both the lsn and key space because image layers have
@@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
        .collect()
 }

+// Construct a partitioning for testing get_difficulty map when we
+// don't have an exact result of `collect_keyspace` to work with.
+fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
+    let mut parts = Vec::new();
+
+    // We add a partition boundary at the start of each image layer,
+    // no matter what lsn range it covers. This is just the easiest
+    // thing to do. A better thing to do would be to get a real
+    // partitioning from some database. Even better, remove the need
+    // for key partitions by deciding where to create image layers
+    // directly based on a coverage-based difficulty map.
+    let mut keys: Vec<_> = layer_map
+        .iter_historic_layers()
+        .filter_map(|l| {
+            if l.is_incremental() {
+                None
+            } else {
+                let kr = l.get_key_range();
+                Some(kr.start.next())
+            }
+        })
+        .collect();
+    keys.sort();
+
+    let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
+    for key in keys {
+        parts.push(KeySpace {
+            ranges: vec![current_key..key],
+        });
+        current_key = key;
+    }
+
+    KeyPartitioning { parts }
+}
+
 // Benchmark using metadata extracted from our performance test environment, from
 // a project where we have run pgbench many timmes. The pgbench database was initialized
 // between each test run.
@@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) {
 // Benchmark using metadata extracted from a real project that was taknig
 // too long processing layer map queries.
 fn bench_from_real_project(c: &mut Criterion) {
-    // TODO consider compressing this file
+    // Init layer map
+    let now = Instant::now();
    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    println!("Finished layer map init in {:?}", now.elapsed());
+
+    // Choose uniformly distributed queries
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

-    // Test with uniform query pattern
-    c.bench_function("real_map_uniform_queries", |b| {
+    // Choose inputs for get_difficulty_map
+    let latest_lsn = layer_map
+        .iter_historic_layers()
+        .map(|l| l.get_lsn_range().end)
+        .max()
+        .unwrap();
+    let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
+
+    // Check correctness of get_difficulty_map
+    // TODO put this in a dedicated test outside of this mod
+    {
+        println!("running correctness check");
+
+        let now = Instant::now();
+        let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
+        assert!(result_bruteforce.len() == partitioning.parts.len());
+        println!("Finished bruteforce in {:?}", now.elapsed());
+
+        let now = Instant::now();
+        let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
+        assert!(result_fast.len() == partitioning.parts.len());
+        println!("Finished fast in {:?}", now.elapsed());
+
+        // Assert results are equal. Manually iterate for easier debugging.
+        let zip = std::iter::zip(
+            &partitioning.parts,
+            std::iter::zip(result_bruteforce, result_fast),
+        );
+        for (_part, (bruteforce, fast)) in zip {
+            assert_eq!(bruteforce, fast);
+        }
+
+        println!("No issues found");
+    }
+
+    // Define and name the benchmark function
+    let mut group = c.benchmark_group("real_map");
+    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
                layer_map.search(q.0, q.1);
            }
        });
    });
+    group.bench_function("get_difficulty_map", |b| {
+        b.iter(|| {
+            layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
+        });
+    });
+    group.finish();
 }

 // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
 fn bench_sequential(c: &mut Criterion) {
-    let mut layer_map: LayerMap<dyn Layer> = LayerMap::default();
-
    // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
    //
    // TODO This code is pretty slow and runs even if we're only running other
@@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) {
    //      Putting it inside the `bench_function` closure is not a solution
    //      because then it runs multiple times during warmup.
    let now = Instant::now();
+    let mut layer_map = LayerMap::default();
+    let mut updates = layer_map.batch_update();
    for i in 0..100_000 {
-        // TODO try inserting a super-wide layer in between every 10 to reflect
-        //      what often happens with L1 layers that include non-rel changes.
-        //      Maybe do that as a separate test.
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = DummyImage {
-            key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1),
-            lsn: Lsn(10 * i),
+        let layer = LayerDescriptor {
+            key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
+            lsn: Lsn(i)..Lsn(i + 1),
+            is_incremental: false,
+            short_id: format!("Layer {}", i),
        };
-        layer_map.insert_historic(Arc::new(layer));
+        updates.insert_historic(Arc::new(layer));
    }
-
-    // Manually measure runtime without criterion because criterion
-    // has a minimum sample size of 10 and I don't want to run it 10 times.
-    println!("Finished init in {:?}", now.elapsed());
+    updates.flush();
+    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose 100 uniformly random queries
    let rng = &mut StdRng::seed_from_u64(1);
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
-        .choose_multiple(rng, 1)
+        .choose_multiple(rng, 100)
        .copied()
        .collect();

    // Define and name the benchmark function
-    c.bench_function("sequential_uniform_queries", |b| {
-        // Run the search queries
+    let mut group = c.benchmark_group("sequential");
+    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
                layer_map.search(q.0, q.1);
            }
        });
    });
+    group.finish();
 }

 criterion_group!(group_1, bench_from_captest_env);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -693,6 +693,11 @@ impl PageServerConf {
        Ok(t_conf)
    }

+    #[cfg(test)]
+    pub fn test_repo_dir(test_name: &str) -> PathBuf {
+        PathBuf::from(format!("../tmp_check/test_{test_name}"))
+    }
+
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");

--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -59,7 +59,7 @@ pub async fn collect_metrics(
        None,
        None,
        "synthetic size calculation",
-        true,
+        false,
        async move {
            calculate_synthetic_size_worker(synthetic_size_calculation_interval)
                .instrument(info_span!("synthetic_size_worker"))
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -430,6 +430,13 @@ paths:
        schema:
          type: string
          format: hex
+      - name: inputs_only
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: |
+          When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
    get:
      description: |
        Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
@@ -449,8 +456,9 @@ paths:
                    format: hex
                  size:
                    type: integer
+                    nullable: true
                    description: |
-                      Size metric in bytes.
+                      Size metric in bytes or null if inputs_only=true was given.
        "401":
          description: Unauthorized Error
          content:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -239,11 +239,7 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
    request
        .uri()
        .query()
-        .map(|v| {
-            url::form_urlencoded::parse(v.as_bytes())
-                .into_owned()
-                .any(|(p, _)| p == param)
-        })
+        .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param))
        .unwrap_or(false)
 }

@@ -252,13 +248,12 @@ fn get_query_param(request: &Request<Body>, param_name: &str) -> Result<String,
        Err(ApiError::BadRequest(anyhow!("empty query in request"))),
        |v| {
            url::form_urlencoded::parse(v.as_bytes())
-                .into_owned()
                .find(|(k, _)| k == param_name)
                .map_or(
                    Err(ApiError::BadRequest(anyhow!(
                        "no {param_name} specified in query parameters"
                    ))),
-                    |(_, v)| Ok(v),
+                    |(_, v)| Ok(v.into_owned()),
                )
        },
    )
@@ -282,7 +277,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body

        let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
            .await
-            .context("Failed to get local timeline info: {e:#}")
+            .context("get local timeline info")
            .map_err(ApiError::InternalServerError)?;

        Ok::<_, ApiError>(timeline_info)
@@ -453,21 +448,39 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    json_response(StatusCode::OK, tenant_info)
 }

+/// HTTP endpoint to query the current tenant_size of a tenant.
+///
+/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
+/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
+/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
+/// values.
 async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let inputs_only = if query_param_present(&request, "inputs_only") {
+        get_query_param(&request, "inputs_only")?
+            .parse()
+            .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))?
+    } else {
+        false
+    };
+
    let tenant = mgr::get_tenant(tenant_id, true)
        .await
        .map_err(ApiError::InternalServerError)?;

-    // this can be long operation, it currently is not backed by any request coalescing or similar
+    // this can be long operation
    let inputs = tenant
        .gather_size_inputs()
        .await
        .map_err(ApiError::InternalServerError)?;

-    let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
+    let size = if !inputs_only {
+        Some(inputs.calculate().map_err(ApiError::InternalServerError)?)
+    } else {
+        None
+    };

    /// Private response type with the additional "unstable" `inputs` field.
    ///
@@ -479,7 +492,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
        #[serde_as(as = "serde_with::DisplayFromStr")]
        id: TenantId,
        /// Size is a mixture of WAL and logical size, so the unit is bytes.
-        size: u64,
+        ///
+        /// Will be none if `?inputs_only=true` was given.
+        size: Option<u64>,
        inputs: crate::tenant::size::ModelInputs,
    }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -488,7 +488,7 @@ impl Timeline {
                let mut buf = self
                    .get(relsize_key, lsn)
                    .await
-                    .context("read relation size of {rel:?}")?;
+                    .with_context(|| format!("read relation size of {rel:?}"))?;
                let relsize = buf.get_u32_le();

                total_size += relsize as u64;
@@ -1405,15 +1405,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
    Key {
        field1: 0x01,
        field2,
-        field3: segno,
-        field4: 0,
+        field3: 1,
+        field4: segno,
        field5: 0,
        field6: 0,
    }..Key {
        field1: 0x01,
        field2,
-        field3: segno,
-        field4: 0,
+        field3: 1,
+        field4: segno,
        field5: 1,
        field6: 0,
    }
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,6 +37,17 @@ impl Key {
            | self.field6 as i128
    }

+    pub fn from_i128(x: i128) -> Self {
+        Key {
+            field1: ((x >> 120) & 0xf) as u8,
+            field2: ((x >> 104) & 0xFFFF) as u32,
+            field3: (x >> 72) as u32,
+            field4: (x >> 40) as u32,
+            field5: (x >> 32) as u8,
+            field6: x as u32,
+        }
+    }
+
    pub fn next(&self) -> Key {
        self.add(1)
    }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -183,12 +183,29 @@ pub enum TaskKind {
    // associated with one later, after receiving a command from the client.
    PageRequestHandler,

-    // Manages the WAL receiver connection for one timeline. It subscribes to
-    // events from storage_broker, decides which safekeeper to connect to. It spawns a
-    // separate WalReceiverConnection task to handle each connection.
+    /// Manages the WAL receiver connection for one timeline.
+    /// It subscribes to events from storage_broker and decides which safekeeper to connect to.
+    /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library.
+    /// There is at most one connection at any given time.
+    ///
+    /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`.
+    /// The `Client` object is what library users use to make requests & get responses.
+    /// Internally, `Client` hands over requests to the `Connection` object.
+    /// The `Connection` object is responsible for speaking the wire protocol.
+    ///
+    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
+    /// That abstraction doesn't use `task_mgr` and hence, has no `TaskKind`.
+    /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
+    ///
+    /// Once the connection is established, the `TaskHandle` task creates a
+    /// [`WalReceiverConnection`] task_mgr task that is responsible for polling
+    /// the `Connection` object.
+    /// A `CancellationToken` created by the `TaskHandle` task ensures
+    /// that the [`WalReceiverConnection`] task will cancel soon after as the `TaskHandle` is dropped.
    WalReceiverManager,

-    // Handles a connection to a safekeeper, to stream WAL to a timeline.
+    /// The task that polls the `tokio-postgres::Connection` object.
+    /// See the comment on [`WalReceiverManager`].
    WalReceiverConnection,

    // Garbage collection worker. One per tenant
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -188,7 +188,7 @@ impl UninitializedTimeline<'_> {
        mut self,
        timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
        load_layer_map: bool,
-        launch_wal_receiver: bool,
+        activate: bool,
    ) -> anyhow::Result<Arc<Timeline>> {
        let timeline_id = self.timeline_id;
        let tenant_id = self.owning_tenant.tenant_id;
@@ -221,13 +221,12 @@ impl UninitializedTimeline<'_> {
                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
                    )
                })?;
-                new_timeline.set_state(TimelineState::Active);
                v.insert(Arc::clone(&new_timeline));

                new_timeline.maybe_spawn_flush_loop();

-                if launch_wal_receiver {
-                    new_timeline.launch_wal_receiver();
+                if activate {
+                    new_timeline.activate();
                }
            }
        }
@@ -1462,8 +1461,7 @@ impl Tenant {
                    tasks::start_background_loops(self.tenant_id);

                    for timeline in not_broken_timelines {
-                        timeline.set_state(TimelineState::Active);
-                        timeline.launch_wal_receiver();
+                        timeline.activate();
                    }
                }
            }
@@ -1487,7 +1485,7 @@ impl Tenant {
                        .values()
                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
                    for timeline in not_broken_timelines {
-                        timeline.set_state(TimelineState::Suspended);
+                        timeline.set_state(TimelineState::Stopping);
                    }
                }
                TenantState::Broken => {
@@ -2626,10 +2624,10 @@ where
 #[cfg(test)]
 pub mod harness {
    use bytes::{Bytes, BytesMut};
+    use once_cell::sync::Lazy;
    use once_cell::sync::OnceCell;
-    use std::sync::Arc;
+    use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
    use std::{fs, path::PathBuf};
-    use tempfile::TempDir;
    use utils::logging;
    use utils::lsn::Lsn;

@@ -2661,6 +2659,8 @@ pub mod harness {
        buf.freeze()
    }

+    static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
+
    impl From<TenantConf> for TenantConfOpt {
        fn from(tenant_conf: TenantConf) -> Self {
            Self {
@@ -2681,31 +2681,42 @@ pub mod harness {
        }
    }

-    /// The harness saves some boilerplate and provides a way to create functional tenant
-    /// without running pageserver binary. It uses temporary directory to store data in it.
-    /// Tempdir gets removed on harness drop.
-    pub struct TenantHarness {
-        // keep the struct to not to remove tmp dir during the test
-        _temp_repo_dir: TempDir,
+    pub struct TenantHarness<'a> {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
        pub tenant_id: TenantId,
+
+        pub lock_guard: (
+            Option<RwLockReadGuard<'a, ()>>,
+            Option<RwLockWriteGuard<'a, ()>>,
+        ),
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();

-    impl TenantHarness {
-        pub fn new() -> anyhow::Result<Self> {
+    impl<'a> TenantHarness<'a> {
+        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
+            Self::create_internal(test_name, false)
+        }
+        pub fn create_exclusive(test_name: &'static str) -> anyhow::Result<Self> {
+            Self::create_internal(test_name, true)
+        }
+        fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result<Self> {
+            let lock_guard = if exclusive {
+                (None, Some(LOCK.write().unwrap()))
+            } else {
+                (Some(LOCK.read().unwrap()), None)
+            };
+
            LOG_HANDLE.get_or_init(|| {
                logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
            });

-            let temp_repo_dir = tempfile::tempdir()?;
-            // `TempDir` uses a randomly generated subdirectory of a system tmp dir,
-            // so far it's enough to take care of concurrently running tests.
-            let repo_dir = temp_repo_dir.path();
+            let repo_dir = PageServerConf::test_repo_dir(test_name);
+            let _ = fs::remove_dir_all(&repo_dir);
+            fs::create_dir_all(&repo_dir)?;

-            let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf());
+            let conf = PageServerConf::dummy_conf(repo_dir);
            // Make a static copy of the config. This can never be free'd, but that's
            // OK in a test.
            let conf: &'static PageServerConf = Box::leak(Box::new(conf));
@@ -2723,10 +2734,10 @@ pub mod harness {
            fs::create_dir_all(conf.timelines_path(&tenant_id))?;

            Ok(Self {
-                _temp_repo_dir: temp_repo_dir,
                conf,
                tenant_conf,
                tenant_id,
+                lock_guard,
            })
        }

@@ -2820,8 +2831,7 @@ mod tests {

    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_basic")?.load().await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -2854,8 +2864,9 @@ mod tests {

    #[tokio::test]
    async fn no_duplicate_timelines() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("no_duplicate_timelines")?
+            .load()
+            .await;
        let _ = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -2886,8 +2897,7 @@ mod tests {
    ///
    #[tokio::test]
    async fn test_branch() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_branch")?.load().await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -2984,8 +2994,10 @@ mod tests {

    #[tokio::test]
    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant =
+            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
+                .load()
+                .await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3020,8 +3032,9 @@ mod tests {

    #[tokio::test]
    async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
+            .load()
+            .await;

        tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
@@ -3070,8 +3083,9 @@ mod tests {

    #[tokio::test]
    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
+            .load()
+            .await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3093,8 +3107,9 @@ mod tests {
    }
    #[tokio::test]
    async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
+            .load()
+            .await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3125,7 +3140,8 @@ mod tests {

    #[tokio::test]
    async fn timeline_load() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        const TEST_NAME: &str = "timeline_load";
+        let harness = TenantHarness::create(TEST_NAME)?;
        {
            let tenant = harness.load().await;
            let tline = tenant
@@ -3144,7 +3160,8 @@ mod tests {

    #[tokio::test]
    async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        const TEST_NAME: &str = "timeline_load_with_ancestor";
+        let harness = TenantHarness::create(TEST_NAME)?;
        // create two timelines
        {
            let tenant = harness.load().await;
@@ -3182,7 +3199,8 @@ mod tests {

    #[tokio::test]
    async fn corrupt_metadata() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        const TEST_NAME: &str = "corrupt_metadata";
+        let harness = TenantHarness::create(TEST_NAME)?;
        let tenant = harness.load().await;

        tenant
@@ -3223,8 +3241,7 @@ mod tests {

    #[tokio::test]
    async fn test_images() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_images")?.load().await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3291,8 +3308,7 @@ mod tests {
    //
    #[tokio::test]
    async fn test_bulk_insert() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_bulk_insert")?.load().await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3336,8 +3352,7 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_random_updates")?.load().await;
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3410,8 +3425,9 @@ mod tests {

    #[tokio::test]
    async fn test_traverse_branches() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_traverse_branches")?
+            .load()
+            .await;
        let mut tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
@@ -3495,8 +3511,9 @@ mod tests {

    #[tokio::test]
    async fn test_traverse_ancestors() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_traverse_ancestors")?
+            .load()
+            .await;
        let mut tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -76,7 +76,7 @@ impl EphemeralFile {
        })
    }

-    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> {
+    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
        let mut off = 0;
        while off < PAGE_SZ {
            let n = self
@@ -277,7 +277,7 @@ impl Drop for EphemeralFile {
    }
 }

-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> {
+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
            Ok(_) => Ok(()),
@@ -332,17 +332,25 @@ mod tests {
    use super::*;
    use crate::tenant::blob_io::{BlobCursor, BlobWriter};
    use crate::tenant::block_io::BlockCursor;
-    use crate::tenant::harness::TenantHarness;
    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

-    fn harness() -> Result<(TenantHarness, TimelineId), io::Error> {
-        let harness = TenantHarness::new().expect("Failed to create tenant harness");
-        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(harness.timeline_path(&timeline_id))?;
+    fn harness(
+        test_name: &str,
+    ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+        let _ = fs::remove_dir_all(&repo_dir);
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        // Make a static copy of the config. This can never be free'd, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-        Ok((harness, timeline_id))
+        let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
+        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
+        fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;
+
+        Ok((conf, tenant_id, timeline_id))
    }

    // Helper function to slurp contents of a file, starting at the current position,
@@ -359,10 +367,10 @@ mod tests {
    }

    #[test]
-    fn test_ephemeral_files() -> io::Result<()> {
-        let (harness, timeline_id) = harness()?;
+    fn test_ephemeral_files() -> Result<(), io::Error> {
+        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;

-        let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
+        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        file_a.write_all_at(b"foo", 0)?;
        assert_eq!("foo", read_string(&file_a, 0, 20)?);
@@ -373,7 +381,7 @@ mod tests {
        // Open a lot of files, enough to cause some page evictions.
        let mut efiles = Vec::new();
        for fileno in 0..100 {
-            let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
+            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
            efiles.push((fileno, efile));
@@ -390,10 +398,10 @@ mod tests {
    }

    #[test]
-    fn test_ephemeral_blobs() -> io::Result<()> {
-        let (harness, timeline_id) = harness()?;
+    fn test_ephemeral_blobs() -> Result<(), io::Error> {
+        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

-        let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo")?;
        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -0,0 +1,583 @@
+use std::collections::BTreeMap;
+use std::ops::Range;
+
+use tracing::info;
+
+use super::layer_coverage::LayerCoverageTuple;
+
+/// Layers in this module are identified and indexed by this data.
+///
+/// This is a helper struct to enable sorting layers by lsn.start.
+///
+/// These three values are enough to uniquely identify a layer, since
+/// a layer is obligated to contain all contents within range, so two
+/// deltas (or images) with the same range have identical content.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct LayerKey {
+    // TODO I use i128 and u64 because it was easy for prototyping,
+    //      testing, and benchmarking. If we can use the Lsn and Key
+    //      types without overhead that would be preferable.
+    pub key: Range<i128>,
+    pub lsn: Range<u64>,
+    pub is_image: bool,
+}
+
+impl PartialOrd for LayerKey {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for LayerKey {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // NOTE we really care about comparing by lsn.start first
+        self.lsn
+            .start
+            .cmp(&other.lsn.start)
+            .then(self.lsn.end.cmp(&other.lsn.end))
+            .then(self.key.start.cmp(&other.key.start))
+            .then(self.key.end.cmp(&other.key.end))
+            .then(self.is_image.cmp(&other.is_image))
+    }
+}
+
+/// Efficiently queryable layer coverage for each LSN.
+///
+/// Allows answering layer map queries very efficiently,
+/// but doesn't allow retroactive insertion, which is
+/// sometimes necessary. See BufferedHistoricLayerCoverage.
+pub struct HistoricLayerCoverage<Value> {
+    /// The latest state
+    head: LayerCoverageTuple<Value>,
+
+    /// All previous states
+    historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
+}
+
+impl<T: Clone> Default for HistoricLayerCoverage<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<Value: Clone> HistoricLayerCoverage<Value> {
+    pub fn new() -> Self {
+        Self {
+            head: LayerCoverageTuple::default(),
+            historic: BTreeMap::default(),
+        }
+    }
+
+    /// Add a layer
+    ///
+    /// Panics if new layer has older lsn.start than an existing layer.
+    /// See BufferedHistoricLayerCoverage for a more general insertion method.
+    pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
+        // It's only a persistent map, not a retroactive one
+        if let Some(last_entry) = self.historic.iter().next_back() {
+            let last_lsn = last_entry.0;
+            if layer_key.lsn.start < *last_lsn {
+                panic!("unexpected retroactive insert");
+            }
+        }
+
+        // Insert into data structure
+        if layer_key.is_image {
+            self.head
+                .image_coverage
+                .insert(layer_key.key, layer_key.lsn.clone(), value);
+        } else {
+            self.head
+                .delta_coverage
+                .insert(layer_key.key, layer_key.lsn.clone(), value);
+        }
+
+        // Remember history. Clone is O(1)
+        self.historic.insert(layer_key.lsn.start, self.head.clone());
+    }
+
+    /// Query at a particular LSN, inclusive
+    pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple<Value>> {
+        match self.historic.range(..=lsn).next_back() {
+            Some((_, v)) => Some(v),
+            None => None,
+        }
+    }
+
+    /// Remove all entries after a certain LSN (inclusive)
+    pub fn trim(&mut self, begin: &u64) {
+        self.historic.split_off(begin);
+        self.head = self
+            .historic
+            .iter()
+            .rev()
+            .next()
+            .map(|(_, v)| v.clone())
+            .unwrap_or_default();
+    }
+}
+
+/// This is the most basic test that demonstrates intended usage.
+/// All layers in this test have height 1.
+#[test]
+fn test_persistent_simple() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 100..101,
+            is_image: true,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 3..9,
+            lsn: 110..111,
+            is_image: true,
+        },
+        "Layer 2".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 5..6,
+            lsn: 120..121,
+            is_image: true,
+        },
+        "Layer 3".to_string(),
+    );
+
+    // After Layer 1 insertion
+    let version = map.get_version(105).unwrap();
+    assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string()));
+    assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+
+    // After Layer 2 insertion
+    let version = map.get_version(115).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+    assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string()));
+    assert_eq!(version.image_coverage.query(11), None);
+
+    // After Layer 3 insertion
+    let version = map.get_version(125).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+    assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string()));
+    assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string()));
+}
+
+/// Cover simple off-by-one edge cases
+#[test]
+fn test_off_by_one() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 3..5,
+            lsn: 100..110,
+            is_image: true,
+        },
+        "Layer 1".to_string(),
+    );
+
+    // Check different LSNs
+    let version = map.get_version(99);
+    assert!(version.is_none());
+    let version = map.get_version(100).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+    let version = map.get_version(110).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+
+    // Check different keys
+    let version = map.get_version(105).unwrap();
+    assert_eq!(version.image_coverage.query(2), None);
+    assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string()));
+    assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+    assert_eq!(version.image_coverage.query(5), None);
+}
+
+/// Cover edge cases where layers begin or end on the same key
+#[test]
+fn test_key_collision() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+
+    map.insert(
+        LayerKey {
+            key: 3..5,
+            lsn: 100..110,
+            is_image: true,
+        },
+        "Layer 10".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 5..8,
+            lsn: 100..110,
+            is_image: true,
+        },
+        "Layer 11".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 3..4,
+            lsn: 200..210,
+            is_image: true,
+        },
+        "Layer 20".to_string(),
+    );
+
+    // Check after layer 11
+    let version = map.get_version(105).unwrap();
+    assert_eq!(version.image_coverage.query(2), None);
+    assert_eq!(
+        version.image_coverage.query(3),
+        Some("Layer 10".to_string())
+    );
+    assert_eq!(
+        version.image_coverage.query(5),
+        Some("Layer 11".to_string())
+    );
+    assert_eq!(
+        version.image_coverage.query(7),
+        Some("Layer 11".to_string())
+    );
+    assert_eq!(version.image_coverage.query(8), None);
+
+    // Check after layer 20
+    let version = map.get_version(205).unwrap();
+    assert_eq!(version.image_coverage.query(2), None);
+    assert_eq!(
+        version.image_coverage.query(3),
+        Some("Layer 20".to_string())
+    );
+    assert_eq!(
+        version.image_coverage.query(5),
+        Some("Layer 11".to_string())
+    );
+    assert_eq!(
+        version.image_coverage.query(7),
+        Some("Layer 11".to_string())
+    );
+    assert_eq!(version.image_coverage.query(8), None);
+}
+
+/// Test when rectangles have nontrivial height and possibly overlap
+#[test]
+fn test_persistent_overlapping() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+
+    // Add 3 key-disjoint layers with varying LSN ranges
+    map.insert(
+        LayerKey {
+            key: 1..2,
+            lsn: 100..200,
+            is_image: true,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 4..5,
+            lsn: 110..200,
+            is_image: true,
+        },
+        "Layer 2".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 7..8,
+            lsn: 120..300,
+            is_image: true,
+        },
+        "Layer 3".to_string(),
+    );
+
+    // Add wide and short layer
+    map.insert(
+        LayerKey {
+            key: 0..9,
+            lsn: 130..199,
+            is_image: true,
+        },
+        "Layer 4".to_string(),
+    );
+
+    // Add wide layer taller than some
+    map.insert(
+        LayerKey {
+            key: 0..9,
+            lsn: 140..201,
+            is_image: true,
+        },
+        "Layer 5".to_string(),
+    );
+
+    // Add wide layer taller than all
+    map.insert(
+        LayerKey {
+            key: 0..9,
+            lsn: 150..301,
+            is_image: true,
+        },
+        "Layer 6".to_string(),
+    );
+
+    // After layer 4 insertion
+    let version = map.get_version(135).unwrap();
+    assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string()));
+    assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string()));
+    assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string()));
+    assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+    assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string()));
+    assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string()));
+    assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string()));
+
+    // After layer 5 insertion
+    let version = map.get_version(145).unwrap();
+    assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string()));
+    assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string()));
+    assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string()));
+    assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string()));
+    assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string()));
+    assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string()));
+    assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string()));
+
+    // After layer 6 insertion
+    let version = map.get_version(155).unwrap();
+    assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string()));
+    assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string()));
+}
+
+/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack
+/// of support for retroactive insertion by rebuilding the map since the
+/// change.
+///
+/// Why is this needed? We most often insert new layers with newer LSNs,
+/// but during compaction we create layers with non-latest LSN, and during
+/// GC we delete historic layers.
+///
+/// Even though rebuilding is an expensive (N log N) solution to the problem,
+/// it's not critical since we do something equally expensive just to decide
+/// whether or not to create new image layers.
+/// TODO It's not expensive but it's not great to hold a layer map write lock
+///      for that long.
+///
+/// If this becomes an actual bottleneck, one solution would be to build a
+/// segment tree that holds PersistentLayerMaps. Though this would mean that
+/// we take an additional log(N) performance hit for queries, which will probably
+/// still be more critical.
+///
+/// See this for more on persistent and retroactive techniques:
+/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
+pub struct BufferedHistoricLayerCoverage<Value> {
+    /// A persistent layer map that we rebuild when we need to retroactively update
+    historic_coverage: HistoricLayerCoverage<Value>,
+
+    /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds.
+    buffer: BTreeMap<LayerKey, Option<Value>>,
+
+    /// All current layers. This is not used for search. Only to make rebuilds easier.
+    layers: BTreeMap<LayerKey, Value>,
+}
+
+impl<T: std::fmt::Debug> std::fmt::Debug for BufferedHistoricLayerCoverage<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RetroactiveLayerMap")
+            .field("buffer", &self.buffer)
+            .field("layers", &self.layers)
+            .finish()
+    }
+}
+
+impl<T: Clone> Default for BufferedHistoricLayerCoverage<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
+    pub fn new() -> Self {
+        Self {
+            historic_coverage: HistoricLayerCoverage::<Value>::new(),
+            buffer: BTreeMap::new(),
+            layers: BTreeMap::new(),
+        }
+    }
+
+    pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
+        self.buffer.insert(layer_key, Some(value));
+    }
+
+    pub fn remove(&mut self, layer_key: LayerKey) {
+        self.buffer.insert(layer_key, None);
+    }
+
+    pub fn rebuild(&mut self) {
+        // Find the first LSN that needs to be rebuilt
+        let rebuild_since: u64 = match self.buffer.iter().next() {
+            Some((LayerKey { lsn, .. }, _)) => lsn.start,
+            None => return, // No need to rebuild if buffer is empty
+        };
+
+        // Apply buffered updates to self.layers
+        let num_updates = self.buffer.len();
+        self.buffer.retain(|layer_key, layer| {
+            match layer {
+                Some(l) => {
+                    self.layers.insert(layer_key.clone(), l.clone());
+                }
+                None => {
+                    self.layers.remove(layer_key);
+                }
+            };
+            false
+        });
+
+        // Rebuild
+        let mut num_inserted = 0;
+        self.historic_coverage.trim(&rebuild_since);
+        for (layer_key, layer) in self.layers.range(
+            LayerKey {
+                lsn: rebuild_since..0,
+                key: 0..0,
+                is_image: false,
+            }..,
+        ) {
+            self.historic_coverage
+                .insert(layer_key.clone(), layer.clone());
+            num_inserted += 1;
+        }
+
+        // TODO maybe only warn if ratio is at least 10
+        info!(
+            "Rebuilt layer map. Did {} insertions to process a batch of {} updates.",
+            num_inserted, num_updates,
+        )
+    }
+
+    /// Iterate all the layers
+    pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
+        // NOTE we can actually perform this without rebuilding,
+        //      but it's not necessary for now.
+        if !self.buffer.is_empty() {
+            panic!("rebuild pls")
+        }
+
+        self.layers.values().cloned()
+    }
+
+    /// Return a reference to a queryable map, assuming all updates
+    /// have already been processed using self.rebuild()
+    pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
+        // NOTE we error here instead of implicitly rebuilding because
+        //      rebuilding is somewhat expensive.
+        // TODO maybe implicitly rebuild and log/sentry an error?
+        if !self.buffer.is_empty() {
+            anyhow::bail!("rebuild required")
+        }
+
+        Ok(&self.historic_coverage)
+    }
+}
+
+#[test]
+fn test_retroactive_regression_1() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+
+    map.insert(
+        LayerKey {
+            key: 0..21267647932558653966460912964485513215,
+            lsn: 23761336..23761457,
+            is_image: false,
+        },
+        "sdfsdfs".to_string(),
+    );
+
+    map.rebuild();
+
+    let version = map.get().unwrap().get_version(23761457).unwrap();
+    assert_eq!(
+        version.delta_coverage.query(100),
+        Some("sdfsdfs".to_string())
+    );
+}
+
+#[test]
+fn test_retroactive_simple() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+
+    // Append some images in increasing LSN order
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 100..101,
+            is_image: true,
+        },
+        "Image 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 3..9,
+            lsn: 110..111,
+            is_image: true,
+        },
+        "Image 2".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 4..6,
+            lsn: 120..121,
+            is_image: true,
+        },
+        "Image 3".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 8..9,
+            lsn: 120..121,
+            is_image: true,
+        },
+        "Image 4".to_string(),
+    );
+
+    // Add a delta layer out of order
+    map.insert(
+        LayerKey {
+            key: 2..5,
+            lsn: 105..106,
+            is_image: true,
+        },
+        "Delta 1".to_string(),
+    );
+
+    // Rebuild so we can start querying
+    map.rebuild();
+
+    // Query key 4
+    let version = map.get().unwrap().get_version(90);
+    assert!(version.is_none());
+    let version = map.get().unwrap().get_version(102).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string()));
+    let version = map.get().unwrap().get_version(107).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string()));
+    let version = map.get().unwrap().get_version(115).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string()));
+    let version = map.get().unwrap().get_version(125).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string()));
+
+    // Remove Image 3
+    map.remove(LayerKey {
+        key: 4..6,
+        lsn: 120..121,
+        is_image: true,
+    });
+    map.rebuild();
+
+    // Check deletion worked
+    let version = map.get().unwrap().get_version(125).unwrap();
+    assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string()));
+    assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
+}
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -0,0 +1,229 @@
+use std::ops::Range;
+
+use im::OrdMap;
+use rpds::RedBlackTreeMapSync;
+
+/// Data structure that can efficiently:
+/// - find the latest layer by lsn.end at a given key
+/// - iterate the latest layers in a key range
+/// - insert layers in non-decreasing lsn.start order
+///
+/// The struct is parameterized over Value for easier
+/// testing, but in practice it's some sort of layer.
+pub struct LayerCoverage<Value> {
+    /// For every change in coverage (as we sweep the key space)
+    /// we store (lsn.end, value).
+    ///
+    /// We use an immutable/persistent tree so that we can keep historic
+    /// versions of this coverage without cloning the whole thing and
+    /// incurring quadratic memory cost. See HistoricLayerCoverage.
+    ///
+    /// We use the Sync version of the map because we want Self to
+    /// be Sync. Using nonsync might be faster, if we can work with
+    /// that.
+    nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
+    im_nodes: OrdMap<i128, Option<(u64, Value)>>,
+}
+
+impl<T: Clone> Default for LayerCoverage<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<Value: Clone> LayerCoverage<Value> {
+    pub fn new() -> Self {
+        Self {
+            nodes: RedBlackTreeMapSync::default(),
+            im_nodes: OrdMap::default(),
+        }
+    }
+
+    /// Helper function to subdivide the key range without changing any values
+    ///
+    /// Complexity: O(log N)
+    fn add_node(&mut self, key: i128) {
+        let value = match self.nodes.range(..=key).last() {
+            Some((_, Some(v))) => Some(v.clone()),
+            Some((_, None)) => None,
+            None => None,
+        };
+        self.nodes.insert_mut(key, value);
+
+        let im_value = match self.im_nodes.range(..=key).last() {
+            Some((_, Some(v))) => Some(v.clone()),
+            Some((_, None)) => None,
+            None => None,
+        };
+        self.im_nodes.remove(&key);
+        self.im_nodes.insert(key, im_value);
+    }
+
+    /// Insert a layer.
+    ///
+    /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation.
+    pub fn insert(&mut self, key: Range<i128>, lsn: Range<u64>, value: Value) {
+        // Add nodes at endpoints
+        //
+        // NOTE The order of lines is important. We add nodes at the start
+        // and end of the key range **before updating any nodes** in order
+        // to pin down the current coverage outside of the relevant key range.
+        // Only the coverage inside the layer's key range should change.
+        self.add_node(key.start);
+        self.add_node(key.end);
+
+        // Raise the height where necessary
+        //
+        // NOTE This loop is worst case O(N), but amortized O(log N) in the special
+        // case when rectangles have no height. In practice I don't think we'll see
+        // the kind of layer intersections needed to trigger O(N) behavior. The worst
+        // case is N/2 horizontal layers overlapped with N/2 vertical layers in a
+        // grid pattern.
+        let mut to_update = Vec::new();
+        let mut to_remove = Vec::new();
+        let mut prev_covered = false;
+        for (k, node) in self.nodes.range(key.clone()) {
+            let needs_cover = match node {
+                None => true,
+                Some((h, _)) => h < &lsn.end,
+            };
+            if needs_cover {
+                match prev_covered {
+                    true => to_remove.push(*k),
+                    false => to_update.push(*k),
+                }
+            }
+            prev_covered = needs_cover;
+        }
+        if !prev_covered {
+            to_remove.push(key.end);
+        }
+        for k in to_update {
+            self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
+        }
+        for k in to_remove {
+            self.nodes.remove_mut(&k);
+        }
+
+
+        let mut to_update = Vec::new();
+        let mut to_remove = Vec::new();
+        let mut prev_covered = false;
+        for (k, node) in self.im_nodes.range(key.clone()) {
+            let needs_cover = match node {
+                None => true,
+                Some((h, _)) => h < &lsn.end,
+            };
+            if needs_cover {
+                match prev_covered {
+                    true => to_remove.push(*k),
+                    false => to_update.push(*k),
+                }
+            }
+            prev_covered = needs_cover;
+        }
+        if !prev_covered {
+            to_remove.push(key.end);
+        }
+        for k in to_update {
+            self.im_nodes.remove(&k);
+            self.im_nodes.insert(k, Some((lsn.end, value.clone())));
+        }
+        for k in to_remove {
+            self.im_nodes.remove(&k);
+        }
+
+    }
+
+    fn get_key_1(&self, key: i128) -> Option<u64> {
+        self.im_nodes
+            .get_prev(&key)?
+            .1
+            .as_ref()
+            .map(|(k, _)| k.clone())
+    }
+    fn get_key_2(&self, key: i128) -> Option<u64> {
+        self.im_nodes
+            .range(..=key)
+            .rev()
+            .next()?
+            .1
+            .as_ref()
+            .map(|(k, _)| k.clone())
+    }
+
+    /// Get the latest (by lsn.end) layer at a given key
+    ///
+    /// Complexity: O(log N)
+    pub fn query(&self, key: i128) -> Option<Value> {
+
+        let k1 = self.get_key_1(key);
+        let k2 = self.get_key_2(key);
+        assert_eq!(k1, k2);
+
+
+        // self.im_nodes
+        //     .get_prev(&key)?
+        //     .1
+        //     .as_ref()
+        //     .map(|(_, v)| v.clone())
+
+        self.im_nodes
+            .range(..=key)
+            .rev()
+            .next()?
+            .1
+            .as_ref()
+            .map(|(_, v)| v.clone())
+
+        // self.nodes
+        //     .range(..=key)
+        //     .rev()
+        //     .next()?
+        //     .1
+        //     .as_ref()
+        //     .map(|(_, v)| v.clone())
+    }
+
+    /// Iterate the changes in layer coverage in a given range. You will likely
+    /// want to start with self.query(key.start), and then follow up with self.range
+    ///
+    /// Complexity: O(log N + result_size)
+    pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<Value>)> {
+        self.nodes
+            .range(key)
+            .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
+    }
+
+    /// O(1) clone
+    pub fn clone(&self) -> Self {
+        Self {
+            nodes: self.nodes.clone(),
+            im_nodes: self.im_nodes.clone(),
+        }
+    }
+}
+
+/// Image and delta coverage at a specific LSN.
+pub struct LayerCoverageTuple<Value> {
+    pub image_coverage: LayerCoverage<Value>,
+    pub delta_coverage: LayerCoverage<Value>,
+}
+
+impl<T: Clone> Default for LayerCoverageTuple<T> {
+    fn default() -> Self {
+        Self {
+            image_coverage: LayerCoverage::default(),
+            delta_coverage: LayerCoverage::default(),
+        }
+    }
+}
+
+impl<Value: Clone> LayerCoverageTuple<Value> {
+    pub fn clone(&self) -> Self {
+        Self {
+            image_coverage: self.image_coverage.clone(),
+            delta_coverage: self.delta_coverage.clone(),
+        }
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1064,7 +1064,7 @@ mod tests {
    // Test scheduling
    #[test]
    fn upload_scheduling() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("upload_scheduling")?;
        let timeline_path = harness.timeline_path(&TIMELINE_ID);
        std::fs::create_dir_all(&timeline_path)?;

--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -23,7 +23,13 @@ use tracing::*;
 pub struct ModelInputs {
    updates: Vec<Update>,
    retention_period: u64,
+
+    /// Relevant lsns per timeline.
+    ///
+    /// This field is not required for deserialization purposes, which is mostly used in tests. The
+    /// LSNs explain the outcome (updates) but are not needed in size calculation.
    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
+    #[serde(default)]
    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
 }

@@ -32,6 +38,8 @@ pub struct ModelInputs {
 #[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    ancestor_lsn: Lsn,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    last_record: Lsn,
    #[serde_as(as = "serde_with::DisplayFromStr")]
@@ -178,19 +186,20 @@ pub(super) async fn gather_inputs(
    // our advantage with `?` error handling.
    let mut joinset = tokio::task::JoinSet::new();

-    let timelines = tenant
+    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
+    tenant
        .refresh_gc_info()
        .await
        .context("Failed to refresh gc_info before gathering inputs")?;

+    let timelines = tenant.list_timelines();
+
    if timelines.is_empty() {
-        // All timelines are below tenant's gc_horizon; alternative would be to use
-        // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
-        // missing GcInfo::retain_lsns or having obsolete values for cutoff's.
+        // perhaps the tenant has just been created, and as such doesn't have any data yet
        return Ok(ModelInputs {
            updates: vec![],
            retention_period: 0,
-            timeline_inputs: HashMap::new(),
+            timeline_inputs: HashMap::default(),
        });
    }

@@ -201,13 +210,25 @@ pub(super) async fn gather_inputs(

    let mut updates = Vec::new();

-    // record the per timline values used to determine `retention_period`
+    // record the per timeline values useful to debug the model inputs, also used to track
+    // ancestor_lsn without keeping a hold of Timeline
    let mut timeline_inputs = HashMap::with_capacity(timelines.len());

    // used to determine the `retention_period` for the size model
    let mut max_cutoff_distance = None;

+    // mapping from (TimelineId, Lsn) => if this branch point has been handled already via
+    // GcInfo::retain_lsns or if it needs to have its logical_size calculated.
+    let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new();
+
    for timeline in timelines {
+        if !timeline.is_active() {
+            anyhow::bail!(
+                "timeline {} is not active, cannot calculate tenant_size now",
+                timeline.timeline_id
+            );
+        }
+
        let last_record_lsn = timeline.get_last_record_lsn();

        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
@@ -273,13 +294,30 @@ pub(super) async fn gather_inputs(

        // all timelines branch from something, because it might be impossible to pinpoint
        // which is the tenant_size_model's "default" branch.
+
+        let ancestor_lsn = timeline.get_ancestor_lsn();
+
        updates.push(Update {
-            lsn: timeline.get_ancestor_lsn(),
+            lsn: ancestor_lsn,
            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
            timeline_id: timeline.timeline_id,
        });

+        if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() {
+            // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches
+            // which are over gc_horizon. for example, a "main" branch which never received any
+            // updates apart from initdb not have branch points recorded.
+            referenced_branch_froms
+                .entry((parent_timeline_id, timeline.get_ancestor_lsn()))
+                .or_default();
+        }
+
        for (lsn, _kind) in &interesting_lsns {
+            // mark this visited so don't need to re-process this parent
+            *referenced_branch_froms
+                .entry((timeline.timeline_id, *lsn))
+                .or_default() = true;
+
            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
                updates.push(Update {
                    lsn: *lsn,
@@ -295,22 +333,10 @@ pub(super) async fn gather_inputs(
            }
        }

-        // all timelines also have an end point if they have made any progress
-        if last_record_lsn > timeline.get_ancestor_lsn()
-            && !interesting_lsns
-                .iter()
-                .any(|(lsn, _)| lsn == &last_record_lsn)
-        {
-            updates.push(Update {
-                lsn: last_record_lsn,
-                command: Command::EndOfBranch,
-                timeline_id: timeline.timeline_id,
-            });
-        }
-
        timeline_inputs.insert(
            timeline.timeline_id,
            TimelineInputs {
+                ancestor_lsn,
                last_record: last_record_lsn,
                // this is not used above, because it might not have updated recently enough
                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
@@ -321,6 +347,80 @@ pub(super) async fn gather_inputs(
        );
    }

+    // iterate over discovered branch points and make sure we are getting logical sizes at those
+    // points.
+    for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() {
+        if *handled {
+            continue;
+        }
+
+        let timeline_id = *timeline_id;
+        let lsn = *lsn;
+
+        match timeline_inputs.get(&timeline_id) {
+            Some(inputs) if inputs.ancestor_lsn == lsn => {
+                // we don't need an update at this branch point which is also point where
+                // timeline_id branch was branched from.
+                continue;
+            }
+            Some(_) => {}
+            None => {
+                // we should have this because we have iterated through all of the timelines
+                anyhow::bail!("missing timeline_input for {timeline_id}")
+            }
+        }
+
+        if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) {
+            updates.push(Update {
+                lsn,
+                timeline_id,
+                command: Command::Update(*size),
+            });
+
+            needed_cache.insert((timeline_id, lsn));
+        } else {
+            let timeline = tenant
+                .get_timeline(timeline_id, false)
+                .context("find referenced ancestor timeline")?;
+            let parallel_size_calcs = Arc::clone(limit);
+            joinset.spawn(calculate_logical_size(
+                parallel_size_calcs,
+                timeline.clone(),
+                lsn,
+            ));
+
+            if let Some(parent_id) = timeline.get_ancestor_timeline_id() {
+                // we should not find new ones because we iterated tenants all timelines
+                anyhow::ensure!(
+                    timeline_inputs.contains_key(&parent_id),
+                    "discovered new timeline {parent_id} (parent of {timeline_id})"
+                );
+            }
+        };
+    }
+
+    // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch
+    // point. this is needed by the model.
+    for (timeline_id, inputs) in timeline_inputs.iter() {
+        let lsn = inputs.last_record;
+
+        if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) {
+            // this means that the (timeline_id, last_record_lsn) represents a branch point
+            // we do not want to add EndOfBranch updates for these points because it doesn't fit
+            // into the current tenant_size_model.
+            continue;
+        }
+
+        if lsn > inputs.ancestor_lsn {
+            // all timelines also have an end point if they have made any progress
+            updates.push(Update {
+                lsn,
+                command: Command::EndOfBranch,
+                timeline_id: *timeline_id,
+            });
+        }
+    }
+
    let mut have_any_error = false;

    while let Some(res) = joinset.join_next().await {
@@ -379,6 +479,7 @@ pub(super) async fn gather_inputs(
    // handled by the variant order in `Command`.
    //
    updates.sort_unstable();
+
    // And another sort to handle Command::BranchFrom ordering
    // in case when there are multiple branches at the same LSN.
    let sorted_updates = sort_updates_in_tree_order(updates)?;
@@ -413,10 +514,10 @@ impl ModelInputs {
            let Lsn(now) = *lsn;
            match op {
                Command::Update(sz) => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?;
                }
                Command::EndOfBranch => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, None);
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, None)?;
                }
                Command::BranchFrom(parent) => {
                    // This branch command may fail if it cannot find a parent to branch from.
@@ -425,7 +526,7 @@ impl ModelInputs {
            }
        }

-        Ok(storage.calculate(self.retention_period).total_children())
+        Ok(storage.calculate(self.retention_period)?.total_children())
    }
 }

@@ -574,7 +675,10 @@ fn updates_sort() {
 fn verify_size_for_multiple_branches() {
    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
    // it has the stable lsn's
-    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
+    //
+    // timelineinputs have been left out, because those explain the inputs, but don't participate
+    // in further size calculations.
+    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#;

    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -196,3 +196,50 @@ pub fn downcast_remote_layer(
        None
    }
 }
+
+impl std::fmt::Debug for dyn Layer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Layer")
+            .field("short_id", &self.short_id())
+            .finish()
+    }
+}
+
+/// Holds metadata about a layer without any content. Used mostly for testing.
+pub struct LayerDescriptor {
+    pub key: Range<Key>,
+    pub lsn: Range<Lsn>,
+    pub is_incremental: bool,
+    pub short_id: String,
+}
+
+impl Layer for LayerDescriptor {
+    fn get_key_range(&self) -> Range<Key> {
+        self.key.clone()
+    }
+
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn.clone()
+    }
+
+    fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_data: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        todo!("This method shouldn't be part of the Layer trait")
+    }
+
+    fn short_id(&self) -> String {
+        self.short_id.clone()
+    }
+
+    fn dump(&self, _verbose: bool) -> Result<()> {
+        todo!()
+    }
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -729,16 +729,24 @@ impl Timeline {
        Ok(())
    }

+    pub fn activate(self: &Arc<Self>) {
+        self.set_state(TimelineState::Active);
+        self.launch_wal_receiver();
+    }
+
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+            }
+            (st, TimelineState::Loading) => {
+                error!("ignoring transition from {st:?} into Loading state");
            }
            (TimelineState::Broken, _) => {
                error!("Ignoring state update {new_state:?} for broken tenant");
            }
            (TimelineState::Stopping, TimelineState::Active) => {
-                debug!("Not activating a Stopping timeline");
+                error!("Not activating a Stopping timeline");
            }
            (_, new_state) => {
                self.state.send_replace(new_state);
@@ -812,7 +820,7 @@ impl Timeline {
        pg_version: u32,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
-        let (state, _) = watch::channel(TimelineState::Suspended);
+        let (state, _) = watch::channel(TimelineState::Loading);

        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -970,6 +978,7 @@ impl Timeline {
    ///
    pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut layers = self.layers.write().unwrap();
+        let mut updates = layers.batch_update();
        let mut num_layers = 0;

        let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1010,7 +1019,7 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                layers.insert_historic(Arc::new(layer));
+                updates.insert_historic(Arc::new(layer));
                num_layers += 1;
            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
                // Create a DeltaLayer struct for each delta file.
@@ -1041,7 +1050,7 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                layers.insert_historic(Arc::new(layer));
+                updates.insert_historic(Arc::new(layer));
                num_layers += 1;
            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                // ignore these
@@ -1067,6 +1076,7 @@ impl Timeline {
            }
        }

+        updates.flush();
        layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);

        info!(
@@ -1091,6 +1101,11 @@ impl Timeline {
        // Are we missing some files that are present in remote storage?
        // Create RemoteLayer instances for them.
        let mut local_only_layers = local_layers;
+
+        // We're holding a layer map lock for a while but this
+        // method is only called during init so it's fine.
+        let mut layer_map = self.layers.write().unwrap();
+        let mut updates = layer_map.batch_update();
        for remote_layer_name in &index_part.timeline_layers {
            let local_layer = local_only_layers.remove(remote_layer_name);

@@ -1129,7 +1144,7 @@ impl Timeline {
                            anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                        } else {
                            self.metrics.resident_physical_size_gauge.sub(local_size);
-                            self.layers.write().unwrap().remove_historic(local_layer);
+                            updates.remove_historic(local_layer);
                            // fall-through to adding the remote layer
                        }
                    } else {
@@ -1171,7 +1186,7 @@ impl Timeline {
                    );
                    let remote_layer = Arc::new(remote_layer);

-                    self.layers.write().unwrap().insert_historic(remote_layer);
+                    updates.insert_historic(remote_layer);
                }
                LayerFileName::Delta(deltafilename) => {
                    // Create a RemoteLayer for the delta file.
@@ -1194,13 +1209,14 @@ impl Timeline {
                        &remote_layer_metadata,
                    );
                    let remote_layer = Arc::new(remote_layer);
-                    self.layers.write().unwrap().insert_historic(remote_layer);
+                    updates.insert_historic(remote_layer);
                }
                #[cfg(test)]
                LayerFileName::Test(_) => unreachable!(),
            }
        }

+        updates.flush();
        Ok(local_only_layers)
    }

@@ -1392,7 +1408,7 @@ impl Timeline {
                            TimelineState::Active => continue,
                            TimelineState::Broken
                            | TimelineState::Stopping
-                            | TimelineState::Suspended => {
+                            | TimelineState::Loading => {
                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
                            }
                        }
@@ -2099,10 +2115,11 @@ impl Timeline {
        ])?;

        // Add it to the layer map
-        {
-            let mut layers = self.layers.write().unwrap();
-            layers.insert_historic(Arc::new(new_delta));
-        }
+        self.layers
+            .write()
+            .unwrap()
+            .batch_update()
+            .insert_historic(Arc::new(new_delta));

        // update the timeline's physical size
        let sz = new_delta_path.metadata()?.len();
@@ -2166,13 +2183,15 @@ impl Timeline {
                // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
                // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                if img_lsn < lsn {
-                    let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
+                    let threshold = self.get_image_creation_threshold();
+                    let num_deltas =
+                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;

                    debug!(
                        "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
                        img_range.start, img_range.end, num_deltas, img_lsn, lsn
                    );
-                    if num_deltas >= self.get_image_creation_threshold() {
+                    if num_deltas >= threshold {
                        return Ok(true);
                    }
                }
@@ -2267,21 +2286,23 @@ impl Timeline {
        let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());

        let mut layers = self.layers.write().unwrap();
+        let mut updates = layers.batch_update();
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
        for l in image_layers {
            let path = l.filename();
            let metadata = timeline_path
                .join(path.file_name())
                .metadata()
-                .context("reading metadata of layer file {path}")?;
+                .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?;

            layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));

            self.metrics
                .resident_physical_size_gauge
                .add(metadata.len());
-            layers.insert_historic(Arc::new(l));
+            updates.insert_historic(Arc::new(l));
        }
+        updates.flush();
        drop(layers);
        timer.stop_and_record();

@@ -2577,6 +2598,7 @@ impl Timeline {
        }

        let mut layers = self.layers.write().unwrap();
+        let mut updates = layers.batch_update();
        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
        for l in new_layers {
            let new_delta_path = l.path();
@@ -2597,7 +2619,7 @@ impl Timeline {

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
            let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
-            layers.insert_historic(x);
+            updates.insert_historic(x);
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
@@ -2611,8 +2633,9 @@ impl Timeline {
            }
            layer_names_to_delete.push(l.filename());
            l.delete()?;
-            layers.remove_historic(l);
+            updates.remove_historic(l);
        }
+        updates.flush();
        drop(layers);

        // Also schedule the deletions in remote storage
@@ -2812,6 +2835,7 @@ impl Timeline {
        // 3. it doesn't need to be retained for 'retain_lsns';
        // 4. newer on-disk image layers cover the layer's whole key range
        //
+        // TODO holding a write lock is too agressive and avoidable
        let mut layers = self.layers.write().unwrap();
        'outer: for l in layers.iter_historic_layers() {
            result.layers_total += 1;
@@ -2843,6 +2867,8 @@ impl Timeline {
            // might be referenced by child branches forever.
            // We can track this in child timeline GC and delete parent layers when
            // they are no longer needed. This might be complicated with long inheritance chains.
+            //
+            // TODO Vec is not a great choice for `retain_lsns`
            for retain_lsn in &retain_lsns {
                // start_lsn is inclusive
                if &l.get_lsn_range().start <= retain_lsn {
@@ -2896,6 +2922,7 @@ impl Timeline {
            layers_to_remove.push(Arc::clone(&l));
        }

+        let mut updates = layers.batch_update();
        if !layers_to_remove.is_empty() {
            // Persist the new GC cutoff value in the metadata file, before
            // we actually remove anything.
@@ -2913,7 +2940,13 @@ impl Timeline {
                }
                layer_names_to_delete.push(doomed_layer.filename());
                doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning?
-                layers.remove_historic(doomed_layer);
+
+                // TODO Removing from the bottom of the layer map is expensive.
+                //      Maybe instead discard all layer map historic versions that
+                //      won't be needed for page reconstruction for this timeline,
+                //      and mark what we can't delete yet as deleted from the layer
+                //      map index without actually rebuilding the index.
+                updates.remove_historic(doomed_layer);
                result.layers_removed += 1;
            }

@@ -2925,6 +2958,7 @@ impl Timeline {
                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
            }
        }
+        updates.flush();

        info!(
            "GC completed removing {} layers, cutoff {}",
@@ -3081,11 +3115,13 @@ impl Timeline {
                    // Delta- or ImageLayer in the layer map.
                    let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
                    let mut layers = self_clone.layers.write().unwrap();
+                    let mut updates = layers.batch_update();
                    {
                        let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        layers.remove_historic(l);
+                        updates.remove_historic(l);
                    }
-                    layers.insert_historic(new_layer);
+                    updates.insert_historic(new_layer);
+                    updates.flush();
                    drop(layers);

                    // Now that we've inserted the download into the layer map,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -525,13 +525,12 @@ mod tests {
        })
    }

-    fn test_files<OF, FD>(test_name: &str, openfunc: OF) -> Result<(), Error>
+    fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
    where
        FD: Read + Write + Seek + FileExt,
        OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
    {
-        let temp_repo_dir = tempfile::tempdir()?;
-        let testdir = temp_repo_dir.path().join(test_name);
+        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
        std::fs::create_dir_all(&testdir)?;

        let path_a = testdir.join("file_a");
@@ -633,8 +632,7 @@ mod tests {
        const THREADS: usize = 100;
        const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];

-        let temp_repo_dir = tempfile::tempdir()?;
-        let testdir = temp_repo_dir.path().join("vfile_concurrency");
+        let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
        std::fs::create_dir_all(&testdir)?;

        // Create a test file.
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1146,8 +1146,7 @@ mod tests {

    #[tokio::test]
    async fn test_relsize() -> Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_relsize")?.load().await;
        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
        let mut walingest = init_walingest_test(&tline).await?;

@@ -1324,8 +1323,7 @@ mod tests {
    // and then created it again within the same layer.
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_drop_extend")?.load().await;
        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
        let mut walingest = init_walingest_test(&tline).await?;

@@ -1378,8 +1376,7 @@ mod tests {
    // and then extended it again within the same layer.
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
        let mut walingest = init_walingest_test(&tline).await?;

@@ -1500,8 +1497,7 @@ mod tests {
    /// split into multiple 1 GB segments in Postgres.
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
-        let harness = TenantHarness::new()?;
-        let tenant = harness.load().await;
+        let tenant = TenantHarness::create("test_large_rel")?.load().await;
        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
        let mut walingest = init_walingest_test(&tline).await?;

--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -183,13 +183,23 @@ async fn connection_manager_loop_step(

            new_event = async {
                loop {
+                    if walreceiver_state.timeline.current_state() == TimelineState::Loading {
+                        warn!("wal connection manager should only be launched after timeline has become active");
+                    }
                    match timeline_state_updates.changed().await {
                        Ok(()) => {
                            let new_state = walreceiver_state.timeline.current_state();
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
+                                TimelineState::Broken | TimelineState::Stopping => {
+                                    info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
+                                    return ControlFlow::Break(());
+                                }
+                                TimelineState::Loading => {
+                                    warn!("timeline transitioned back to Loading state, that should not happen");
+                                    return ControlFlow::Continue(new_state);
+                                }
                            }
                        }
                        Err(_sender_dropped_error) => return ControlFlow::Break(()),
@@ -197,7 +207,7 @@ async fn connection_manager_loop_step(
                }
            } => match new_event {
                ControlFlow::Continue(new_state) => {
-                    info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
+                    info!("observed timeline state change, new state is {new_state:?}");
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
@@ -289,7 +299,9 @@ async fn subscribe_for_timeline_updates(
                return resp.into_inner();
            }
            Err(e) => {
-                warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
+                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
+                // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
+                info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
                continue;
            }
        }
@@ -846,7 +858,7 @@ mod tests {

    #[tokio::test]
    async fn no_connection_no_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("no_connection_no_candidate")?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

@@ -879,7 +891,7 @@ mod tests {

    #[tokio::test]
    async fn connection_no_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("connection_no_candidate")?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

@@ -942,7 +954,7 @@ mod tests {

    #[tokio::test]
    async fn no_connection_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("no_connection_candidate")?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

@@ -1001,7 +1013,7 @@ mod tests {

    #[tokio::test]
    async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

@@ -1041,7 +1053,7 @@ mod tests {

    #[tokio::test]
    async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();
@@ -1105,7 +1117,7 @@ mod tests {

    #[tokio::test]
    async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();
@@ -1166,7 +1178,7 @@ mod tests {

    #[tokio::test]
    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::new()?;
+        let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let new_lsn = Lsn(100_100).align();
@@ -1232,7 +1244,7 @@ mod tests {

    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";

-    async fn dummy_state(harness: &TenantHarness) -> WalreceiverState {
+    async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
        WalreceiverState {
            id: TenantTimelineId {
                tenant_id: harness.tenant_id,
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -77,9 +77,13 @@ pub async fn handle_walreceiver_connection(
                info!("DB connection stream finished: {expected_error}");
                return Ok(());
            }
-            Err(elapsed) => anyhow::bail!(
-                "Timed out while waiting {elapsed} for walreceiver connection to open"
-            ),
+            Err(_) => {
+                // Timing out to connect to a safekeeper node could happen long time, due to
+                // many reasons that pageserver cannot control.
+                // Do not produce an error, but make it visible, that timeouts happen by logging the `event.
+                info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open");
+                return Ok(());
+            }
        }
    };

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1206,6 +1206,9 @@ class PageserverHttpClient(requests.Session):
        return res_json

    def tenant_size(self, tenant_id: TenantId) -> int:
+        return self.tenant_size_and_modelinputs(tenant_id)[0]
+
+    def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
        """
        Returns the tenant size, together with the model inputs as the second tuple item.
        """
@@ -1216,9 +1219,9 @@ class PageserverHttpClient(requests.Session):
        assert TenantId(res["id"]) == tenant_id
        size = res["size"]
        assert type(size) == int
-        # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model
-        # there are no tests for those right now.
-        return size
+        inputs = res["inputs"]
+        assert type(inputs) is dict
+        return (size, inputs)

    def timeline_list(
        self,
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,5 +1,6 @@
-from typing import List, Tuple
+from typing import Any, List, Tuple

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.types import Lsn
@@ -9,28 +10,247 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
    env = neon_simple_env
    (tenant_id, _) = env.neon_cli.create_tenant()
    http_client = env.pageserver.http_client()
-    size = http_client.tenant_size(tenant_id)
+    initial_size = http_client.tenant_size(tenant_id)

-    # we should never have zero, because there should be the initdb however
-    # this is questionable if we should have anything in this case, as the
-    # gc_cutoff is negative
-    assert (
-        size == 0
-    ), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon"
+    # we should never have zero, because there should be the initdb "changes"
+    assert initial_size > 0, "initial implementation returns ~initdb tenant_size"

-    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
+    main_branch_name = "main"
+
+    with env.postgres.create_start(
+        main_branch_name,
+        tenant_id=tenant_id,
+        config_lines=["autovacuum=off", "checkpoint_timeout=10min"],
+    ) as pg:
        with pg.cursor() as cur:
            cur.execute("SELECT 1")
            row = cur.fetchone()
            assert row is not None
            assert row[0] == 1
        size = http_client.tenant_size(tenant_id)
-        assert size == 0, "starting idle compute should not change the tenant size"
+        # we've disabled the autovacuum and checkpoint
+        # so background processes should not change the size.
+        # If this test will flake we should probably loosen the check
+        assert size == initial_size, "starting idle compute should not change the tenant size"

    # the size should be the same, until we increase the size over the
    # gc_horizon
-    size = http_client.tenant_size(tenant_id)
-    assert size == 0, "tenant_size should not be affected by shutdown of compute"
+    size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
+    assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
+
+    expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
+    actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"]))  # type: ignore
+    assert actual_commands == expected_commands
+
+
+def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
+    """
+    Issue found in production. Because the ancestor branch was under
+    gc_horizon, the branchpoint was "dangling" and the computation could not be
+    done.
+
+    Assuming gc_horizon = 50
+    root:    I      0---10------>20
+    branch:              |-------------------I---------->150
+                                   gc_horizon
+    """
+    env = neon_simple_env
+    (tenant_id, _) = env.neon_cli.create_tenant()
+    http_client = env.pageserver.http_client()
+
+    initial_size = http_client.tenant_size(tenant_id)
+
+    first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id)
+
+    with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg:
+        with pg.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+        wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id)
+
+    size_after_branching = http_client.tenant_size(tenant_id)
+    log.info(f"size_after_branching: {size_after_branching}")
+
+    assert size_after_branching > initial_size
+
+
+def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
+    """
+    More general version of test_branched_empty_timeline_size
+
+    Assuming gc_horizon = 50
+
+    root:  I 0------10
+    first: I        10
+    nth_0: I        10
+    nth_1: I        10
+    nth_n:          10------------I--------100
+    """
+    env = neon_simple_env
+    (tenant_id, _) = env.neon_cli.create_tenant()
+    http_client = env.pageserver.http_client()
+
+    initial_size = http_client.tenant_size(tenant_id)
+
+    first_branch_name = "first"
+    env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id)
+
+    size_after_branching = http_client.tenant_size(tenant_id)
+
+    # this might be flaky like test_get_tenant_size_with_multiple_branches
+    # https://github.com/neondatabase/neon/issues/2962
+    assert size_after_branching == initial_size
+
+    last_branch_name = first_branch_name
+    last_branch = None
+
+    for i in range(0, 4):
+        latest_branch_name = f"nth_{i}"
+        last_branch = env.neon_cli.create_branch(
+            latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id
+        )
+        last_branch_name = latest_branch_name
+
+        size_after_branching = http_client.tenant_size(tenant_id)
+        assert size_after_branching == initial_size
+
+    assert last_branch is not None
+
+    with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg:
+        with pg.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+        wait_for_last_flush_lsn(env, pg, tenant_id, last_branch)
+
+    size_after_writes = http_client.tenant_size(tenant_id)
+    assert size_after_writes > initial_size
+
+
+@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
+def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
+    """
+    gc_horizon = 15
+
+    main:          0--I-10------>20
+    branch:              |-------------------I---------->150
+                                   gc_horizon
+    """
+
+    env = neon_simple_env
+    gc_horizon = 20_000
+    (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
+    http_client = env.pageserver.http_client()
+
+    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
+        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
+        flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+
+    size_before_branching = http_client.tenant_size(tenant_id)
+
+    assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int
+
+    branch_id = env.neon_cli.create_branch(
+        "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
+    )
+
+    with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
+        wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
+
+    size_after = http_client.tenant_size(tenant_id)
+
+    assert size_before_branching < size_after
+
+
+@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
+def test_parent_within_horizon(neon_simple_env: NeonEnv):
+    """
+    gc_horizon = 5
+
+    main:          0----10----I->20
+    branch:              |-------------------I---------->150
+                                   gc_horizon
+    """
+
+    env = neon_simple_env
+    gc_horizon = 200_000
+    (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
+    http_client = env.pageserver.http_client()
+
+    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
+        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
+
+        flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)")
+
+        wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+
+    size_before_branching = http_client.tenant_size(tenant_id)
+
+    assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int
+
+    branch_id = env.neon_cli.create_branch(
+        "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
+    )
+
+    with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
+        wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
+
+    size_after = http_client.tenant_size(tenant_id)
+
+    assert size_before_branching < size_after
+
+
+@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
+def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
+    """
+    gc_horizon = small
+
+    main: 0--------10-----I>20
+    first:         |-----------------------------I>150
+    second:        |---------I>30
+    """
+
+    env = neon_simple_env
+    (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"})
+    http_client = env.pageserver.http_client()
+
+    initial_size = http_client.tenant_size(tenant_id)
+
+    first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id)
+    second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id)
+
+    ids = {"main": main_id, "first": first_id, "second": second_id}
+
+    latest_size = None
+
+    # gc is not expected to change the results
+
+    for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]:
+        with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
+            with pg.cursor() as cur:
+                cur.execute(
+                    f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)"
+                )
+            wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name])
+            size_now = http_client.tenant_size(tenant_id)
+            if latest_size is not None:
+                assert size_now > latest_size
+            else:
+                assert size_now > initial_size
+
+            latest_size = size_now


 def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/sql_regress/.gitignore
+++ b/test_runner/sql_regress/.gitignore
@@ -2,6 +2,7 @@
 /pg_regress

 # Generated subdirectories
+/tmp_check/
 /results/
 /log/

--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -31,7 +31,7 @@ memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
-num-traits = { version = "0.2", features = ["i128", "libm"] }
+num-traits = { version = "0.2", features = ["i128"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
Author	SHA1	Message	Date
Bojan Serafimov	5edb0ccfa7	Add assertion	2023-01-24 15:23:27 -05:00
Bojan Serafimov	2bc1324bed	Try im (benchmarks panic)	2023-01-24 14:36:22 -05:00
Sergey Melnikov	aabca55d7e	Migrate update version to management APIv2 (#3430 )	2023-01-24 17:18:16 +01:00
Kirill Bulatov	1c3636d848	Tone down walreceiver connection timeout errors (#3425 ) Closes https://github.com/neondatabase/neon/issues/3342	2023-01-24 18:03:33 +02:00
Kirill Bulatov	0c16ad8591	Tone down broker subscription errors	2023-01-24 17:23:33 +02:00
Christian Schwarz	0b673c12d7	timeline: don't transition Active=>Active during pageserver startup Before this patch, when `initialize_with_lock` was called via `timeline_init_and_sync`, we would transition the timeline like so: load_local_timeline/load_remote_timeline: timeline_init_and_sync Timeline::new () => Loading initialize_with_lock: set_state(Active) Loading => Active timeline.activate() Active => Active	2023-01-24 15:56:02 +01:00
Christian Schwarz	7a333cfb12	be noisy about unexpected Timeline state transitions	2023-01-24 15:56:02 +01:00
Christian Schwarz	f7ec33970a	add doc comment that outlines which tokio tasks walreceiver creates	2023-01-24 15:23:48 +01:00
Joonas Koivunen	98d0a0d242	fix(http): omit needless string allocs (#3421 ) Drive-by fix noticed while #3419.	2023-01-24 14:53:39 +02:00
Joonas Koivunen	f74080cbad	feat(http): support ?inputs_only=true for tenant_size (#3419 ) this makes debugging problematic cases in the future easier, as we can just request the model inputs, use them locally to reproduce the issue with the model.	2023-01-24 13:57:13 +02:00
Christian Schwarz	55c184fcd7	fix some anyhow::Context::context calls that should use with_context(format!(...)) Noticed this while combing through some production logs.	2023-01-24 12:22:33 +01:00
Kirill Bulatov	fd18692dfb	Output coloured pageserver logs for tty stdout	2023-01-24 09:58:08 +02:00
Alexey Kondratov	a4be54d21f	[compute_ctl] Stop updating roles on each compute start (#3391 ) I noticed that `compute_ctl` updates all roles on each start, search for rows like > - web_access:[FILTERED] -> update in the compute startup log. It happens since we had an adhoc hack for md5 hashes comparison, which doesn't work with scram hashes stored in the `pg_authid`. It doesn't really hurt, as nothing changes, but we just run >= 2 extra queries on each start, so fix it.	2023-01-23 17:46:22 +01:00
Christian Schwarz	6b6570b580	remove TimelineState::Suspended, introduce TimelineState::Loading The TimelineState::Suspsended was dubious to begin with. I suppose that the intention was that timelines could transition back and forth between Active and Suspended states. But practically, the code before this patch never did that. The transitions were: () ==Timeline::new==> Suspended ====> {Active,Broken,Stopping} One exception: Tenant::set_stopping() could transition timelines like so: !Broken ==Tenant::set_stopping()==> Suspended But Tenant itself cannot transition from stopping state to any other state. Thus, this patch removes TimelineState::Suspended and introduces a new state Loading. The aforementioned transitions change as follows: - () ==Timeline::new==> Suspended ====> {Active,Broken,Stopping} + () ==Timeline::new==> Loading ==*==> {Active,Broken,Stopping} - !Broken ==Tenant::set_stopping()==> Suspended + !Broken ==Tenant::set_stopping()==> Stopping Walreceiver's connection manager loop watches TimelineState to decide whether it should retry connecting, or exit. This patch changes the loop to exit when it observes the transition into Stopping state. Walreceiver isn't supposed to be started until the timeline transitions into Active state. So, this patch also adds some warn!() messages in case this happens anyways.	2023-01-23 17:22:49 +01:00
Joonas Koivunen	7704caa3ac	More tenant size fixes (#3410 ) Small changes, but hopefully this will help with the panic detected in staging, for which we cannot get the debugging information right now (end-of-branch before branch-point).	2023-01-23 17:12:51 +02:00
Shany Pozin	a44e5eda14	Adding pageserver3 to staging (#3403 )	2023-01-23 14:08:48 +01:00
Konstantin Knizhnik	5c865f46ba	Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354 )	2023-01-23 10:51:09 +02:00
bojanserafimov	a3d7ad2d52	Implement layer map using immutable BST (#2998 )	2023-01-20 16:10:12 -05:00
Anastasia Lubennikova	36f048d6b0	Fix tenant size orphans (#3377 ) Before only the timelines which have passed the `gc_horizon` were processed which failed with orphans at the tree_sort phase. Example input in added `test_branched_empty_timeline_size` test case. The PR changes iteration to happen through all timelines, and in addition to that, any learned branch points will be calculated as they would had been in the original implementation if the ancestor branch had been over the `gc_horizon`. This also changes how tenants where all timelines are below `gc_horizon` are handled. Previously tenant_size 0 was returned, but now they will have approximately `initdb_lsn` worth of tenant_size. The PR also adds several new tenant size tests that describe various corner cases of branching structure and `gc_horizon` setting. They are currently disabled to not consume time during CI. Co-authored-by: Joonas Koivunen <joonas@neon.tech> Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>	2023-01-20 20:21:36 +02:00
Joonas Koivunen	58fb6fe861	fix: dont stop pageserver if we fail to calculate synthetic size	2023-01-20 19:55:19 +02:00
Alexey Kondratov	20b1e26e74	[compute_ctl] Make role deletion spec processing idempotent (#3380 ) Previously, we were trying to re-assign owned objects of the already deleted role. This were causing a crash loop in the case when compute was restarted with a spec that includes delta operation for role deletion. To avoid such cases, check that role is still present before calling `reassign_owned_objects`. Resolves neondatabase/cloud#3553	2023-01-20 15:37:24 +01:00
Christian Schwarz	8ba1699937	Revert "Use actual temporary dir for pageserver unit tests" This reverts commit `826e89b9ce`. The problem with that commit was that it deletes the TempDir while there are still EphemeralFile instances open. At first I thought this could be fixed by simply adding Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None)) to TenantHarness::drop, but it turned out to be insufficient. So, reverting the commit until we find a proper solution. refs https://github.com/neondatabase/neon/issues/3385	2023-01-19 20:16:56 +01:00
bojanserafimov	a9bd05760f	Improve layer map docstrings (#3382 )	2023-01-19 10:29:15 -05:00