Write WAL bytes to data dir

Check bytes in segment intersection
Fix clap arg
2026-06-11 09:20:36 +00:00 · 2023-12-28 18:49:51 +00:00 · 2023-12-28 17:55:04 +00:00 · 2023-12-28 17:10:05 +00:00 · 2023-12-28 16:55:21 +00:00
181 changed files with 3859 additions and 9473 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +0,0 @@
-[profile.default]
-slow-timeout = "1m"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -105,11 +105,11 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run `ruff check` to ensure code format
-        run: poetry run ruff check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .

-      - name: Run `ruff format` to ensure code format
-        run: poetry run ruff format --check .
+      - name: Run black to ensure code format
+        run: poetry run black --diff --check .

      - name: Run mypy to check types
        run: poetry run mypy .
@@ -339,16 +339,16 @@ jobs:
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

-      - name: Run rust tests
+      - name: Run cargo test
        run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -358,7 +358,7 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

      - name: Install rust binaries
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -30,8 +30,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
 dependencies = [
 "cfg-if",
- "const-random",
- "getrandom 0.2.11",
 "once_cell",
 "version_check",
 "zerocopy",
@@ -52,12 +50,6 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"

-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -255,12 +247,6 @@ dependencies = [
 "syn 2.0.32",
 ]

-[[package]]
-name = "atomic"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
-
 [[package]]
 name = "atomic-polyfill"
 version = "1.0.2"
@@ -1025,17 +1011,17 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.31"
+version = "0.4.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
 dependencies = [
- "android-tzdata",
 "iana-time-zone",
 "js-sys",
+ "num-integer",
 "num-traits",
 "serde",
 "wasm-bindgen",
- "windows-targets 0.48.0",
+ "winapi",
 ]

 [[package]]
@@ -1134,20 +1120,6 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"

-[[package]]
-name = "combine"
-version = "4.6.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
-dependencies = [
- "bytes",
- "futures-core",
- "memchr",
- "pin-project-lite",
- "tokio",
- "tokio-util",
-]
-
 [[package]]
 name = "comfy-table"
 version = "6.1.4"
@@ -1189,7 +1161,6 @@ dependencies = [
 "flate2",
 "futures",
 "hyper",
- "nix 0.26.2",
 "notify",
 "num_cpus",
 "opentelemetry",
@@ -1200,7 +1171,6 @@ dependencies = [
 "rust-ini",
 "serde",
 "serde_json",
- "signal-hook",
 "tar",
 "tokio",
 "tokio-postgres",
@@ -2389,6 +2359,19 @@ dependencies = [
 "tokio-native-tls",
 ]

+[[package]]
+name = "hyper-tungstenite"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
+dependencies = [
+ "hyper",
+ "pin-project-lite",
+ "tokio",
+ "tokio-tungstenite",
+ "tungstenite",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.56"
@@ -2490,12 +2473,6 @@ dependencies = [
 "web-sys",
 ]

-[[package]]
-name = "integer-encoding"
-version = "3.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
-
 [[package]]
 name = "io-lifetimes"
 version = "1.0.11"
@@ -2859,19 +2836,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "num"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
-dependencies = [
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2883,15 +2847,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "num-complex"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2902,28 +2857,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "num-iter"
-version = "0.1.43"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
 version = "0.2.15"
@@ -3146,15 +3079,6 @@ dependencies = [
 "tokio-stream",
 ]

-[[package]]
-name = "ordered-float"
-version = "2.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3198,7 +3122,6 @@ name = "pagebench"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "camino",
 "clap",
 "futures",
 "hdrhistogram",
@@ -3211,7 +3134,6 @@ dependencies = [
 "serde",
 "serde_json",
 "tokio",
- "tokio-util",
 "tracing",
 "utils",
 "workspace_hack",
@@ -3415,35 +3337,6 @@ dependencies = [
 "windows-targets 0.48.0",
 ]

-[[package]]
-name = "parquet"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
-dependencies = [
- "ahash",
- "bytes",
- "chrono",
- "hashbrown 0.14.0",
- "num",
- "num-bigint",
- "paste",
- "seq-macro",
- "thrift",
- "twox-hash",
- "zstd",
-]
-
-[[package]]
-name = "parquet_derive"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
-dependencies = [
- "parquet",
- "proc-macro2",
- "quote",
- "syn 2.0.32",
-]
-
 [[package]]
 name = "password-hash"
 version = "0.5.0"
@@ -3867,8 +3760,6 @@ dependencies = [
 "base64 0.13.1",
 "bstr",
 "bytes",
- "camino",
- "camino-tempfile",
 "chrono",
 "clap",
 "consumption_metrics",
@@ -3882,6 +3773,7 @@ dependencies = [
 "hostname",
 "humantime",
 "hyper",
+ "hyper-tungstenite",
 "ipnet",
 "itertools",
 "md5",
@@ -3890,8 +3782,6 @@ dependencies = [
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
- "parquet",
- "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
 "postgres-native-tls",
@@ -3901,9 +3791,7 @@ dependencies = [
 "prometheus",
 "rand 0.8.5",
 "rcgen",
- "redis",
 "regex",
- "remote_storage",
 "reqwest",
 "reqwest-middleware",
 "reqwest-retry",
@@ -3927,13 +3815,11 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls",
- "tokio-tungstenite",
 "tokio-util",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
- "tungstenite",
 "url",
 "utils",
 "uuid",
@@ -4066,32 +3952,6 @@ dependencies = [
 "yasna",
 ]

-[[package]]
-name = "redis"
-version = "0.24.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
-dependencies = [
- "async-trait",
- "bytes",
- "combine",
- "futures-util",
- "itoa",
- "percent-encoding",
- "pin-project-lite",
- "rustls",
- "rustls-native-certs",
- "rustls-pemfile",
- "rustls-webpki 0.101.7",
- "ryu",
- "sha1_smol",
- "socket2 0.4.9",
- "tokio",
- "tokio-rustls",
- "tokio-util",
- "url",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.2.16"
@@ -4543,14 +4403,12 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
- "aws-smithy-async",
 "bincode",
 "bytes",
 "chrono",
 "clap",
 "crc32c",
 "either",
- "futures",
 "futures-util",
 "hex",
 "histogram",
@@ -4589,7 +4447,6 @@ dependencies = [
 "clap",
 "const_format",
 "crc32c",
- "fail",
 "fs2",
 "futures",
 "git-version",
@@ -4613,7 +4470,6 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
- "sha2",
 "signal-hook",
 "storage_broker",
 "thiserror",
@@ -4820,12 +4676,6 @@ dependencies = [
 "uuid",
 ]

-[[package]]
-name = "seq-macro"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
-
 [[package]]
 name = "serde"
 version = "1.0.183"
@@ -4948,12 +4798,6 @@ dependencies = [
 "digest",
 ]

-[[package]]
-name = "sha1_smol"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
-
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -5352,17 +5196,6 @@ dependencies = [
 "once_cell",
 ]

-[[package]]
-name = "thrift"
-version = "0.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
-dependencies = [
- "byteorder",
- "integer-encoding",
- "ordered-float",
-]
-
 [[package]]
 name = "time"
 version = "0.3.21"
@@ -5907,16 +5740,6 @@ dependencies = [
 "utf-8",
 ]

-[[package]]
-name = "twox-hash"
-version = "1.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
-dependencies = [
- "cfg-if",
- "static_assertions",
-]
-
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -6055,7 +5878,6 @@ dependencies = [
 "chrono",
 "const_format",
 "criterion",
- "fail",
 "futures",
 "heapless",
 "hex",
@@ -6094,11 +5916,10 @@ dependencies = [

 [[package]]
 name = "uuid"
-version = "1.6.1"
+version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
+checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
 dependencies = [
- "atomic",
 "getrandom 0.2.11",
 "serde",
 ]
@@ -6594,7 +6415,6 @@ dependencies = [
 "num-integer",
 "num-traits",
 "once_cell",
- "parquet",
 "prost",
 "rand 0.8.5",
 "regex",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -89,6 +89,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
@@ -106,14 +107,11 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
@@ -155,7 +153,6 @@ tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
-tokio-tungstenite = "0.20"
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -163,9 +160,8 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
-tungstenite = "0.20"
 url = "2.2"
-uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
+uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
 x509-parser = "0.15"
@@ -219,16 +215,15 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

-# bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-
 ################# Binary contents sections

 [profile.release]
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
 debug = true
+strip = true  # Automatically strip symbols from the binary.
+opt-level = "z"  # Optimize for size.
+lto = true

 # disable debug symbols for all packages except this one to decrease binaries size
 [profile.release.package."*"]
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.75.0
+ENV RUSTC_VERSION=1.74.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -151,7 +151,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install cargo-hakari && \
    cargo install cargo-deny && \
    cargo install cargo-hack && \
-    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
 ENV RUSTC_WRAPPER=cachepot
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -13,7 +13,6 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
-nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -40,22 +40,18 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
-use signal_hook::consts::{SIGQUIT, SIGTERM};
-use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -71,13 +67,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
-    thread::spawn(move || {
-        for sig in signals.forever() {
-            handle_exit_signal(sig);
-        }
-    });
-
    let build_tag = option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string();
@@ -350,20 +339,13 @@ fn main() -> Result<()> {

    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
-    if let Some((mut pg, logs_handle)) = pg {
+    if let Some(mut pg) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);

        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
-        PG_PID.store(0, Ordering::SeqCst);
-
-        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
-
        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }
@@ -537,24 +519,6 @@ fn cli() -> clap::Command {
        )
 }

-/// When compute_ctl is killed, send also termination signal to sync-safekeepers
-/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
-/// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
-    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
-    exit(1);
-}
-
 #[test]
 fn verify_cli() {
    cli().debug_assert()
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -6,8 +6,6 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::atomic::AtomicU32;
-use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
 use std::thread;
 use std::time::Instant;
@@ -31,15 +29,11 @@ use utils::measured_stream::MeasuredReader;
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
-use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};

-pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
-pub static PG_PID: AtomicU32 = AtomicU32::new(0);
-
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    // Url type maintains proper escaping
@@ -280,7 +274,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
            $$;"#,
        roles_decl, database_decl,
    );
-    info!("Neon superuser created:\n{}", inlinify(&query));
+    info!("Neon superuser created:\n{}", &query);
    client
        .simple_query(&query)
        .map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -496,7 +490,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let mut sync_handle = maybe_cgexec(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -505,29 +499,15 @@ impl ComputeNode {
                vec![]
            })
            .stdout(Stdio::piped())
-            .stderr(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
-        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);

        // `postgres --sync-safekeepers` will print all log output to stderr and
-        // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
-        // will be collected in a child thread.
-        let stderr = sync_handle
-            .stderr
-            .take()
-            .expect("stderr should be captured");
-        let logs_handle = handle_postgres_logs(stderr);
-
+        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+        // redirected to the caller output.
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
-        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
-
-        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));

        if !sync_output.status.success() {
            anyhow::bail!(
@@ -665,12 +645,11 @@ impl ComputeNode {

    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
-    /// Returns a handle to the child process and a handle to the logs thread.
    #[instrument(skip_all)]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    ) -> Result<std::process::Child> {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
@@ -681,18 +660,12 @@ impl ComputeNode {
            } else {
                vec![]
            })
-            .stderr(Stdio::piped())
            .spawn()
            .expect("cannot start postgres process");
-        PG_PID.store(pg.id(), Ordering::SeqCst);
-
-        // Start a thread to collect logs from stderr.
-        let stderr = pg.stderr.take().expect("stderr should be captured");
-        let logs_handle = handle_postgres_logs(stderr);

        wait_for_postgres(&mut pg, pgdata_path)?;

-        Ok((pg, logs_handle))
+        Ok(pg)
    }

    /// Do initial configuration of the already started Postgres.
@@ -837,10 +810,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(
-        &self,
-        extension_server_port: u16,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -911,7 +881,7 @@ impl ComputeNode {
        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
-        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
+        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
@@ -961,7 +931,7 @@ impl ComputeNode {
        };
        info!(?metrics, "compute start finished");

-        Ok(pg_process)
+        Ok(pg)
    }

    // Look for core dumps and collect backtraces.
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {

    Ok(())
 }
-
-/// Replace all newline characters with a special character to make it
-/// easier to grep for log messages.
-pub fn inlinify(s: &str) -> String {
-    s.replace('\n', "\u{200B}")
-}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info, warn};
+use tracing::{debug, info};

 use crate::compute::ComputeNode;

@@ -84,29 +84,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
                    }
                }

-                // If there are existing (logical) walsenders, do not suspend.
-                //
-                // walproposer doesn't currently show up in pg_stat_replication,
-                // but protect if it will be
-                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-                match cli.query_one(ws_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_ws) => {
-                            if num_ws > 0 {
-                                last_active = Some(Utc::now());
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse ws count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of walsenders: {:?}", e);
-                        continue;
-                    }
-                }
-
                // Update the last activity in the shared state if we got a more recent one.
                let mut state = compute.state.lock().unwrap();
                // NB: `Some(<DateTime>)` is always greater than `None`.
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -6,15 +6,12 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
-use std::thread::JoinHandle;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tokio::io::AsyncBufReadExt;
-use tokio::time::timeout;
 use tokio_postgres::NoTls;
 use tracing::{debug, error, info, instrument};

@@ -429,72 +426,3 @@ pub async fn tune_pgbouncer(

    Ok(())
 }
-
-/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
-/// and send them to the logger. In the future we may also want to add context to
-/// these logs.
-pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
-    std::thread::spawn(move || {
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to build tokio runtime");
-
-        let res = runtime.block_on(async move {
-            let stderr = tokio::process::ChildStderr::from_std(stderr)?;
-            handle_postgres_logs_async(stderr).await
-        });
-        if let Err(e) = res {
-            tracing::error!("error while processing postgres logs: {}", e);
-        }
-    })
-}
-
-/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
-/// - next line starts with timestamp
-/// - EOF
-/// - no new lines were written for the last second
-async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
-    let mut lines = tokio::io::BufReader::new(stderr).lines();
-    let timeout_duration = Duration::from_secs(1);
-    let ts_regex =
-        regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
-
-    let mut buf = vec![];
-    loop {
-        let next_line = timeout(timeout_duration, lines.next_line()).await;
-
-        // we should flush lines from the buffer if we cannot continue reading multiline message
-        let should_flush_buf = match next_line {
-            // Flushing if new line starts with timestamp
-            Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
-            // Flushing on EOF, timeout or error
-            _ => true,
-        };
-
-        if !buf.is_empty() && should_flush_buf {
-            // join multiline message into a single line, separated by unicode Zero Width Space.
-            // "PG:" suffix is used to distinguish postgres logs from other logs.
-            let combined = format!("PG:{}\n", buf.join("\u{200B}"));
-            buf.clear();
-
-            // sync write to stderr to avoid interleaving with other logs
-            use std::io::Write;
-            let res = std::io::stderr().lock().write_all(combined.as_bytes());
-            if let Err(e) = res {
-                tracing::error!("error while writing to stderr: {}", e);
-            }
-        }
-
-        // if not timeout, append line to the buffer
-        if next_line.is_ok() {
-            match next_line?? {
-                Some(line) => buf.push(line),
-                // EOF
-                None => break,
-            };
-        }
-    }
-
-    Ok(())
-}
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -9,7 +9,6 @@ use reqwest::StatusCode;
 use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
-use crate::logger::inlinify;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -663,11 +662,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
            $$;"
        .to_string();

-        info!(
-            "grant query for db {} : {}",
-            &db.name,
-            inlinify(&grant_query)
-        );
+        info!("grant query for db {} : {}", &db.name, &grant_query);
        db_client.simple_query(&grant_query)?;
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -6,11 +6,11 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
+use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::tenant_migration::migrate_tenant;
@@ -338,7 +338,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_one("force").expect("we set a default value");
+    let force = init_match.get_flag("force");
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

@@ -1266,15 +1266,9 @@ fn cli() -> Command {
        .required(false);

    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(InitForceMode))
+        .value_parser(value_parser!(bool))
        .long("force")
-        .default_value(
-            InitForceMode::MustNotExist
-                .to_possible_value()
-                .unwrap()
-                .get_name()
-                .to_owned(),
-        )
+        .action(ArgAction::SetTrue)
        .help("Force initialization even if the repository is not empty")
        .required(false);

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,8 +46,6 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
-use nix::sys::signal::kill;
-use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -441,14 +439,11 @@ impl Endpoint {
        Ok(())
    }

-    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
-        if send_sigterm {
-            kill(pid, Signal::SIGTERM).ok();
-        }
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
        Ok(())
    }
@@ -738,15 +733,10 @@ impl Endpoint {
            &None,
        )?;

-        // Also wait for the compute_ctl process to die. It might have some
-        // cleanup work to do after postgres stops, like syncing safekeepers,
-        // etc.
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
        //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
            println!(
                "Destroying postgres data directory '{}'",
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -5,7 +5,6 @@

 use anyhow::{bail, ensure, Context};

-use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
    }
 }

-#[derive(Clone, Copy)]
-pub enum InitForceMode {
-    MustNotExist,
-    EmptyDirOk,
-    RemoveAllContents,
-}
-
-impl ValueEnum for InitForceMode {
-    fn value_variants<'a>() -> &'a [Self] {
-        &[
-            Self::MustNotExist,
-            Self::EmptyDirOk,
-            Self::RemoveAllContents,
-        ]
-    }
-
-    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
-        Some(clap::builder::PossibleValue::new(match self {
-            InitForceMode::MustNotExist => "must-not-exist",
-            InitForceMode::EmptyDirOk => "empty-dir-ok",
-            InitForceMode::RemoveAllContents => "remove-all-contents",
-        }))
-    }
-}
-
 impl SafekeeperConf {
    /// Compute is served by port on which only tenant scoped tokens allowed, if
    /// it is configured.
@@ -410,7 +384,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -419,34 +393,25 @@ impl LocalEnv {
        );

        if base_path.exists() {
-            match force {
-                InitForceMode::MustNotExist => {
-                    bail!(
-                        "directory '{}' already exists. Perhaps already initialized?",
-                        base_path.display()
-                    );
-                }
-                InitForceMode::EmptyDirOk => {
-                    if let Some(res) = std::fs::read_dir(base_path)?.next() {
-                        res.context("check if directory is empty")?;
-                        anyhow::bail!("directory not empty: {base_path:?}");
-                    }
-                }
-                InitForceMode::RemoveAllContents => {
-                    println!("removing all contents of '{}'", base_path.display());
-                    // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                    // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                    // S3 local SSD) to the `.neon` base directory.
-                    for entry in std::fs::read_dir(base_path)? {
-                        let entry = entry?;
-                        let path = entry.path();
-                        if path.is_dir() {
-                            fs::remove_dir_all(&path)?;
-                        } else {
-                            fs::remove_file(&path)?;
-                        }
+            if force {
+                println!("removing all contents of '{}'", base_path.display());
+                // Instead of directly calling `remove_dir_all`, we keep the original dir but remove
+                // all contents inside. This helps if the developer symlinks another directory (e.g.,
+                // S3 local SSD) to the `.neon` base directory.
+                for entry in std::fs::read_dir(base_path)? {
+                    let entry = entry?;
+                    let path = entry.path();
+                    if path.is_dir() {
+                        fs::remove_dir_all(&path)?;
+                    } else {
+                        fs::remove_file(&path)?;
                    }
                }
+            } else {
+                bail!(
+                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
+                    base_path.display()
+                );
            }
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -485,13 +485,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_id).await?)
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
-    }
-
    pub async fn timeline_create(
        &self,
        tenant_id: TenantId,
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,7 +11,6 @@ use crate::{
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
-use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -41,9 +40,9 @@ async fn await_lsn(
    loop {
        let latest = match get_lsns(tenant_id, pageserver).await {
            Ok(l) => l,
-            Err(_e) => {
+            Err(e) => {
                println!(
-                    "🕑 Waiting for pageserver {} to activate...",
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
                    pageserver.conf.id
                );
                std::thread::sleep(Duration::from_millis(500));
@@ -90,7 +89,7 @@ pub async fn migrate_tenant(
    tenant_id: TenantId,
    dest_ps: PageServerNode,
 ) -> anyhow::Result<()> {
-    println!("🤔 Checking existing status...");
+    // Get a new generation
    let attachment_service = AttachmentService::from_env(env);

    fn build_location_config(
@@ -136,20 +135,6 @@ pub async fn migrate_tenant(
        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
    }

-    println!(
-        "🔁 Downloading latest layers to destination pageserver {}",
-        dest_ps.conf.id
-    );
-    match dest_ps
-        .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
-        .await
-    {
-        Ok(()) => {}
-        Err(_) => {
-            println!("  (skipping, destination wasn't in secondary mode)")
-        }
-    }
-
    let gen = attachment_service
        .attach_hook(tenant_id, dest_ps.conf.id)
        .await?;
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -1,142 +0,0 @@
-# Vectored Timeline Get
-
-Created on: 2024-01-02
-Author: Christian Schwarz
-
-# Summary
-
-A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
-
-# Motivation
-
-During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
-For an example, see
-https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.
-
-Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
-For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
-If it's negative (no entry), we resume layer map traversal.
-If it's positive, we collect the result in our reconstruct data bag.
-If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
-Otherwise, we resume layer map traversal.
-
-Doing this many `Timeline::get` calls is quite inefficient because:
-
-1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
-2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
-   This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but
-   may not contain the data we're looking for.
-3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
-   So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
-   The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.
-
-[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9
-
-# Solution
-
-We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API  unlocks:
-
-* more efficient basebackup
-* batched IO during compaction (useful for strides of unchanged pages)
-* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
-  * if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API
-
-# DoD
-
-There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
-It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.
-
-It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
-It is sufficient to simply return a `Vec<Bytes>`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`.
-
-Functionally, the behavior of `Timeline::get_vectored` is equivalent to
-
-```rust
-let mut keys_iter: impl Iterator<Item=Key>
-  = src.map(|KeyVec{ base, count }| (base..base+count)).flatten();
-let mut out = Vec::new();
-for key in keys_iter {
-    let data = Timeline::get(key, lsn)?;
-    out.push(data);
-}
-return out;
-```
-
-However, unlike above, an ideal solution will
-
-* Visit each `struct Layer` at most once.
-* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
-  * This means, read each `DiskBtree` page at most once.
-* Facilitate merging of the reads we issue to the OS and eventually NVMe.
-
-Each of these items above represents a signficant amount of work.
-
-## Performance
-
-Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
-A reasonable constant overhead over current `Timeline::get` is acceptable.
-
-The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.
-
-# Implementation
-
-High-level set of tasks / changes to be made:
-
- **Get clarity on API**:
-  - Define naive `Timeline::get_vectored` implementation & adopt it across pageserver.
-  - The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
-  - Start with something simple to explore the different usages of the API.
-    Then iterate with peers until we have something that is good enough.
- **Vectored Layer Map traversal**
-  - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
-  - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
-    - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
-      but need more.
-      Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data.
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
-  - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
-  - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
-  - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
-  - What needs to happen to `DiskBtree::visit()`?
-    * Minimally
-      * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
-      * Change the visit code to to invoke the callback for all values in the `KeyVec`'s key range
-      * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
-    * Ideally:
-      * Take a `&[KeyVec]`, sort it;
-      * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
-      * NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
-  - The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
-    - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
-      - We hit (and rely) on `PageCache` and `VirtualFile here (not great under pressure)
-    - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
-  - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
-  - That is tricky because
-    - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
-    - there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
-      - The guiding principle here should be to avoid coupling this work to the `PageCache`.
-      - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
-
-
-Let's see how we can improve by doing the first three items in above list first, then revisit.
-
-## Rollout / Feature Flags
-
-No feature flags are required for this epic.
-
-At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.
-
-It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
-That will help isolate performance regressions across weekly releases.
-
-# Interaction With Sharding
-
-[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`.
-
-Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.
-
-Given that this is already the case, there shouldn't be significant interaction/interference with sharding.
-
-However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
-For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `ruff`, and type hints via `mypy`.
+We enforce code formatting via `black`, linting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run ruff format . # All code is reformatted
-poetry run ruff check .  # Python linter
-poetry run mypy .        # Ensure there are no typing errors
+poetry run black .  # All code is reformatted
+poetry run ruff .  # Python linter
+poetry run mypy .  # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -141,9 +141,8 @@ impl Key {
    }
 }

-#[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+    key.field1 == 0x00 && key.field4 != 0
 }

 impl std::str::FromStr for Key {
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -114,21 +114,16 @@ impl KeySpaceAccum {
        }
    }

-    #[inline(always)]
    pub fn add_key(&mut self, key: Key) {
        self.add_range(singleton_range(key))
    }

-    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
        match self.accum.as_mut() {
            Some(accum) => {
                if range.start == accum.end {
                    accum.end = range.end;
                } else {
-                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
-                    // a new range here if the skipped region was all keys that don't belong on this shard.
-                    // (https://github.com/neondatabase/neon/issues/6247)
                    assert!(range.start > accum.end);
                    self.ranges.push(accum.clone());
                    *accum = range;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -2,7 +2,7 @@ pub mod partitioning;

 use std::{
    collections::HashMap,
-    io::{BufRead, Read},
+    io::Read,
    num::{NonZeroU64, NonZeroUsize},
    time::SystemTime,
 };
@@ -557,6 +557,19 @@ pub enum DownloadRemoteLayersTaskState {
    ShutDown,
 }

+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
@@ -813,10 +826,9 @@ impl PagestreamBeMessage {
                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
                }
                Tag::Error => {
-                    let mut msg = Vec::new();
-                    buf.read_until(0, &mut msg)?;
-                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
-                    let rust_str = cstring.to_str()?;
+                    let buf = buf.get_ref();
+                    let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                    let rust_str = cstr.to_str()?;
                    PagestreamBeMessage::Error(PagestreamErrorResponse {
                        message: rust_str.to_owned(),
                    })
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -422,21 +422,6 @@ impl ShardIdentity {
        }
    }

-    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
-    pub fn is_key_disposable(&self, key: &Key) -> bool {
-        if key_is_shard0(key) {
-            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            //    content on all shards, even though it's only read on shard 0.  If we
-            //    dropped it, then subsequent WAL ingest to these keys would encounter
-            //    an error.
-            false
-        } else {
-            !self.is_key_local(key)
-        }
-    }
-
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -530,7 +515,12 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    //   all metadata.
+    // - field6 is set to -1 (0xffffffff) for relation size pages, which therefore stay on shard 0.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,12 +35,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Query handler indicated that client should reconnect
-    #[error("Server requested reconnect")]
-    Reconnect,
-    /// Query named an entity that was not found
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                info!("Stopped due to shutdown");
                Ok(())
            }
-            Err(QueryError::Reconnect) => {
-                // Dropping out of this loop implicitly disconnects
-                info!("Stopped due to handler reconnect request");
-                Ok(())
-            }
            Err(QueryError::Disconnected(e)) => {
                info!("Disconnected ({e:#})");
                // Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Reconnect => "reconnect".to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::NotFound(_) => "not found".to_string(),
        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::SimulatedConnectionError => {
            error!("query handler for query '{query}' failed due to a simulated connection error")
        }
-        QueryError::Reconnect => {
-            info!("query handler for '{query}' requested client to reconnect")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::NotFound(reason) => {
-            info!("query handler for '{query}' entity not found: {reason}")
-        }
        QueryError::Unauthorized(e) => {
            warn!("query handler for '{query}' failed with authentication error: {e}");
        }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -322,12 +322,6 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(())
    }
-
-    async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
-        Err(anyhow::anyhow!(
-            "copy for azure blob storage is not implemented"
-        ))
-    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -207,9 +207,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
-
-    /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }

 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -377,15 +374,6 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
-
-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -672,7 +660,6 @@ impl ConcurrencyLimiter {
            RequestKind::Put => &self.write,
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
-            RequestKind::Copy => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -409,20 +409,6 @@ impl RemoteStorage for LocalFs {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let from_path = from.with_base(&self.storage_root);
-        let to_path = to.with_base(&self.storage_root);
-        create_target_directory(&to_path).await?;
-        fs::copy(&from_path, &to_path).await.with_context(|| {
-            format!(
-                "Failed to copy file from '{from_path}' to '{to_path}'",
-                from_path = from_path,
-                to_path = to_path
-            )
-        })?;
-        Ok(())
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -493,38 +493,6 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
-
-        let started_at = start_measuring_requests(kind);
-
-        // we need to specify bucket_name as a prefix
-        let copy_source = format!(
-            "{}/{}",
-            self.bucket_name,
-            self.relative_path_to_s3_object(from)
-        );
-
-        let res = self
-            .client
-            .copy_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(to))
-            .copy_source(copy_source)
-            .send()
-            .await;
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
-    }
-
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
    Put = 1,
    Delete = 2,
    List = 3,
-    Copy = 4,
 }

 use RequestKind::*;
@@ -23,7 +22,6 @@ impl RequestKind {
            Put => "put_object",
            Delete => "delete_object",
            List => "list_objects",
-            Copy => "copy_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -31,7 +29,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 4]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        // copy is equivalent to download + upload
-        self.attempt(RemoteOp::Download(from.clone()))?;
-        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
-    }
 }
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
    #[serde(default)]
    pub http_connstr: Option<String>,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineCopyRequest {
-    pub target_timeline_id: TimelineId,
-    pub until_lsn: Lsn,
-}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,12 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
-# which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
-
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
@@ -22,7 +16,6 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
-fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -1,177 +0,0 @@
-//! Failpoint support code shared between pageserver and safekeepers.
-
-use crate::http::{
-    error::ApiError,
-    json::{json_request, json_response},
-};
-use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-
-/// use with fail::cfg("$name", "return(2000)")
-///
-/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
-/// specified time (in milliseconds). The main difference is that we use async
-/// tokio sleep function. Another difference is that we print lines to the log,
-/// which can be useful in tests to check that the failpoint was hit.
-///
-/// Optionally pass a cancellation token, and this failpoint will drop out of
-/// its sleep when the cancellation token fires.  This is useful for testing
-/// cases where we would like to block something, but test its clean shutdown behavior.
-#[macro_export]
-macro_rules! __failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        // If the failpoint is used with a "return" action, set should_sleep to the
-        // returned value (as string). Otherwise it's set to None.
-        let should_sleep = (|| {
-            ::fail::fail_point!($name, |x| x);
-            ::std::option::Option::None
-        })();
-
-        // Sleep if the action was a returned value
-        if let ::std::option::Option::Some(duration_str) = should_sleep {
-            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
-        }
-    }};
-    ($name:literal, $cancel:expr) => {{
-        // If the failpoint is used with a "return" action, set should_sleep to the
-        // returned value (as string). Otherwise it's set to None.
-        let should_sleep = (|| {
-            ::fail::fail_point!($name, |x| x);
-            ::std::option::Option::None
-        })();
-
-        // Sleep if the action was a returned value
-        if let ::std::option::Option::Some(duration_str) = should_sleep {
-            $crate::failpoint_support::failpoint_sleep_cancellable_helper(
-                $name,
-                duration_str,
-                $cancel,
-            )
-            .await
-        }
-    }};
-}
-pub use __failpoint_sleep_millis_async as sleep_millis_async;
-
-// Helper function used by the macro. (A function has nicer scoping so we
-// don't need to decorate everything with "::")
-#[doc(hidden)]
-pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
-    let millis = duration_str.parse::<u64>().unwrap();
-    let d = std::time::Duration::from_millis(millis);
-
-    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-    tokio::time::sleep(d).await;
-    tracing::info!("failpoint {:?}: sleep done", name);
-}
-
-// Helper function used by the macro. (A function has nicer scoping so we
-// don't need to decorate everything with "::")
-#[doc(hidden)]
-pub async fn failpoint_sleep_cancellable_helper(
-    name: &'static str,
-    duration_str: String,
-    cancel: &CancellationToken,
-) {
-    let millis = duration_str.parse::<u64>().unwrap();
-    let d = std::time::Duration::from_millis(millis);
-
-    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-    tokio::time::timeout(d, cancel.cancelled()).await.ok();
-    tracing::info!("failpoint {:?}: sleep done", name);
-}
-
-pub fn init() -> fail::FailScenario<'static> {
-    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
-    // We want non-default behavior for `exit`, though, so, we handle it separately.
-    //
-    // Format for FAILPOINTS is "name=actions" separated by ";".
-    let actions = std::env::var("FAILPOINTS");
-    if actions.is_ok() {
-        std::env::remove_var("FAILPOINTS");
-    } else {
-        // let the library handle non-utf8, or nothing for not present
-    }
-
-    let scenario = fail::FailScenario::setup();
-
-    if let Ok(val) = actions {
-        val.split(';')
-            .enumerate()
-            .map(|(i, s)| s.split_once('=').ok_or((i, s)))
-            .for_each(|res| {
-                let (name, actions) = match res {
-                    Ok(t) => t,
-                    Err((i, s)) => {
-                        panic!(
-                            "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
-                            i + 1,
-                        );
-                    }
-                };
-                if let Err(e) = apply_failpoint(name, actions) {
-                    panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
-                }
-            });
-    }
-
-    scenario
-}
-
-pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
-    if actions == "exit" {
-        fail::cfg_callback(name, exit_failpoint)
-    } else {
-        fail::cfg(name, actions)
-    }
-}
-
-#[inline(never)]
-fn exit_failpoint() {
-    tracing::info!("Exit requested by failpoint");
-    std::process::exit(1);
-}
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because storage was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -31,9 +31,6 @@ pub enum ApiError {
    #[error("Shutting down")]
    ShuttingDown,

-    #[error("Timeout")]
-    Timeout(Cow<'static, str>),
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -70,10 +67,6 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
-            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
-                StatusCode::REQUEST_TIMEOUT,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -83,10 +83,6 @@ pub mod timeout;

 pub mod sync;

-pub mod failpoint_support;
-
-pub mod yielding_loop;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,12 +15,6 @@ pub struct Gate {
    name: String,
 }

-impl std::fmt::Debug for Gate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Gate<{}>", self.name)
-    }
-}
-
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -1,35 +0,0 @@
-use tokio_util::sync::CancellationToken;
-
-#[derive(thiserror::Error, Debug)]
-pub enum YieldingLoopError {
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
-#[inline(always)]
-pub async fn yielding_loop<I, T, F>(
-    interval: usize,
-    cancel: &CancellationToken,
-    iter: I,
-    mut visitor: F,
-) -> Result<(), YieldingLoopError>
-where
-    I: Iterator<Item = T>,
-    F: FnMut(T),
-{
-    for (i, item) in iter.enumerate() {
-        visitor(item);
-
-        if i + 1 % interval == 0 {
-            tokio::task::yield_now().await;
-            if cancel.is_cancelled() {
-                return Err(YieldingLoopError::Cancelled);
-            }
-        }
-    }
-
-    Ok(())
-}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -446,11 +446,12 @@ impl Runner {
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            // *Ideally* we'd like to log here that we're ignoring the fact the
-                            // memory stats are too high, but in practice this can result in
-                            // spamming the logs with repetitive messages about ignoring the signal
-                            //
-                            // See https://github.com/neondatabase/neon/issues/5865 for more.
+                            info!(
+                                elapsed_millis = elapsed.as_millis(),
+                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                                threshold = bytes_to_mebibytes(cgroup.threshold),
+                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
+                            );
                            continue;
                        }
                    }
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -425,7 +425,7 @@ mod tests {
        }

        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("wp_log[{}] {}", level, msg);
+            println!("walprop_log[{}] {}", level, msg);
        }

        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
 use pageserver::{
    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};

 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
-    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+    let tenant_id = TenantId::generate();

-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = PostgresRedoManager::new(conf, tenant_id);

    let manager = Arc::new(manager);

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use pageserver_api::{models::*, shard::TenantShardId};
+use pageserver_api::models::*;
 use reqwest::{IntoUrl, Method};
 use utils::{
    http::error::HttpErrorBody,
@@ -28,12 +28,14 @@ pub enum Error {

 pub type Result<T> = std::result::Result<T, Error>;

-pub(crate) trait ResponseErrorMessageExt: Sized {
+#[async_trait::async_trait]
+pub trait ResponseErrorMessageExt: Sized {
    async fn error_from_body(self) -> Result<Self>;
 }

+#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
-    async fn error_from_body(self) -> Result<Self> {
+    async fn error_from_body(mut self) -> Result<Self> {
        let status = self.status();
        if !(status.is_client_error() || status.is_server_error()) {
            return Ok(self);
@@ -49,11 +51,6 @@ impl ResponseErrorMessageExt for reqwest::Response {
    }
 }

-pub enum ForceAwaitLogicalSize {
-    Yes,
-    No,
-}
-
 impl Client {
    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
        Self {
@@ -97,18 +94,11 @@ impl Client {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        force_await_logical_size: ForceAwaitLogicalSize,
    ) -> Result<pageserver_api::models::TimelineInfo> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );
-
-        let uri = match force_await_logical_size {
-            ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
-            ForceAwaitLogicalSize::No => uri,
-        };
-
        self.get(&uri)
            .await?
            .json()
@@ -174,18 +164,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/secondary/download",
-            self.mgmt_api_endpoint, tenant_id
-        );
-        self.request(Method::POST, &uri, ())
-            .await?
-            .error_for_status()
-            .map(|_| ())
-            .map_err(|e| Error::ApiError(format!("{}", e)))
-    }
-
    pub async fn location_config(
        &self,
        tenant_id: TenantId,
@@ -221,16 +199,4 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
-
-    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/reset",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.request(Method::POST, &uri, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -115,8 +115,15 @@ impl PagestreamClient {

    pub async fn getpage(
        &mut self,
-        req: PagestreamGetPageRequest,
+        key: RelTagBlockNo,
+        lsn: Lsn,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
        // let mut req = tokio_util::io::ReaderStream::new(&req);
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-camino.workspace = true
 clap.workspace = true
 futures.workspace = true
 hdrhistogram.workspace = true
@@ -19,7 +18,6 @@ serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true

 pageserver = { path = ".." }
 pageserver_client.workspace = true
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,5 +1,4 @@
 use anyhow::Context;
-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;

 use utils::id::TenantTimelineId;
@@ -93,12 +92,10 @@ async fn main_impl(
    for timeline in &timelines {
        js.spawn({
            let timeline = *timeline;
+            // FIXME: this triggers initial logical size calculation
+            // https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(
-                    timeline.tenant_id,
-                    timeline.timeline_id,
-                    ForceAwaitLogicalSize::No,
-                )
+                .timeline_info(timeline.tenant_id, timeline.timeline_id)
                .await
                .unwrap();
            async move {
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,13 +1,10 @@
 use anyhow::Context;
-use camino::Utf8PathBuf;
 use futures::future::join_all;
 use pageserver::pgdatadir_mapping::key_to_rel_block;
 use pageserver::repository;
 use pageserver_api::key::is_rel_block_key;
-use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::PagestreamGetPageRequest;
+use pageserver_client::page_service::RelTagBlockNo;

-use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

@@ -16,7 +13,7 @@ use tokio::sync::Barrier;
 use tokio::task::JoinSet;
 use tracing::{info, instrument};

-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -42,17 +39,8 @@ pub(crate) struct Args {
    runtime: Option<humantime::Duration>,
    #[clap(long)]
    per_target_rate_limit: Option<usize>,
-    /// Probability for sending `latest=true` in the request (uniform distribution).
-    #[clap(long, default_value = "1")]
-    req_latest_probability: f64,
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
-    /// For large pageserver installations, enumerating the keyspace takes a lot of time.
-    /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
-    /// The cache is tagged and auto-invalided by the tenant/timeline ids only.
-    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
-    #[clap(long)]
-    keyspace_cache: Option<Utf8PathBuf>,
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -67,7 +55,7 @@ impl LiveStats {
    }
 }

-#[derive(Clone, serde::Serialize, serde::Deserialize)]
+#[derive(Clone)]
 struct KeyRange {
    timeline: TenantTimelineId,
    timeline_lsn: Lsn,
@@ -115,107 +103,59 @@ async fn main_impl(
    )
    .await?;

-    #[derive(serde::Deserialize)]
-    struct KeyspaceCacheDe {
-        tag: Vec<TenantTimelineId>,
-        data: Vec<KeyRange>,
-    }
-    #[derive(serde::Serialize)]
-    struct KeyspaceCacheSer<'a> {
-        tag: &'a [TenantTimelineId],
-        data: &'a [KeyRange],
-    }
-    let cache = args
-        .keyspace_cache
-        .as_ref()
-        .map(|keyspace_cache_file| {
-            let contents = match std::fs::read(keyspace_cache_file) {
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                    return anyhow::Ok(None);
-                }
-                x => x.context("read keyspace cache file")?,
-            };
-            let cache: KeyspaceCacheDe =
-                serde_json::from_slice(&contents).context("deserialize cache file")?;
-            let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
-                == HashSet::from_iter(timelines.iter().cloned());
-            info!("keyspace cache file matches tag: {tag_ok}");
-            anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
-        })
-        .transpose()?
-        .flatten();
-    let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
-        info!("using keyspace cache file");
-        cached
-    } else {
-        let mut js = JoinSet::new();
-        for timeline in &timelines {
-            js.spawn({
-                let mgmt_api_client = Arc::clone(&mgmt_api_client);
-                let timeline = *timeline;
-                async move {
-                    let partitioning = mgmt_api_client
-                        .keyspace(timeline.tenant_id, timeline.timeline_id)
-                        .await?;
-                    let lsn = partitioning.at_lsn;
-                    let start = Instant::now();
-                    let mut filtered = KeySpaceAccum::new();
-                    // let's hope this is inlined and vectorized...
-                    // TODO: turn this loop into a is_rel_block_range() function.
-                    for r in partitioning.keys.ranges.iter() {
-                        let mut i = r.start;
-                        while i != r.end {
-                            if is_rel_block_key(&i) {
-                                filtered.add_key(i);
-                            }
-                            i = i.next();
-                        }
-                    }
-                    let filtered = filtered.to_keyspace();
-                    let filter_duration = start.elapsed();
+    let mut js = JoinSet::new();
+    for timeline in &timelines {
+        js.spawn({
+            let mgmt_api_client = Arc::clone(&mgmt_api_client);
+            let timeline = *timeline;
+            async move {
+                let partitioning = mgmt_api_client
+                    .keyspace(timeline.tenant_id, timeline.timeline_id)
+                    .await?;
+                let lsn = partitioning.at_lsn;

-                    anyhow::Ok((
-                        filter_duration,
-                        filtered.ranges.into_iter().map(move |r| KeyRange {
-                            timeline,
-                            timeline_lsn: lsn,
-                            start: r.start.to_i128(),
-                            end: r.end.to_i128(),
-                        }),
-                    ))
-                }
-            });
-        }
-        let mut total_filter_duration = Duration::from_secs(0);
-        let mut all_ranges: Vec<KeyRange> = Vec::new();
-        while let Some(res) = js.join_next().await {
-            let (filter_duration, range) = res.unwrap().unwrap();
-            all_ranges.extend(range);
-            total_filter_duration += filter_duration;
-        }
-        info!("filter duration: {}", total_filter_duration.as_secs_f64());
-        if let Some(cachefile) = args.keyspace_cache.as_ref() {
-            let cache = KeyspaceCacheSer {
-                tag: &timelines,
-                data: &all_ranges,
-            };
-            let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
-            std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
-            info!("successfully wrote keyspace cache file");
-        }
-        all_ranges
-    };
+                let ranges = partitioning
+                    .keys
+                    .ranges
+                    .iter()
+                    .filter_map(|r| {
+                        let start = r.start;
+                        let end = r.end;
+                        // filter out non-relblock keys
+                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
+                            (true, true) => Some(KeyRange {
+                                timeline,
+                                timeline_lsn: lsn,
+                                start: start.to_i128(),
+                                end: end.to_i128(),
+                            }),
+                            (true, false) | (false, true) => {
+                                unimplemented!("split up range")
+                            }
+                            (false, false) => None,
+                        }
+                    })
+                    .collect::<Vec<_>>();
+
+                anyhow::Ok(ranges)
+            }
+        });
+    }
+    let mut all_ranges: Vec<KeyRange> = Vec::new();
+    while let Some(res) = js.join_next().await {
+        all_ranges.extend(res.unwrap().unwrap());
+    }

    let live_stats = Arc::new(LiveStats::default());

    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;
-    let num_main_impl = 1;

    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
    ));
+    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    tokio::spawn({
        let stats = Arc::clone(&live_stats);
@@ -235,143 +175,112 @@ async fn main_impl(
        }
    });

-    let cancel = CancellationToken::new();
-
-    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
+    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();
    for tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(*tl, sender);
+        work_senders.insert(tl, sender);
        tasks.push(tokio::spawn(client(
            args,
            *tl,
            Arc::clone(&start_work_barrier),
            receiver,
+            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
-            cancel.clone(),
        )));
    }

-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
-        let start_work_barrier = start_work_barrier.clone();
-        let cancel = cancel.clone();
-        match args.per_target_rate_limit {
-            None => Box::pin(async move {
+    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
+        None => Box::pin(async move {
+            let weights = rand::distributions::weighted::WeightedIndex::new(
+                all_ranges.iter().map(|v| v.len()),
+            )
+            .unwrap();
+
+            start_work_barrier.wait().await;
+
+            loop {
+                let (range, key) = {
+                    let mut rng = rand::thread_rng();
+                    let r = &all_ranges[weights.sample(&mut rng)];
+                    let key: i128 = rng.gen_range(r.start..r.end);
+                    let key = repository::Key::from_i128(key);
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    (r, RelTagBlockNo { rel_tag, block_no })
+                };
+                let sender = work_senders.get(&range.timeline).unwrap();
+                // TODO: what if this blocks?
+                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+            }
+        }),
+        Some(rps_limit) => Box::pin(async move {
+            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
+
+            let make_timeline_task: &dyn Fn(
+                TenantTimelineId,
+            )
+                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                let sender = work_senders.get(&timeline).unwrap();
+                let ranges: Vec<KeyRange> = all_ranges
+                    .iter()
+                    .filter(|r| r.timeline == timeline)
+                    .cloned()
+                    .collect();
                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    all_ranges.iter().map(|v| v.len()),
+                    ranges.iter().map(|v| v.len()),
                )
                .unwrap();

-                start_work_barrier.wait().await;
-
-                while !cancel.is_cancelled() {
-                    let (timeline, req) = {
-                        let mut rng = rand::thread_rng();
-                        let r = &all_ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = repository::Key::from_i128(key);
-                        let (rel_tag, block_no) =
-                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                        (
-                            r.timeline,
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
-                                rel: rel_tag,
-                                blkno: block_no,
-                            },
-                        )
-                    };
-                    let sender = work_senders.get(&timeline).unwrap();
-                    // TODO: what if this blocks?
-                    if sender.send(req).await.is_err() {
-                        assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
+                Box::pin(async move {
+                    let mut ticker = tokio::time::interval(period);
+                    ticker.set_missed_tick_behavior(
+                        /* TODO review this choice */
+                        tokio::time::MissedTickBehavior::Burst,
+                    );
+                    loop {
+                        ticker.tick().await;
+                        let (range, key) = {
+                            let mut rng = rand::thread_rng();
+                            let r = &ranges[weights.sample(&mut rng)];
+                            let key: i128 = rng.gen_range(r.start..r.end);
+                            let key = repository::Key::from_i128(key);
+                            let (rel_tag, block_no) = key_to_rel_block(key)
+                                .expect("we filter non-rel-block keys out above");
+                            (r, RelTagBlockNo { rel_tag, block_no })
+                        };
+                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
                    }
-                }
-            }),
-            Some(rps_limit) => Box::pin(async move {
-                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_timeline_task: &dyn Fn(
-                    TenantTimelineId,
-                )
-                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
-                    let sender = work_senders.get(&timeline).unwrap();
-                    let ranges: Vec<KeyRange> = all_ranges
-                        .iter()
-                        .filter(|r| r.timeline == timeline)
-                        .cloned()
-                        .collect();
-                    let weights = rand::distributions::weighted::WeightedIndex::new(
-                        ranges.iter().map(|v| v.len()),
-                    )
-                    .unwrap();
+                })
+            };

-                    let cancel = cancel.clone();
-                    Box::pin(async move {
-                        let mut ticker = tokio::time::interval(period);
-                        ticker.set_missed_tick_behavior(
-                            /* TODO review this choice */
-                            tokio::time::MissedTickBehavior::Burst,
-                        );
-                        while !cancel.is_cancelled() {
-                            ticker.tick().await;
-                            let req = {
-                                let mut rng = rand::thread_rng();
-                                let r = &ranges[weights.sample(&mut rng)];
-                                let key: i128 = rng.gen_range(r.start..r.end);
-                                let key = repository::Key::from_i128(key);
-                                assert!(is_rel_block_key(&key));
-                                let (rel_tag, block_no) = key_to_rel_block(key)
-                                    .expect("we filter non-rel-block keys out above");
-                                PagestreamGetPageRequest {
-                                    latest: rng.gen_bool(args.req_latest_probability),
-                                    lsn: r.timeline_lsn,
-                                    rel: rel_tag,
-                                    blkno: block_no,
-                                }
-                            };
-                            if sender.send(req).await.is_err() {
-                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
-                            }
-                        }
-                    })
-                };
+            let tasks: Vec<_> = work_senders
+                .keys()
+                .map(|tl| make_timeline_task(**tl))
+                .collect();

-                let tasks: Vec<_> = work_senders
-                    .keys()
-                    .map(|tl| make_timeline_task(*tl))
-                    .collect();
+            start_work_barrier.wait().await;

-                start_work_barrier.wait().await;
-
-                join_all(tasks).await;
-            }),
-        }
+            join_all(tasks).await;
+        }),
    };

-    let work_sender_task = tokio::spawn(work_sender);
-
    if let Some(runtime) = args.runtime {
-        info!("waiting for everything to become ready");
-        start_work_barrier.wait().await;
-        info!("work started");
-        tokio::time::sleep(runtime.into()).await;
-        info!("runtime over, signalling cancellation");
-        cancel.cancel();
-        work_sender_task.await.unwrap();
-        info!("work sender exited");
+        match tokio::time::timeout(runtime.into(), work_sender).await {
+            Ok(()) => unreachable!("work sender never terminates"),
+            Err(_timeout) => {
+                // this implicitly drops the work_senders, making all the clients exit
+            }
+        }
    } else {
-        work_sender_task.await.unwrap();
+        work_sender.await;
        unreachable!("work sender never terminates");
    }

-    info!("joining clients");
    for t in tasks {
        t.await.unwrap();
    }

-    info!("all clients stopped");
-
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
@@ -394,10 +303,12 @@ async fn client(
    args: &'static Args,
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
+    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
+    all_work_done_barrier: Arc<Barrier>,
    live_stats: Arc<LiveStats>,
-    cancel: CancellationToken,
 ) {
+    start_work_barrier.wait().await;
+
    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
        .await
        .unwrap();
@@ -406,18 +317,12 @@ async fn client(
        .await
        .unwrap();

-    start_work_barrier.wait().await;
-
-    while let Some(req) =
-        tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
-    {
+    while let Some((key, lsn)) = work.recv().await {
        let start = Instant::now();
-
-        let res = tokio::select! {
-            res = client.getpage(req) => { res },
-            _ = cancel.cancelled() => { return; }
-        };
-        res.with_context(|| format!("getpage for {timeline}"))
+        client
+            .getpage(key, lsn)
+            .await
+            .with_context(|| format!("getpage for {timeline}"))
            .unwrap();
        let elapsed = start.elapsed();
        live_stats.inc();
@@ -425,4 +330,6 @@ async fn client(
            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
        });
    }
+
+    all_work_done_barrier.wait().await;
 }
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -4,8 +4,6 @@ use humantime::Duration;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;

-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
-
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
@@ -58,15 +56,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
+            // TODO: API to explicitly trigger initial logical size computation.
+            // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
+            // => https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(tl.tenant_id, tl.timeline_id)
                .await
                .unwrap();

-            // Polling should not be strictly required here since we await
-            // for the initial logical size, however it's possible for the request
-            // to land before the timeline is initialised. This results in an approximate
-            // logical size.
            if let Some(period) = args.poll_for_completion {
                let mut ticker = tokio::time::interval(period.into());
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
@@ -74,7 +71,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                        .timeline_info(tl.tenant_id, tl.timeline_id)
                        .await
                        .unwrap();
                }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -23,7 +23,6 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

@@ -175,7 +174,7 @@ where
        ] {
            for segno in self
                .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
+                .list_slru_segments(kind, self.lsn, self.ctx)
                .await?
            {
                self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .get_rel_size(src, self.lsn, false, self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }
@@ -311,7 +310,7 @@ where
    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
+            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
            .await?;

        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;

            ensure!(
@@ -400,7 +399,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = failpoint_support::init();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,9 +75,6 @@ pub mod defaults {
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    ///
    /// Default built-in configuration file.
@@ -91,7 +88,6 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'

-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -112,8 +108,6 @@ pub mod defaults {

 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'

-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -131,7 +125,6 @@ pub mod defaults {
 #gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

 [remote_storage]

@@ -240,13 +233,6 @@ pub struct PageServerConf {
    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,
-
-    /// How many remote storage downloads may be done for secondary tenants concurrently.  Implicitly
-    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
-    pub secondary_download_concurrency: usize,
-
-    /// Maximum number of WAL records to be ingested and committed at the same time
-    pub ingest_batch_size: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -328,9 +314,6 @@ struct PageServerConfigBuilder {
    control_plane_emergency_mode: BuilderValue<bool>,

    heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -403,9 +386,6 @@ impl Default for PageServerConfigBuilder {
            control_plane_emergency_mode: Set(false),

            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
        }
    }
 }
@@ -554,14 +534,6 @@ impl PageServerConfigBuilder {
        self.heatmap_upload_concurrency = BuilderValue::Set(value)
    }

-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -660,15 +632,10 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            secondary_download_concurrency: self
-                .secondary_download_concurrency
-                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
-            ingest_batch_size: self
-                .ingest_batch_size
-                .ok_or(anyhow!("missing ingest_batch_size"))?,
        })
    }
 }
@@ -726,11 +693,6 @@ impl PageServerConf {
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

-    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(TENANT_HEATMAP_BASENAME)
-    }
-
    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TIMELINES_SEGMENT_NAME)
@@ -916,10 +878,6 @@ impl PageServerConf {
                "heatmap_upload_concurrency" => {
                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
-                "secondary_download_concurrency" => {
-                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -991,8 +949,6 @@ impl PageServerConf {
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
        }
    }
 }
@@ -1221,9 +1177,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1284,9 +1238,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,6 +1,5 @@
 use std::collections::HashMap;

-use futures::Future;
 use pageserver_api::{
    control_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -29,14 +28,13 @@ pub enum RetryForeverError {
    ShuttingDown,
 }

+#[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    fn re_attach(
-        &self,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
-    fn validate(
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
+    async fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
 }

 impl ControlPlaneClient {
@@ -125,6 +123,7 @@ impl ControlPlaneClient {
    }
 }

+#[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -831,6 +831,7 @@ mod test {
        }
    }

+    #[async_trait::async_trait]
    impl ControlPlaneGenerationsApi for MockControlPlane {
        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -0,0 +1,113 @@
+/// Sleep asynchronously when the named failpoint is configured with a "return" action.
+///
+/// Use with `fail::cfg("$name", "return(2000)")`.
+///
+/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+/// specified time (in milliseconds). The main difference is that we use the async
+/// tokio sleep function. Another difference is that we print lines to the log,
+/// which can be useful in tests to check that the failpoint was hit.
+///
+/// The expansion contains an `.await`, so the macro must be invoked in an async context.
+#[macro_export]
+macro_rules! __failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
+        }
+    }};
+}
+pub use __failpoint_sleep_millis_async as sleep_millis_async;
+
+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+//
+// Parses `duration_str` as a number of milliseconds, sleeps for that long and logs
+// before and after sleeping. Panics if `duration_str` is not a valid `u64`.
+#[doc(hidden)]
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+    // Name the failpoint and the offending value in the panic message: a bare
+    // `unwrap()` gives no hint about which failpoint was misconfigured.
+    let millis = duration_str.parse::<u64>().unwrap_or_else(|e| {
+        panic!(
+            "failpoint {name:?}: cannot parse sleep duration {duration_str:?} as milliseconds: {e}"
+        )
+    });
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::sleep(d).await;
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
+/// Set up the failpoint scenario and apply the failpoints listed in the `FAILPOINTS` env var.
+///
+/// Entries are `name=actions` pairs separated by `;`. Whitespace around an entry and
+/// empty entries (e.g. from a trailing `;`) are ignored.
+///
+/// # Panics
+///
+/// Panics if a non-empty entry has no `=`, or if a failpoint cannot be applied.
+pub fn init() -> fail::FailScenario<'static> {
+    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
+    // We want non-default behavior for `exit`, though, so, we handle it separately.
+    //
+    // Format for FAILPOINTS is "name=actions" separated by ";".
+    let actions = std::env::var("FAILPOINTS");
+    if actions.is_ok() {
+        std::env::remove_var("FAILPOINTS");
+    } else {
+        // let the library handle non-utf8, or nothing for not present
+    }
+
+    let scenario = fail::FailScenario::setup();
+
+    if let Ok(val) = actions {
+        for (i, entry) in val.split(';').enumerate() {
+            // Skip empty entries (e.g. from a trailing ';') and ignore surrounding
+            // whitespace instead of panicking on them at startup.
+            let entry = entry.trim();
+            if entry.is_empty() {
+                continue;
+            }
+            let (name, actions) = match entry.split_once('=') {
+                Some(pair) => pair,
+                None => panic!(
+                    "startup failpoints: missing action on the {}th failpoint; try `{entry}=return`",
+                    i + 1,
+                ),
+            };
+            if let Err(e) = apply_failpoint(name, actions) {
+                panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
+            }
+        }
+    }
+
+    scenario
+}
+
+/// Configure a single failpoint.
+///
+/// The `exit` action is installed as a callback that terminates the process; any other
+/// action string is handed to `fail::cfg` unchanged.
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+    if actions == "exit" {
+        fail::cfg_callback(name, exit_failpoint)
+    } else {
+        fail::cfg(name, actions)
+    }
+}
+
+/// Callback for the `exit` action: logs the request and exits the process with status 1.
+#[inline(never)]
+fn exit_failpoint() {
+    tracing::info!("Exit requested by failpoint");
+    std::process::exit(1);
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -15,7 +15,6 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::TenantDetails;
-use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
@@ -26,7 +25,6 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
-use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -38,7 +36,6 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::mgr::UpsertLocationError;
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlotError, TenantSlotUpsertError, TenantStateError,
@@ -48,8 +45,7 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::SpawnMode;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
@@ -70,6 +66,9 @@ use utils::{
    lsn::Lsn,
 };

+// Imports only used for testing APIs
+use pageserver_api::models::ConfigureFailpointsRequest;
+
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -115,6 +114,14 @@ impl State {
            secondary_controller,
        })
    }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }

 #[inline(always)]
@@ -147,7 +154,6 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
-            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
@@ -170,7 +176,7 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
+            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -189,18 +195,6 @@ impl From<TenantSlotUpsertError> for ApiError {
    }
 }

-impl From<UpsertLocationError> for ApiError {
-    fn from(e: UpsertLocationError) -> ApiError {
-        use UpsertLocationError::*;
-        match e {
-            BadRequest(e) => ApiError::BadRequest(e),
-            Unavailable(_) => ApiError::ShuttingDown,
-            e @ InProgress => ApiError::Conflict(format!("{e}")),
-            Flush(e) | Other(e) => ApiError::InternalServerError(e),
-        }
-    }
-}
-
 impl From<TenantMapError> for ApiError {
    fn from(e: TenantMapError) -> ApiError {
        use TenantMapError::*;
@@ -323,21 +317,11 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
-    force_await_initial_logical_size: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    if force_await_initial_logical_size {
-        timeline.clone().await_initial_logical_size().await
-    }
-
-    let mut info = build_timeline_info_common(
-        timeline,
-        ctx,
-        tenant::timeline::GetLogicalSizePriority::Background,
-    )
-    .await?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -354,7 +338,6 @@ async fn build_timeline_info(
 async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
-    logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let initdb_lsn = timeline.initdb_lsn;
@@ -377,7 +360,8 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -488,7 +472,7 @@ async fn timeline_create_handler(
        .await {
            Ok(new_timeline) => {
                // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
@@ -524,8 +508,6 @@ async fn timeline_list_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -539,7 +521,6 @@ async fn timeline_list_handler(
            let timeline_info = build_timeline_info(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
-                force_await_initial_logical_size.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -567,8 +548,6 @@ async fn timeline_detail_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
@@ -584,7 +563,6 @@ async fn timeline_detail_handler(
        let timeline_info = build_timeline_info(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
-            force_await_initial_logical_size.unwrap_or(false),
            &ctx,
        )
        .await
@@ -703,37 +681,16 @@ async fn tenant_attach_handler(
        )));
    }

-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    let location_conf = LocationConf::attached_single(tenant_conf, generation);
-    let tenant = state
-        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    let Some(tenant) = tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
-
-    // We might have successfully constructed a Tenant, but it could still
-    // end up in a broken state:
-    if let TenantState::Broken {
-        reason,
-        backtrace: _,
-    } = tenant.current_state()
-    {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Tenant state is Broken: {reason}"
-        )));
-    }
+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -1192,25 +1149,16 @@ async fn tenant_create_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let location_conf = LocationConf::attached_single(tenant_conf, generation);
-
-    let new_tenant = state
-        .tenant_manager
-        .upsert_location(
-            target_tenant_id,
-            location_conf,
-            None,
-            SpawnMode::Create,
-            &ctx,
-        )
-        .await?;
-
-    let Some(new_tenant) = new_tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
+    let new_tenant = mgr::create_tenant(
+        state.conf,
+        tenant_conf,
+        target_tenant_id,
+        generation,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
+    .await?;

    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
@@ -1219,7 +1167,7 @@ async fn tenant_create_handler(
        .await
    {
        // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
+        // in tenant::mgr::create_tenant, and there aren't any remote timelines
        // to load, so, nothing can really fail during load.
        // Don't do cleanup because we don't know how we got here.
        // The tenant will likely be in `Broken` state and subsequent
@@ -1320,31 +1268,12 @@ async fn put_tenant_location_config_handler(

    state
        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    if let Some(_flush_ms) = flush {
-        match state
-            .secondary_controller
-            .upload_tenant(tenant_shard_id)
-            .await
-        {
-            Ok(()) => {
-                tracing::info!("Uploaded heatmap during flush");
-            }
-            Err(e) => {
-                tracing::warn!("Failed to flush heatmap: {e}");
-            }
-        }
-    } else {
-        tracing::info!("No flush requested when configuring");
-    }
+        .upsert_location(tenant_shard_id, location_conf, flush, &ctx)
+        .await
+        // TODO: mapping every error to BadRequest assumes the caller asked for something
+        // unreasonable, but in principle we might have hit something like concurrent API calls
+        // to the same tenant, which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1364,6 +1293,34 @@ async fn handle_tenant_break(
    json_response(StatusCode::OK, ())
 }

+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Cannot manage failpoints because pageserver was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
@@ -1683,21 +1640,6 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

-async fn secondary_download_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1966,9 +1908,6 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
-            api_handler(r, secondary_download_handler)
-        })
        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -21,7 +21,6 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -313,16 +312,13 @@ async fn import_wal(
        waldecoder.feed_bytes(&buf);

        let mut nrecords = 0;
-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(endpoint);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                WAL_INGEST.records_committed.inc();
-
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                nrecords += 1;
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(

        waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(end_lsn);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                debug!("imported record at {} (end {})", lsn, end_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,6 +25,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

-/// Per-tenant copy of their remote heatmap, downloaded into the local
-/// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum StorageTimeOperation {
+pub enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
    LayerFlush,

@@ -55,7 +55,7 @@ pub(crate) enum StorageTimeOperation {
    CreateTenant,
 }

-pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
        "pageserver_storage_operations_seconds_sum",
        "Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_storage_operations_seconds_count",
        "Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) struct PageCacheMetricsForTaskKind {
+pub struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,

@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub(crate) struct PageCacheMetrics {
+pub struct PageCacheMetrics {
    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
    }
 }

-pub(crate) struct PageCacheSizeMetrics {
+pub struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

+    pub current_bytes_ephemeral: UIntGauge,
    pub current_bytes_immutable: UIntGauge,
    pub current_bytes_materialized_page: UIntGauge,
 }
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
-    Lazy::new(|| PageCacheSizeMetrics {
-        max_bytes: {
-            register_uint_gauge!(
-                "pageserver_page_cache_size_max_bytes",
-                "Maximum size of the page cache in bytes"
-            )
-            .expect("failed to define a metric")
-        },
-        current_bytes_immutable: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["immutable"])
-                .unwrap()
-        },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
-    });
+pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
+    max_bytes: {
+        register_uint_gauge!(
+            "pageserver_page_cache_size_max_bytes",
+            "Maximum size of the page cache in bytes"
+        )
+        .expect("failed to define a metric")
+    },
+
+    current_bytes_ephemeral: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+    current_bytes_immutable: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+    current_bytes_materialized_page: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+});

 pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;
@@ -734,13 +740,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub(crate) struct EvictionsWithLowResidenceDuration {
+pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
    threshold: Duration,
    counter: Option<IntCounter>,
 }

-pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
+pub struct EvictionsWithLowResidenceDurationBuilder {
    data_source: &'static str,
    threshold: Duration,
 }
@@ -1003,7 +1009,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub(crate) struct SmgrQueryTimePerTimeline {
+pub struct SmgrQueryTimePerTimeline {
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1175,8 +1181,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime(HistogramVec);
-pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
@@ -1196,7 +1202,7 @@ impl DurationResultObserver for BasebackupQueryTime {
    }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
        "Number of live network connections",
@@ -1363,8 +1369,6 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
    pub(crate) upload_heatmap_duration: Histogram,
-    pub(crate) download_heatmap: IntCounter,
-    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
@@ -1382,16 +1386,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
-    download_heatmap: register_int_counter!(
-        "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
-    download_layer: register_int_counter!(
-        "pageserver_secondary_download_layer",
-        "Number of downloads of layers by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1661,7 +1655,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub(crate) struct StorageTimeMetricsTimer {
+pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
 }
@@ -1686,7 +1680,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub(crate) struct StorageTimeMetrics {
+pub struct StorageTimeMetrics {
    /// Sum of f64 seconds, per operation, tenant_id and timeline_id
    timeline_sum: Counter,
    /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1725,7 +1719,7 @@ impl StorageTimeMetrics {
 }

 #[derive(Debug)]
-pub(crate) struct TimelineMetrics {
+pub struct TimelineMetrics {
    tenant_id: String,
    shard_id: String,
    timeline_id: String,
@@ -1933,7 +1927,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
    }
 }

-pub(crate) struct RemoteTimelineClientMetrics {
+pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
@@ -2231,7 +2225,7 @@ impl Drop for RemoteTimelineClientMetrics {

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
+pub trait MeasureRemoteOp: Sized {
    fn measure_remote_op(
        self,
        tenant_id: TenantId,
@@ -2256,7 +2250,7 @@ pub(crate) trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}

 pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
+    pub struct MeasuredRemoteOp<F>
    {
        #[pin]
        inner: F,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
-use std::borrow::Cow;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
-use crate::tenant::timeline::WaitLsnError;
-use crate::tenant::GetTimelineError;
-use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -287,64 +283,6 @@ struct PageServerHandler {
    connection_ctx: RequestContext,
 }

-#[derive(thiserror::Error, Debug)]
-enum PageStreamError {
-    /// We encountered an error that should prompt the client to reconnect:
-    /// in practice this means we drop the connection without sending a response.
-    #[error("Reconnect required: {0}")]
-    Reconnect(Cow<'static, str>),
-
-    /// We were instructed to shutdown while processing the query
-    #[error("Shutting down")]
-    Shutdown,
-
-    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
-
-    /// Ran out of time waiting for an LSN
-    #[error("LSN timeout: {0}")]
-    LsnTimeout(WaitLsnError),
-
-    /// The entity required to serve the request (tenant or timeline) is not found,
-    /// or is not found in a suitable state to serve a request.
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
-
-    /// Request asked for something that doesn't make sense, like an invalid LSN
-    #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
-}
-
-impl From<PageReconstructError> for PageStreamError {
-    fn from(value: PageReconstructError) -> Self {
-        match value {
-            PageReconstructError::Cancelled => Self::Shutdown,
-            e => Self::Read(e),
-        }
-    }
-}
-
-impl From<GetActiveTimelineError> for PageStreamError {
-    fn from(value: GetActiveTimelineError) -> Self {
-        match value {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
-            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
-            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
-        }
-    }
-}
-
-impl From<WaitLsnError> for PageStreamError {
-    fn from(value: WaitLsnError) -> Self {
-        match value {
-            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
-            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
-        }
-    }
-}
-
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
@@ -490,7 +428,7 @@ impl PageServerHandler {
        // Check that the timeline exists
        let timeline = tenant
            .get_timeline(timeline_id, true)
-            .map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
+            .map_err(|e| anyhow::anyhow!(e))?;

        // Avoid starting new requests if the timeline has already started shutting down,
        // and block timeline shutdown until this request is complete, or drops out due
@@ -582,44 +520,32 @@ impl PageServerHandler {
                }
            };

-            match response {
-                Err(PageStreamError::Shutdown) => {
+            if let Err(e) = &response {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_cancelled(): [`Timeline::shutdown`] has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
-                    span.in_scope(|| info!("dropping connection due to shutdown"));
+                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
                    return Err(QueryError::Shutdown);
                }
-                Err(PageStreamError::Reconnect(reason)) => {
-                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                    return Err(QueryError::Reconnect);
-                }
-                Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
-                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
-                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
-                    //
-                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                    // because wait_lsn etc will drop out
-                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                    // is_canceled(): [`Timeline::shutdown`]` has entered
-                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-                r => {
-                    let response_msg = r.unwrap_or_else(|e| {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
-                    });
-
-                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-                    self.flush_cancellable(pgb, &timeline.cancel).await?;
-                }
            }
+
+            let response = response.unwrap_or_else(|e| {
+                // print all the details to the log with {:#}, but for the client the
+                // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                // here includes cancellation which is not an error.
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: e.to_string(),
+                })
+            });
+
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
+            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -766,7 +692,7 @@ impl PageServerHandler {
        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
-    ) -> Result<Lsn, PageStreamError> {
+    ) -> anyhow::Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -797,19 +723,15 @@ impl PageServerHandler {
            }
        } else {
            if lsn == Lsn(0) {
-                return Err(PageStreamError::BadRequest(
-                    "invalid LSN(0) in request".into(),
-                ));
+                anyhow::bail!("invalid LSN(0) in request");
            }
            timeline.wait_lsn(lsn, ctx).await?;
        }
-
-        if lsn < **latest_gc_cutoff_lsn {
-            return Err(PageStreamError::BadRequest(format!(
-                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                lsn, **latest_gc_cutoff_lsn
-            ).into()));
-        }
+        anyhow::ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
        Ok(lsn)
    }

@@ -818,14 +740,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .get_rel_exists(req.rel, lsn, req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -838,15 +760,13 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

-        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
+        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
@@ -858,20 +778,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let total_blocks = timeline
-            .get_db_size(
-                DEFAULTTABLESPACE_OID,
-                req.dbnode,
-                Version::Lsn(lsn),
-                req.latest,
-                ctx,
-            )
+            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -880,35 +794,30 @@ impl PageServerHandler {
        }))
    }

-    async fn do_handle_get_page_at_lsn_request(
-        &self,
-        timeline: &Timeline,
-        req: &PagestreamGetPageRequest,
-        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn =
-            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
-                .await?;
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
-
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
-    }
-
    async fn handle_get_page_at_lsn_request(
        &self,
        timeline: &Timeline,
        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+        /*
+        // Add a 1s delay to some requests. The delay helps the requests to
+        // hit the race condition from github issue #1047 more easily.
+        use rand::Rng;
+        if rand::thread_rng().gen::<u8>() < 5 {
+            std::thread::sleep(std::time::Duration::from_millis(1000));
+        }
+        */
+
        let key = rel_block_to_key(req.rel, req.blkno);
-        if timeline.get_shard_identity().is_key_local(&key) {
-            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
-                .await
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
        } else {
            // The Tenant shard we looked up at connection start does not hold this particular
            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
@@ -927,30 +836,30 @@ impl PageServerHandler {
                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                    // We already know this tenant exists in general, because we resolved it at
                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                    // mapping is out of date.
-                    tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
-                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
-                    // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                    // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                    // and talk to a different pageserver.
-                    return Err(PageStreamError::Reconnect(
-                        "getpage@lsn request routed to wrong shard".into(),
-                    ));
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
                }
                Err(e) => return Err(e.into()),
            };

            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline
-                .gate
-                .enter()
-                .map_err(|_| PageStreamError::Shutdown)?;
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };

-            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
-                .await
-        }
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
    }

    #[allow(clippy::too_many_arguments)]
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
        )
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
        Ok(timeline)
    }
 }
@@ -1500,8 +1411,7 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::Cancelled
-            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                QueryError::Shutdown
            }
            e => QueryError::Other(anyhow::anyhow!(e)),
@@ -1514,15 +1424,14 @@ enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
-    Timeline(#[from] GetTimelineError),
+    Timeline(anyhow::Error),
 }

 impl From<GetActiveTimelineError> for QueryError {
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
        }
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{ensure, Context};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,7 +147,6 @@ impl Timeline {
    {
        DatadirModification {
            tline: self,
-            pending_lsns: Vec::new(),
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
@@ -160,11 +159,11 @@ impl Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    pub(crate) async fn get_rel_page_at_lsn(
+    pub async fn get_rel_page_at_lsn(
        &self,
        tag: RelTag,
        blknum: BlockNumber,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -174,47 +173,44 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
+        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag,
-                blknum,
-                version.get_lsn(),
-                nblocks
+                tag, blknum, lsn, nblocks
            );
            return Ok(ZERO_PAGE.clone());
        }

        let key = rel_block_to_key(tag, blknum);
-        version.get(self, key, ctx).await
+        self.get(key, lsn, ctx).await
    }

    // Get size of a database in blocks
-    pub(crate) async fn get_db_size(
+    pub async fn get_db_size(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
    }

    /// Get size of a relation file
-    pub(crate) async fn get_rel_size(
+    pub async fn get_rel_size(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
@@ -224,12 +220,12 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(nblocks);
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, latest, ctx).await?
+            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -239,7 +235,7 @@ impl Timeline {
        }

        let key = rel_size_to_key(tag);
-        let mut buf = version.get(self, key, ctx).await?;
+        let mut buf = self.get(key, lsn, ctx).await?;
        let nblocks = buf.get_u32_le();

        if latest {
@@ -250,16 +246,16 @@ impl Timeline {
            // latest=true, then it cannot cause cache corruption, because with latest=true
            // the pageserver chooses max(request_lsn, last_written_lsn), so the cached value will be
            // associated with the most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+            self.update_cached_rel_size(tag, lsn, nblocks);
        }
        Ok(nblocks)
    }

    /// Does relation exist?
-    pub(crate) async fn get_rel_exists(
+    pub async fn get_rel_exists(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
@@ -270,12 +266,12 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(true);
        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -291,16 +287,16 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn list_rels(
+    pub async fn list_rels(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -319,7 +315,7 @@ impl Timeline {
    }

    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
+    pub async fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -332,29 +328,29 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_size(
+    pub async fn get_slru_segment_size(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = version.get(self, key, ctx).await?;
+        let mut buf = self.get(key, lsn, ctx).await?;
        Ok(buf.get_u32_le())
    }

    /// Does the SLRU segment exist?
-    pub(crate) async fn get_slru_segment_exists(
+    pub async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -372,7 +368,7 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub(crate) async fn find_lsn_for_timestamp(
+    pub async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
        cancel: &CancellationToken,
@@ -452,7 +448,7 @@ impl Timeline {
    /// Additionally, sets 'found_smaller'/'found_larger', if it encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    pub(crate) async fn is_latest_commit_timestamp_ge_than(
+    pub async fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -475,7 +471,7 @@ impl Timeline {
    /// Obtain the possible timestamp range for the given lsn.
    ///
    /// If the lsn has no timestamps, returns None. Returns `(min, max, median)` if it has timestamps.
-    pub(crate) async fn get_timestamp_for_lsn(
+    pub async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -505,11 +501,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
                .await?;
            for blknum in (0..nblocks).rev() {
                let clog_page = self
@@ -532,36 +528,36 @@ impl Timeline {
    }

    /// Get a list of SLRU segments
-    pub(crate) async fn list_slru_segments(
+    pub async fn list_slru_segments(
        &self,
        kind: SlruKind,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<u32>, PageReconstructError> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => Ok(dir.segments),
            Err(e) => Err(PageReconstructError::from(e)),
        }
    }

-    pub(crate) async fn get_relmap_file(
+    pub async fn get_relmap_file(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        let key = relmap_file_key(spcnode, dbnode);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        Ok(buf)
    }

-    pub(crate) async fn list_dbdirs(
+    pub async fn list_dbdirs(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -575,7 +571,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_twophase_file(
+    pub async fn get_twophase_file(
        &self,
        xid: TransactionId,
        lsn: Lsn,
@@ -586,7 +582,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub(crate) async fn list_twophase_files(
+    pub async fn list_twophase_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -600,7 +596,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_control_file(
+    pub async fn get_control_file(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -608,7 +604,7 @@ impl Timeline {
        self.get(CONTROLFILE_KEY, lsn, ctx).await
    }

-    pub(crate) async fn get_checkpoint(
+    pub async fn get_checkpoint(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -616,7 +612,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub(crate) async fn list_aux_files(
+    pub async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -656,10 +652,7 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
-                .await?
-            {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -699,7 +692,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(spcnode, dbnode, lsn, ctx)
                .await?
                .into_iter()
                .collect();
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
    /// in the state in 'tline' yet.
    pub tline: &'a Timeline,

-    /// Current LSN of the modification
-    lsn: Lsn,
+    /// Lsn assigned by begin_modification
+    pub lsn: Lsn,

    // The modifications are not applied directly to the underlying key-value store.
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
-    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
-    pending_deletions: Vec<(Range<Key>, Lsn)>,
+    pending_updates: HashMap<Key, Value>,
+    pending_deletions: Vec<Range<Key>>,
    pending_nblocks: i64,
 }

 impl<'a> DatadirModification<'a> {
-    /// Get the current lsn
-    pub(crate) fn get_lsn(&self) -> Lsn {
-        self.lsn
-    }
-
-    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
-            lsn >= self.lsn,
-            "setting an older lsn {} than {} is not allowed",
-            lsn,
-            self.lsn
-        );
-        if lsn > self.lsn {
-            self.pending_lsns.push(self.lsn);
-            self.lsn = lsn;
-        }
-        Ok(())
-    }
-
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -1012,9 +984,11 @@ impl<'a> DatadirModification<'a> {
        dbnode: Oid,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let req_lsn = self.tline.get_last_record_lsn();
+
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
+            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1103,11 +1077,8 @@ impl<'a> DatadirModification<'a> {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        if self
-            .tline
-            .get_rel_exists(rel, Version::Modified(self), true, ctx)
-            .await?
-        {
+        let last_lsn = self.tline.get_last_record_lsn();
+        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1352,23 +1323,17 @@ impl<'a> DatadirModification<'a> {
        let writer = self.tline.writer().await;

        // Flush relation and SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
-                if is_rel_block_key(&key) || is_slru_block_key(key) {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
-                } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
-                }
+        let mut retained_pending_updates = HashMap::new();
+        for (key, value) in self.pending_updates.drain() {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
+                // This bails out on first error without modifying pending_updates.
+                // That's Ok, cf this function's doc comment.
+                writer.put(key, self.lsn, &value, ctx).await?;
+            } else {
+                retained_pending_updates.insert(key, value);
            }
        }
-
-        self.pending_updates = retained_pending_updates;
+        self.pending_updates.extend(retained_pending_updates);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1385,28 +1350,18 @@ impl<'a> DatadirModification<'a> {
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
-
+        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        if !self.pending_updates.is_empty() {
-            writer.put_batch(&self.pending_updates, ctx).await?;
-            self.pending_updates.clear();
+        for (key, value) in self.pending_updates.drain() {
+            writer.put(key, lsn, &value, ctx).await?;
+        }
+        for key_range in self.pending_deletions.drain(..) {
+            writer.delete(key_range, lsn).await?;
        }

-        if !self.pending_deletions.is_empty() {
-            writer.delete_batch(&self.pending_deletions).await?;
-            self.pending_deletions.clear();
-        }
-
-        self.pending_lsns.push(self.lsn);
-        for pending_lsn in self.pending_lsns.drain(..) {
-            // Ideally, we should be able to call writer.finish_write() only once
-            // with the highest LSN. However, the last_record_lsn variable in the
-            // timeline keeps track of the latest LSN and the immediate previous LSN
-            // so we need to record every LSN to not leave a gap between them.
-            writer.finish_write(pending_lsn);
-        }
+        writer.finish_write(lsn);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1415,86 +1370,44 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
    }

    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
+        // Have we already updated the same key? Read the pending updated
        // version in that case.
        //
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::from(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if let Some(value) = self.pending_updates.get(&key) {
+            if let Value::Image(img) = value {
+                Ok(img.clone())
+            } else {
+                // Currently, we never need to read back a WAL record that we
+                // inserted in the same "transaction". All the metadata updates
+                // work directly with Images, and we never need to read actual
+                // data pages. We could handle this if we had to, by calling
+                // the walredo manager, but let's keep it simple for now.
+                Err(PageReconstructError::from(anyhow::anyhow!(
+                    "unexpected pending WAL record"
+                )))
            }
+        } else {
+            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+            self.tline.get(key, lsn, ctx).await
        }
-        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-        self.tline.get(key, lsn, ctx).await
    }

    fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
-        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
-            if *last_lsn == self.lsn {
-                *last_value = val;
-                return;
-            }
-        }
-        values.push((self.lsn, val));
+        self.pending_updates.insert(key, val);
    }

    fn delete(&mut self, key_range: Range<Key>) {
        trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push((key_range, self.lsn));
-    }
-}
-
-/// This struct facilitates accessing either a committed key from the timeline at a
-/// specific LSN, or the latest uncommitted key from a pending modification.
-/// During WAL ingestion, the records from multiple LSNs may be batched in the same
-/// modification before being flushed to the timeline. Hence, the routines in WalIngest
-/// need to look up the keys in the modification first before looking them up in the
-/// timeline to not miss the latest updates.
-#[derive(Clone, Copy)]
-pub enum Version<'a> {
-    Lsn(Lsn),
-    Modified(&'a DatadirModification<'a>),
-}
-
-impl<'a> Version<'a> {
-    async fn get(
-        &self,
-        timeline: &Timeline,
-        key: Key,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        match self {
-            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
-            Version::Modified(modification) => modification.get(key, ctx).await,
-        }
-    }
-
-    fn get_lsn(&self) -> Lsn {
-        match self {
-            Version::Lsn(lsn) => *lsn,
-            Version::Modified(modification) => modification.lsn,
-        }
+        self.pending_deletions.push(key_range);
    }
 }

--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -23,7 +23,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
+    #[allow(clippy::useless_conversion)]
    pub fn blocks(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
@@ -32,7 +32,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
+    #[allow(clippy::useless_conversion)]
    pub fn blocks_available(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
    // else, but that has not been needed in a long time.
    std::env::var("TOKIO_WORKER_THREADS")
        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
 });

 #[derive(Debug, Clone, Copy)]
@@ -258,9 +258,6 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

-    /// See [`crate::tenant::secondary`].
-    SecondaryDownloads,
-
    /// See [`crate::tenant::secondary`].
    SecondaryUploads,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,7 +12,7 @@
 //!

 use anyhow::{bail, Context};
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::FutureExt;
@@ -33,7 +33,6 @@ use tracing::*;
 use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::failpoint_support;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
@@ -56,7 +55,6 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
-use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -130,13 +128,6 @@ macro_rules! pausable_failpoint {
            .expect("spawn_blocking");
        }
    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
-            }
-        }
-    };
 }

 pub mod blob_io;
@@ -603,9 +594,10 @@ impl Tenant {
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));

        let TenantSharedResources {
@@ -898,7 +890,7 @@ impl Tenant {
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

-        failpoint_support::sleep_millis_async!("before-attaching-tenant");
+        crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");

        let preload = match preload {
            Some(p) => p,
@@ -1010,7 +1002,7 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
+        crate::failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

@@ -1152,9 +1144,10 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
@@ -1766,15 +1759,7 @@ impl Tenant {
                    // decoding the new WAL might need to look up previous pages, relation
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
-                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
-                        .await
-                        .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
-                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
-                            }
-                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
-                        })?;
+                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                }

                self.branch_timeline(
@@ -2043,13 +2028,6 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        // If we're still attaching, fire the cancellation token early to drop out: this
-        // will prevent us flushing, but ensures timely shutdown if some I/O during attach
-        // is very slow.
-        if matches!(self.current_state(), TenantState::Attaching) {
-            self.cancel.cancel();
-        }
-
        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
@@ -2748,10 +2726,6 @@ impl Tenant {
 "#
        .to_string();

-        fail::fail_point!("tenant-config-before-write", |_| {
-            anyhow::bail!("tenant-config-before-write");
-        });
-
        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;

@@ -2865,7 +2839,9 @@ impl Tenant {
            }
        };

-        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+        crate::failpoint_support::sleep_millis_async!(
+            "gc_iteration_internal_after_getting_gc_timelines"
+        );

        // If there is nothing to GC, we don't want any messages in the INFO log.
        if !gc_timelines.is_empty() {
@@ -3668,6 +3644,173 @@ fn remove_timeline_and_uninit_mark(
    Ok(())
 }

+/// Creates the on-disk directory structure for a new tenant.
+///
+/// The files are first written into a temporary directory (derived from the target path
+/// and `TEMP_FILE_SUFFIX`) and then renamed into place, so the final tenant directory only
+/// appears once its contents have been written.
+///
+/// Returns the path of the created tenant directory.
+///
+/// # Errors
+///
+/// Fails if the tenant directory already exists or if any of the filesystem operations
+/// fail. On failure the temporary directory is removed on a best-effort basis; problems
+/// during that cleanup are only logged.
+pub(crate) async fn create_tenant_files(
+    conf: &'static PageServerConf,
+    location_conf: &LocationConf,
+    tenant_shard_id: &TenantShardId,
+) -> anyhow::Result<Utf8PathBuf> {
+    let target_tenant_directory = conf.tenant_path(tenant_shard_id);
+    anyhow::ensure!(
+        !target_tenant_directory
+            .try_exists()
+            .context("check existence of tenant directory")?,
+        "tenant directory already exists",
+    );
+
+    let temporary_tenant_dir =
+        path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
+    debug!("Creating temporary directory structure in {temporary_tenant_dir}");
+
+    // top-level dir may exist if we are creating it through CLI
+    crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
+        format!("could not create temporary tenant directory {temporary_tenant_dir}")
+    })?;
+
+    let creation_result = try_create_target_tenant_dir(
+        conf,
+        location_conf,
+        tenant_shard_id,
+        &temporary_tenant_dir,
+        &target_tenant_directory,
+    )
+    .await;
+
+    if creation_result.is_err() {
+        error!(
+            "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data"
+        );
+        if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
+            error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
+        } else if let Some(parent) = temporary_tenant_dir.parent() {
+            // The temporary directory no longer exists, so it cannot be opened and fsynced
+            // itself: the removal is made durable by syncing the directory that held it.
+            if let Err(e) = crashsafe::fsync(parent) {
+                error!(
+                    "Failed to fsync parent of removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
+                )
+            }
+        }
+    }
+
+    creation_result?;
+
+    Ok(target_tenant_directory)
+}
+
+/// Populates `temporary_tenant_dir` and moves it to `target_tenant_directory`.
+///
+/// Persists the tenant configuration and creates the timelines directory under the
+/// temporary directory, fsyncs it, renames it to the target directory, and finally fsyncs
+/// the target's parent directory.
+///
+/// # Errors
+///
+/// Returns the first failure of any step. The temporary directory is not cleaned up here.
+async fn try_create_target_tenant_dir(
+    conf: &'static PageServerConf,
+    location_conf: &LocationConf,
+    tenant_shard_id: &TenantShardId,
+    temporary_tenant_dir: &Utf8Path,
+    target_tenant_directory: &Utf8Path,
+) -> Result<(), anyhow::Error> {
+    let temporary_tenant_timelines_dir = rebase_directory(
+        &conf.timelines_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?;
+    let temporary_legacy_tenant_config_path = rebase_directory(
+        &conf.tenant_config_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary legacy config path"))?;
+    let temporary_tenant_config_path = rebase_directory(
+        &conf.tenant_location_config_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
+
+    Tenant::persist_tenant_config_at(
+        tenant_shard_id,
+        &temporary_tenant_config_path,
+        &temporary_legacy_tenant_config_path,
+        location_conf,
+    )
+    .await?;
+
+    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
+        format!(
+            "create tenant {} temporary timelines directory {}",
+            tenant_shard_id, temporary_tenant_timelines_dir,
+        )
+    })?;
+    fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
+        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
+    });
+
+    // Make sure the current tenant directory entries are durable before renaming.
+    // Without this, a crash may reorder any of the directory entry creations above.
+    crashsafe::fsync(temporary_tenant_dir)
+        .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
+
+    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
+        format!(
+            "move tenant {} temporary directory {} into the permanent one {}",
+            tenant_shard_id, temporary_tenant_dir, target_tenant_directory
+        )
+    })?;
+    let target_dir_parent = target_tenant_directory.parent().with_context(|| {
+        format!(
+            "get tenant {} dir parent for {}",
+            tenant_shard_id, target_tenant_directory,
+        )
+    })?;
+    crashsafe::fsync(target_dir_parent).with_context(|| {
+        format!(
+            "fsync renamed directory's parent {} for tenant {}",
+            target_dir_parent, tenant_shard_id,
+        )
+    })?;
+
+    Ok(())
+}
+
+/// Re-roots `original_path` from `base` onto `new_base`.
+///
+/// For example, rebasing `/a/b/c` from base `/a` onto `/x` yields `/x/b/c`.
+///
+/// # Errors
+///
+/// Fails if `original_path` does not start with `base`.
+fn rebase_directory(
+    original_path: &Utf8Path,
+    base: &Utf8Path,
+    new_base: &Utf8Path,
+) -> anyhow::Result<Utf8PathBuf> {
+    let relative_path = original_path.strip_prefix(base).with_context(|| {
+        format!(
+            "Failed to strip base prefix '{}' off path '{}'",
+            base, original_path
+        )
+    })?;
+    Ok(new_base.join(relative_path))
+}
+
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
 /// to get bootstrap data for timeline initialization.
 async fn run_initdb(
@@ -3762,7 +3872,6 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
    use bytes::{Bytes, BytesMut};
-    use camino::Utf8PathBuf;
    use once_cell::sync::OnceCell;
    use pageserver_api::shard::ShardIndex;
    use std::fs;
@@ -3830,6 +3939,8 @@ pub(crate) mod harness {
    pub struct TenantHarness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
+        // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id
+        pub(crate) tenant_id: TenantId,
        pub tenant_shard_id: TenantShardId,
        pub generation: Generation,
        pub shard: ShardIndex,
@@ -3891,6 +4002,7 @@ pub(crate) mod harness {
            Ok(Self {
                conf,
                tenant_conf,
+                tenant_id,
                tenant_shard_id,
                generation: Generation::new(0xdeadbeef),
                shard: ShardIndex::unsharded(),
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -46,8 +46,6 @@ pub mod defaults {
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -588,7 +588,7 @@ impl DeleteTenantFlow {
                            }
                            break;
                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
                            // This is unexpected: this secondary tenants should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -35,7 +35,7 @@ use crate::tenant::config::{
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
-use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -44,7 +44,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
-use super::secondary::SecondaryTenant;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -58,7 +57,7 @@ use super::TenantSharedResources;
 /// having a properly acquired generation (Secondary doesn't need a generation)
 pub(crate) enum TenantSlot {
    Attached(Arc<Tenant>),
-    Secondary(Arc<SecondaryTenant>),
+    Secondary,
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
    InProgress(utils::completion::Barrier),
@@ -68,7 +67,7 @@ impl std::fmt::Debug for TenantSlot {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
-            Self::Secondary(_) => write!(f, "Secondary"),
+            Self::Secondary => write!(f, "Secondary"),
            Self::InProgress(_) => write!(f, "InProgress"),
        }
    }
@@ -79,7 +78,7 @@ impl TenantSlot {
    fn get_attached(&self) -> Option<&Arc<Tenant>> {
        match self {
            Self::Attached(t) => Some(t),
-            Self::Secondary(_) => None,
+            Self::Secondary => None,
            Self::InProgress(_) => None,
        }
    }
@@ -131,7 +130,7 @@ impl TenantsMap {

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
+    fn resolve_shard(
        &self,
        tenant_id: &TenantId,
        selector: ShardSelector,
@@ -141,27 +140,25 @@ impl TenantsMap {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
                    match selector {
                        ShardSelector::First => return Some(*slot.0),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return Some(*slot.0)
                        }
                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }

-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
                            }
                        }
                        _ => continue,
@@ -467,18 +464,12 @@ pub async fn init_tenant_mgr(
                *gen
            } else {
                match &location_conf.mode {
-                    LocationMode::Secondary(secondary_config) => {
+                    LocationMode::Secondary(_) => {
                        // We do not require the control plane's permission for secondary mode
                        // tenants, because they do no remote writes and hence require no
                        // generation number
                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                secondary_config,
-                            )),
-                        );
+                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -670,14 +661,8 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {

                            total_attached += 1;
                        }
-                        TenantSlot::Secondary(state) => {
-                            // We don't need to wait for this individually per-tenant: the
-                            // downloader task will be waited on eventually, this cancel
-                            // is just to encourage it to drop out if it is doing work
-                            // for this tenant right now.
-                            state.cancel.cancel();
-
-                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
+                        TenantSlot::Secondary => {
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -754,6 +739,56 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
 }

+/// Creates a new tenant on local disk, spawns it, and stores it in its tenant map slot.
+///
+/// The slot for `tenant_shard_id` is acquired with `TenantSlotAcquireMode::MustNotExist`
+/// before any files are written, and is filled with the attached tenant once it has been
+/// spawned in `SpawnMode::Create`.
+///
+/// # Errors
+///
+/// Returns an error if the slot cannot be acquired, if creating the tenant files fails, if
+/// the location config cannot be converted to an attached config, or if spawning or
+/// inserting the tenant fails.
+pub(crate) async fn create_tenant(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    tenant_shard_id: TenantShardId,
+    generation: Generation,
+    resources: TenantSharedResources,
+    ctx: &RequestContext,
+) -> Result<Arc<Tenant>, TenantMapInsertError> {
+    let location_conf = LocationConf::attached_single(tenant_conf, generation);
+    info!("Creating tenant at location {location_conf:?}");
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
+    let tenant_path = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
+
+    let shard_identity = location_conf.shard;
+    let created_tenant = tenant_spawn(
+        conf,
+        tenant_shard_id,
+        &tenant_path,
+        resources,
+        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
+        None,
+        &TENANTS,
+        SpawnMode::Create,
+        ctx,
+    )?;
+    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
+    //      See https://github.com/neondatabase/neon/issues/4233
+
+    let created_tenant_id = created_tenant.tenant_id();
+    debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id);
+
+    slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?;
+
+    Ok(created_tenant)
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SetNewTenantConfigError {
    #[error(transparent)]
@@ -785,24 +809,6 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum UpsertLocationError {
-    #[error("Bad config request: {0}")]
-    BadRequest(anyhow::Error),
-
-    #[error("Cannot change config in this state: {0}")]
-    Unavailable(#[from] TenantMapError),
-
-    #[error("Tenant is already being modified")]
-    InProgress,
-
-    #[error("Failed to flush: {0}")]
-    Flush(anyhow::Error),
-
-    #[error("Internal error: {0}")]
-    Other(#[from] anyhow::Error),
-}
-
 impl TenantManager {
    /// Convenience function so that anyone with a TenantManager can get at the global configuration, without
    /// having to pass it around everywhere as a separate object.
@@ -839,49 +845,27 @@ impl TenantManager {
            Some(TenantSlot::InProgress(_)) => {
                Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
            }
-            None | Some(TenantSlot::Secondary(_)) => {
+            None | Some(TenantSlot::Secondary) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
            }
        }
    }

-    pub(crate) fn get_secondary_tenant_shard(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Option<Arc<SecondaryTenant>> {
-        let locked = self.tenants.read().unwrap();
-
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-            .ok()
-            .flatten();
-
-        match peek_slot {
-            Some(TenantSlot::Secondary(s)) => Some(s.clone()),
-            _ => None,
-        }
-    }
-
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
        flush: Option<Duration>,
-        spawn_mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
+    ) -> Result<(), anyhow::Error> {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

-        enum FastPathModified {
-            Attached(Arc<Tenant>),
-            Secondary(Arc<SecondaryTenant>),
-        }
-
-        // Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
+        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
-        let fast_path_taken = {
+        let modify_tenant = {
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -891,24 +875,16 @@ impl TenantManager {
                        // A transition from Attached to Attached in the same generation, we may
                        // take our fast path and just provide the updated configuration
                        // to the tenant.
-                        tenant.set_new_location_config(
-                            AttachedTenantConf::try_from(new_location_config.clone())
-                                .map_err(UpsertLocationError::BadRequest)?,
-                        );
+                        tenant.set_new_location_config(AttachedTenantConf::try_from(
+                            new_location_config.clone(),
+                        )?);

-                        Some(FastPathModified::Attached(tenant.clone()))
+                        Some(tenant.clone())
                    } else {
                        // Different generations, fall through to general case
                        None
                    }
                }
-                (
-                    LocationMode::Secondary(secondary_conf),
-                    Some(TenantSlot::Secondary(secondary_tenant)),
-                ) => {
-                    secondary_tenant.set_config(secondary_conf);
-                    Some(FastPathModified::Secondary(secondary_tenant.clone()))
-                }
                _ => {
                    // Not an Attached->Attached transition, fall through to general case
                    None
@@ -917,107 +893,69 @@ impl TenantManager {
        };

        // Fast-path continued: having dropped out of the self.tenants lock, do the async
-        // phase of writing config and/or waiting for flush, before returning.
-        match fast_path_taken {
-            Some(FastPathModified::Attached(tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await?;
-
-                // Transition to AttachedStale means we may well hold a valid generation
-                // still, and have been requested to go stale as part of a migration.  If
-                // the caller set `flush`, then flush to remote storage.
-                if let LocationMode::Attached(AttachedLocationConfig {
-                    generation: _,
-                    attach_mode: AttachmentMode::Stale,
-                }) = &new_location_config.mode
-                {
-                    if let Some(flush_timeout) = flush {
-                        match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
-                            Ok(Err(e)) => {
-                                return Err(UpsertLocationError::Flush(e));
-                            }
-                            Ok(Ok(_)) => return Ok(Some(tenant)),
-                            Err(_) => {
-                                tracing::warn!(
+        // phase of waiting for flush, before returning.
+        if let Some(tenant) = modify_tenant {
+            // Transition to AttachedStale means we may well hold a valid generation
+            // still, and have been requested to go stale as part of a migration.  If
+            // the caller set `flush`, then flush to remote storage.
+            if let LocationMode::Attached(AttachedLocationConfig {
+                generation: _,
+                attach_mode: AttachmentMode::Stale,
+            }) = &new_location_config.mode
+            {
+                if let Some(flush_timeout) = flush {
+                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                        Ok(Err(e)) => {
+                            return Err(e);
+                        }
+                        Ok(Ok(_)) => return Ok(()),
+                        Err(_) => {
+                            tracing::warn!(
                                timeout_ms = flush_timeout.as_millis(),
                                "Timed out waiting for flush to remote storage, proceeding anyway."
                            )
-                            }
                        }
                    }
                }
+            }

-                return Ok(Some(tenant));
-            }
-            Some(FastPathModified::Secondary(_secondary_tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await?;
-
-                return Ok(None);
-            }
-            None => {
-                // Proceed with the general case procedure, where we will shutdown & remove any existing
-                // slot contents and replace with a fresh one
-            }
-        };
+            return Ok(());
+        }

        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
        // InProgress value to the slot while we make whatever changes are required.  The state for
        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
-            .map_err(|e| match e {
-                TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
-                    unreachable!("Called with mode Any")
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+
+        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+            // The case where we keep a Tenant alive was covered above in the special case
+            // for Attached->Attached transitions in the same generation.  By this point,
+            // if we see an attached tenant we know it will be discarded and should be
+            // shut down.
+            let (_guard, progress) = utils::completion::channel();
+
+            match tenant.get_attach_mode() {
+                AttachmentMode::Single | AttachmentMode::Multi => {
+                    // Before we leave our state as the presumed holder of the latest generation,
+                    // flush any outstanding deletions to reduce the risk of leaking objects.
+                    self.resources.deletion_queue_client.flush_advisory()
                }
-                TenantSlotError::InProgress => UpsertLocationError::InProgress,
-                TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s),
-            })?;
-
-        match slot_guard.get_old_value() {
-            Some(TenantSlot::Attached(tenant)) => {
-                // The case where we keep a Tenant alive was covered above in the special case
-                // for Attached->Attached transitions in the same generation.  By this point,
-                // if we see an attached tenant we know it will be discarded and should be
-                // shut down.
-                let (_guard, progress) = utils::completion::channel();
-
-                match tenant.get_attach_mode() {
-                    AttachmentMode::Single | AttachmentMode::Multi => {
-                        // Before we leave our state as the presumed holder of the latest generation,
-                        // flush any outstanding deletions to reduce the risk of leaking objects.
-                        self.resources.deletion_queue_client.flush_advisory()
-                    }
-                    AttachmentMode::Stale => {
-                        // If we're stale there's not point trying to flush deletions
-                    }
-                };
-
-                info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, false).await {
-                    Ok(()) => {}
-                    Err(barrier) => {
-                        info!("Shutdown already in progress, waiting for it to complete");
-                        barrier.wait().await;
-                    }
+                AttachmentMode::Stale => {
+                    // If we're stale there's no point trying to flush deletions
+                }
+            };
+
+            info!("Shutting down attached tenant");
+            match tenant.shutdown(progress, false).await {
+                Ok(()) => {}
+                Err(barrier) => {
+                    info!("Shutdown already in progress, waiting for it to complete");
+                    barrier.wait().await;
                }
-                slot_guard.drop_old_value().expect("We just shut it down");
-            }
-            Some(TenantSlot::Secondary(state)) => {
-                info!("Shutting down secondary tenant");
-                state.shutdown().await;
-            }
-            Some(TenantSlot::InProgress(_)) => {
-                // This should never happen: acquire_slot should error out
-                // if the contents of a slot were InProgress.
-                return Err(UpsertLocationError::Other(anyhow::anyhow!(
-                    "Acquired an InProgress slot, this is a bug."
-                )));
-            }
-            None => {
-                // Slot was vacant, nothing needs shutting down.
            }
+            slot_guard.drop_old_value().expect("We just shut it down");
        }

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -1035,12 +973,12 @@ impl TenantManager {
        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;

        let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(secondary_config) => {
-                TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
-            }
+            LocationMode::Secondary(_) => TenantSlot::Secondary,
            LocationMode::Attached(_attach_config) => {
                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
@@ -1052,7 +990,7 @@ impl TenantManager {
                    shard_identity,
                    None,
                    self.tenants,
-                    spawn_mode,
+                    SpawnMode::Normal,
                    ctx,
                )?;

@@ -1060,20 +998,9 @@ impl TenantManager {
            }
        };

-        let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot {
-            Some(tenant.clone())
-        } else {
-            None
-        };
+        slot_guard.upsert(new_slot)?;

-        slot_guard.upsert(new_slot).map_err(|e| match e {
-            TenantSlotUpsertError::InternalError(e) => {
-                UpsertLocationError::Other(anyhow::anyhow!(e))
-            }
-            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
-        })?;
-
-        Ok(attached_tenant)
+        Ok(())
    }

    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1164,30 +1091,6 @@ impl TenantManager {
                .collect(),
        }
    }
-    // Do some synchronous work for all tenant slots in Secondary state.  The provided
-    // callback should be small and fast, as it will be called inside the global
-    // TenantsMap lock.
-    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
-    where
-        // TODO: let the callback return a hint to drop out of the loop early
-        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
-    {
-        let locked = self.tenants.read().unwrap();
-
-        let map = match &*locked {
-            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
-            TenantsMap::Open(m) => m,
-        };
-
-        for (tenant_id, slot) in map {
-            if let TenantSlot::Secondary(state) = slot {
-                // Only expose secondary tenants that are not currently shutting down
-                if !state.cancel.is_cancelled() {
-                    func(tenant_id, state)
-                }
-            }
-        }
-    }

    pub(crate) async fn delete_tenant(
        &self,
@@ -1302,7 +1205,7 @@ pub(crate) fn get_tenant(
        Some(TenantSlot::InProgress(_)) => {
            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
        }
-        None | Some(TenantSlot::Secondary(_)) => {
+        None | Some(TenantSlot::Secondary) => {
            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
        }
    }
@@ -1354,11 +1257,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
        let locked = TENANTS.read().unwrap();

        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked
-            .resolve_attached_shard(&tenant_id, shard_selector)
-            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                tenant_id,
-            )))?;
+        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+        )?;

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
@@ -1375,7 +1276,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    }
                }
            }
-            Some(TenantSlot::Secondary(_)) => {
+            Some(TenantSlot::Secondary) => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
                    tenant_id,
                )))
@@ -1639,12 +1540,61 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary(_) => None,
+            TenantSlot::Secondary => None,
            TenantSlot::InProgress(_) => None,
        })
        .collect())
 }

+/// Execute the legacy Attach mgmt API command (unsharded tenants only).
+///
+/// Takes a slot that must not exist yet, creates the tenant files, spawns the tenant and stores
+/// it as `Attached`. Tenant data is downloaded in the background, so this returns quickly.
+pub(crate) async fn attach_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    generation: Generation,
+    tenant_conf: TenantConfOpt,
+    resources: TenantSharedResources,
+    ctx: &RequestContext,
+) -> Result<(), TenantMapInsertError> {
+    // This is a legacy API (replaced by `/location_conf`). It does not support sharding.
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
+    let location_conf = LocationConf::attached_single(tenant_conf, generation);
+    let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
+    // TODO: the tenant directory remains on disk if we bail out from here on.
+    //       See https://github.com/neondatabase/neon/issues/4233
+
+    let shard_identity = location_conf.shard;
+    let attached_tenant = tenant_spawn(
+        conf,
+        tenant_shard_id,
+        &tenant_dir,
+        resources,
+        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
+        None,
+        &TENANTS,
+        SpawnMode::Normal,
+        ctx,
+    )?;
+    // TODO: the tenant object & its background loops remain, untracked in the tenant map, if we fail here.
+    //      See https://github.com/neondatabase/neon/issues/4233
+
+    let attached_tenant_id = attached_tenant.tenant_id();
+    if tenant_id != attached_tenant_id {
+        return Err(TenantMapInsertError::Other(anyhow::anyhow!(
+            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
+        )));
+    }
+
+    slot_guard.upsert(TenantSlot::Attached(attached_tenant))?;
+    Ok(())
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapInsertError {
    #[error(transparent)]
@@ -1658,7 +1608,7 @@ pub(crate) enum TenantMapInsertError {
 /// Superset of TenantMapError: issues that can occur when acquiring a slot
 /// for a particular tenant ID.
 #[derive(Debug, thiserror::Error)]
-pub(crate) enum TenantSlotError {
+pub enum TenantSlotError {
    /// When acquiring a slot with the expectation that the tenant already exists.
    #[error("Tenant {0} not found")]
    NotFound(TenantShardId),
@@ -1667,6 +1617,9 @@ pub(crate) enum TenantSlotError {
    #[error("tenant {0} already exists, state: {1:?}")]
    AlreadyExists(TenantShardId, TenantState),

+    #[error("tenant {0} already exists in but is not attached")]
+    Conflict(TenantShardId),
+
    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
    #[error("tenant has a state change in progress, try again later")]
@@ -1844,7 +1797,11 @@ impl SlotGuard {
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
+            Some(TenantSlot::Secondary) => {
+                // TODO: when adding secondary mode tenants, this will check for shutdown
+                // in the same way that we do for `Tenant` above
+                true
+            }
            Some(TenantSlot::InProgress(_)) => {
                // A SlotGuard cannot be constructed for a slot that was already InProgress
                unreachable!()
@@ -2054,19 +2011,26 @@ where
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

-    // allow pageserver shutdown to await for our completion
-    let (_guard, progress) = completion::channel();
-
    // The SlotGuard allows us to manipulate the Tenant object without fear of some
    // concurrent API request doing something else for the same tenant ID.
    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(tenant)) => {
+        Some(TenantSlot::Attached(t)) => Some(t),
+        _ => None,
+    };
+
+    // allow pageserver shutdown to wait for our completion
+    let (_guard, progress) = completion::channel();
+
+    // If the tenant was attached, shut it down gracefully.  For secondary
+    // locations this part is not necessary
+    match &attached_tenant {
+        Some(attached_tenant) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
            let freeze_and_flush = false;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match attached_tenant.shutdown(progress, freeze_and_flush).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2075,19 +2039,11 @@ where
                    return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                }
            }
-            Some(tenant)
        }
-        Some(TenantSlot::Secondary(secondary_state)) => {
-            tracing::info!("Shutting down in secondary mode");
-            secondary_state.shutdown().await;
-            None
+        None => {
+            // Nothing to wait on when not attached, proceed.
        }
-        Some(TenantSlot::InProgress(_)) => {
-            // Acquiring a slot guarantees its old value was not InProgress
-            unreachable!();
-        }
-        None => None,
-    };
+    }

    match tenant_cleanup
        .await
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -229,7 +229,6 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    TENANT_HEATMAP_BASENAME,
 };

 use utils::id::{TenantId, TimelineId};
@@ -819,25 +818,8 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
-        // Filter out any layers which were not created by this tenant shard.  These are
-        // layers that originate from some ancestor shard after a split, and may still
-        // be referenced by other shards. We are free to delete them locally and remove
-        // them from our index (and would have already done so when we reach this point
-        // in the code), but we may not delete them remotely.
-        with_metadata.retain(|(name, meta)| {
-            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
-                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
-            if !retain {
-                tracing::debug!(
-                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
-                    meta.shard
-                );
-            }
-            retain
-        });
-
        for (name, meta) in &with_metadata {
            info!(
                "scheduling deletion of layer {}{} (shard {})",
@@ -1742,11 +1724,11 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

+pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
 pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!(
-        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
-    ))
-    .expect("Failed to construct path")
+    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
+        .expect("Failed to construct path")
 }

 /// Given the key of an index, parse out the generation part of the name
@@ -1903,7 +1885,7 @@ mod tests {
        fn span(&self) -> tracing::Span {
            tracing::info_span!(
                "test",
-                tenant_id = %self.harness.tenant_shard_id.tenant_id,
+                tenant_id = %self.harness.tenant_id,
                timeline_id = %TIMELINE_ID
            )
        }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,48 +1,24 @@
-mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
-mod scheduler;

 use std::sync::Arc;

 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};

-use self::{
-    downloader::{downloader_task, SecondaryDetail},
-    heatmap_uploader::heatmap_uploader_task,
-};
+use self::heatmap_uploader::heatmap_uploader_task;

-use super::{config::SecondaryLocationConfig, mgr::TenantManager};
+use super::mgr::TenantManager;

 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;

 use tokio_util::sync::CancellationToken;
-use utils::{completion::Barrier, sync::gate::Gate};
+use utils::completion::Barrier;

-enum DownloadCommand {
-    Download(TenantShardId),
-}
 enum UploadCommand {
    Upload(TenantShardId),
 }

-impl UploadCommand {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Upload(id) => id,
-        }
-    }
-}
-
-impl DownloadCommand {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Download(id) => id,
-        }
-    }
-}
-
 struct CommandRequest<T> {
    payload: T,
    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -52,73 +28,12 @@ struct CommandResponse {
    result: anyhow::Result<()>,
 }

-// Whereas [`Tenant`] represents an attached tenant, this type represents the work
-// we do for secondary tenant locations: where we are not serving clients or
-// ingesting WAL, but we are maintaining a warm cache of layer files.
-//
-// This type is all about the _download_ path for secondary mode.  The upload path
-// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
-//
-// This structure coordinates TenantManager and SecondaryDownloader,
-// so that the downloader can indicate which tenants it is currently
-// operating on, and the manager can indicate when a particular
-// secondary tenant should cancel any work in flight.
-#[derive(Debug)]
-pub(crate) struct SecondaryTenant {
-    /// Carrying a tenant shard ID simplifies callers such as the downloader
-    /// which need to organize many of these objects by ID.
-    tenant_shard_id: TenantShardId,
-
-    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
-    /// any work for this tenant at the next opportunity.
-    pub(crate) cancel: CancellationToken,
-
-    pub(crate) gate: Gate,
-
-    detail: std::sync::Mutex<SecondaryDetail>,
-}
-
-impl SecondaryTenant {
-    pub(crate) fn new(
-        tenant_shard_id: TenantShardId,
-        config: &SecondaryLocationConfig,
-    ) -> Arc<Self> {
-        Arc::new(Self {
-            tenant_shard_id,
-            // todo: shall we make this a descendent of the
-            // main cancellation token, or is it sufficient that
-            // on shutdown we walk the tenants and fire their
-            // individual cancellations?
-            cancel: CancellationToken::new(),
-            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
-
-            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
-        })
-    }
-
-    pub(crate) async fn shutdown(&self) {
-        self.cancel.cancel();
-
-        // Wait for any secondary downloader work to complete
-        self.gate.close().await;
-    }
-
-    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
-        self.detail.lock().unwrap().config = config.clone();
-    }
-
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        &self.tenant_shard_id
-    }
-}
-
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant.  In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
-    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }

 impl SecondaryController {
@@ -148,13 +63,6 @@ impl SecondaryController {
        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
            .await
    }
-    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.dispatch(
-            &self.download_req_tx,
-            DownloadCommand::Download(tenant_shard_id),
-        )
-        .await
-    }
 }

 pub fn spawn_tasks(
@@ -163,37 +71,9 @@ pub fn spawn_tasks(
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
 ) -> SecondaryController {
-    let mgr_clone = tenant_manager.clone();
-    let storage_clone = remote_storage.clone();
-    let cancel_clone = cancel.clone();
-    let bg_jobs_clone = background_jobs_can_start.clone();
-
-    let (download_req_tx, download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryDownloads,
-        None,
-        None,
-        "secondary tenant downloads",
-        false,
-        async move {
-            downloader_task(
-                mgr_clone,
-                storage_clone,
-                download_req_rx,
-                bg_jobs_clone,
-                cancel_clone,
-            )
-            .await;
-
-            Ok(())
-        },
-    );
-
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
@@ -209,26 +89,16 @@ pub fn spawn_tasks(
                background_jobs_can_start,
                cancel,
            )
-            .await;
-
-            Ok(())
+            .await
        },
    );

-    SecondaryController {
-        download_req_tx,
-        upload_req_tx,
-    }
+    SecondaryController { upload_req_tx }
 }

 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
-    let (download_req_tx, _download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, _upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController {
-        upload_req_tx,
-        download_req_tx,
-    }
+    SecondaryController { upload_req_tx }
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1,800 +0,0 @@
-use std::{
-    collections::{HashMap, HashSet},
-    pin::Pin,
-    str::FromStr,
-    sync::Arc,
-    time::{Duration, Instant, SystemTime},
-};
-
-use crate::{
-    config::PageServerConf,
-    metrics::SECONDARY_MODE,
-    tenant::{
-        config::SecondaryLocationConfig,
-        debug_assert_current_span_has_tenant_and_timeline_id,
-        remote_timeline_client::{
-            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
-        },
-        span::debug_assert_current_span_has_tenant_id,
-        storage_layer::LayerFileName,
-        tasks::{warn_when_period_overrun, BackgroundLoopKind},
-    },
-    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
-    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
-};
-
-use super::{
-    heatmap::HeatMapLayer,
-    scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
-    SecondaryTenant,
-};
-
-use crate::tenant::{
-    mgr::TenantManager,
-    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
-};
-
-use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
-use pageserver_api::shard::TenantShardId;
-use rand::Rng;
-use remote_storage::{DownloadError, GenericRemoteStorage};
-
-use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
-use utils::{
-    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
-};
-
-use super::{
-    heatmap::{HeatMapTenant, HeatMapTimeline},
-    CommandRequest, DownloadCommand,
-};
-
-/// For each tenant, how long must have passed since the last download_tenant call before
-/// calling it again.  This is approximately the time by which local data is allowed
-/// to fall behind remote data.
-///
-/// TODO: this should just be a default, and the actual period should be controlled
-/// via the heatmap itself
-/// `<ttps://github.com/neondatabase/neon/issues/6200>`
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
-
-pub(super) async fn downloader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
-
-    let generator = SecondaryDownloader {
-        tenant_manager,
-        remote_storage,
-    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
-
-    scheduler
-        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_downloads"))
-        .await
-}
-
-struct SecondaryDownloader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-}
-
-#[derive(Debug, Clone)]
-pub(super) struct OnDiskState {
-    metadata: LayerFileMetadata,
-    access_time: SystemTime,
-}
-
-impl OnDiskState {
-    fn new(
-        _conf: &'static PageServerConf,
-        _tenant_shard_id: &TenantShardId,
-        _imeline_id: &TimelineId,
-        _ame: LayerFileName,
-        metadata: LayerFileMetadata,
-        access_time: SystemTime,
-    ) -> Self {
-        Self {
-            metadata,
-            access_time,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Default)]
-pub(super) struct SecondaryDetailTimeline {
-    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
-
-    /// We remember when layers were evicted, to prevent re-downloading them.
-    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
-}
-
-/// This state is written by the secondary downloader, it is opaque
-/// to TenantManager
-#[derive(Debug)]
-pub(super) struct SecondaryDetail {
-    pub(super) config: SecondaryLocationConfig,
-
-    last_download: Option<Instant>,
-    next_download: Option<Instant>,
-    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
-}
-
-/// Helper for logging SystemTime
-fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
-    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
-    datetime.format("%d/%m/%Y %T")
-}
-
-impl SecondaryDetail {
-    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
-        Self {
-            config,
-            last_download: None,
-            next_download: None,
-            timelines: HashMap::new(),
-        }
-    }
-}
-
-struct PendingDownload {
-    secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<Instant>,
-    target_time: Option<Instant>,
-    period: Option<Duration>,
-}
-
-impl scheduler::PendingJob for PendingDownload {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.secondary_state.get_tenant_shard_id()
-    }
-}
-
-struct RunningDownload {
-    barrier: Barrier,
-}
-
-impl scheduler::RunningJob for RunningDownload {
-    fn get_barrier(&self) -> Barrier {
-        self.barrier.clone()
-    }
-}
-
-struct CompleteDownload {
-    secondary_state: Arc<SecondaryTenant>,
-    completed_at: Instant,
-}
-
-impl scheduler::Completion for CompleteDownload {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.secondary_state.get_tenant_shard_id()
-    }
-}
-
-type Scheduler = TenantBackgroundJobs<
-    SecondaryDownloader,
-    PendingDownload,
-    RunningDownload,
-    CompleteDownload,
-    DownloadCommand,
->;
-
-impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
-    for SecondaryDownloader
-{
-    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
-    fn on_completion(&mut self, completion: CompleteDownload) {
-        let CompleteDownload {
-            secondary_state,
-            completed_at: _completed_at,
-        } = completion;
-
-        tracing::debug!("Secondary tenant download completed");
-
-        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
-        // take priority to run again.
-        let mut detail = secondary_state.detail.lock().unwrap();
-        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
-    }
-
-    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
-        let mut result = SchedulingResult {
-            jobs: Vec::new(),
-            want_interval: None,
-        };
-
-        // Step 1: identify some tenants that we may work on
-        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
-        self.tenant_manager
-            .foreach_secondary_tenants(|_id, secondary_state| {
-                tenants.push(secondary_state.clone());
-            });
-
-        // Step 2: filter out tenants which are not yet elegible to run
-        let now = Instant::now();
-        result.jobs = tenants
-            .into_iter()
-            .filter_map(|secondary_tenant| {
-                let (last_download, next_download) = {
-                    let mut detail = secondary_tenant.detail.lock().unwrap();
-
-                    if !detail.config.warm {
-                        // Downloads are disabled for this tenant
-                        detail.next_download = None;
-                        return None;
-                    }
-
-                    if detail.next_download.is_none() {
-                        // Initialize with a jitter: this spreads initial downloads on startup
-                        // or mass-attach across our freshen interval.
-                        let jittered_period =
-                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
-                        detail.next_download = Some(now.checked_add(jittered_period).expect(
-                        "Using our constant, which is known to be small compared with clock range",
-                    ));
-                    }
-                    (detail.last_download, detail.next_download.unwrap())
-                };
-
-                if now < next_download {
-                    Some(PendingDownload {
-                        secondary_state: secondary_tenant,
-                        last_download,
-                        target_time: Some(next_download),
-                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
-                    })
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        // Step 3: sort by target execution time to run most urgent first.
-        result.jobs.sort_by_key(|j| j.target_time);
-
-        result
-    }
-
-    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
-        let tenant_shard_id = command.get_tenant_shard_id();
-
-        let tenant = self
-            .tenant_manager
-            .get_secondary_tenant_shard(*tenant_shard_id);
-        let Some(tenant) = tenant else {
-            {
-                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-            }
-        };
-
-        Ok(PendingDownload {
-            target_time: None,
-            period: None,
-            last_download: None,
-            secondary_state: tenant,
-        })
-    }
-
-    fn spawn(
-        &mut self,
-        job: PendingDownload,
-    ) -> (
-        RunningDownload,
-        Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
-    ) {
-        let PendingDownload {
-            secondary_state,
-            last_download,
-            target_time,
-            period,
-        } = job;
-
-        let (completion, barrier) = utils::completion::channel();
-        let remote_storage = self.remote_storage.clone();
-        let conf = self.tenant_manager.get_conf();
-        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
-        (RunningDownload { barrier }, Box::pin(async move {
-            let _completion = completion;
-
-            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
-                .download()
-                .await
-            {
-                Err(UpdateError::NoData) => {
-                    tracing::info!("No heatmap found for tenant.  This is fine if it is new.");
-                },
-                Err(UpdateError::NoSpace) => {
-                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
-                }
-                Err(UpdateError::Cancelled) => {
-                    tracing::debug!("Shut down while downloading");
-                },
-                Err(UpdateError::Deserialize(e)) => {
-                    tracing::error!("Corrupt content while downloading tenant: {e}");
-                },
-                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
-                    tracing::error!("Error while downloading tenant: {e}");
-                },
-                Ok(()) => {}
-            };
-
-            // Irrespective of the result, we will reschedule ourselves to run after our usual period.
-
-            // If the job had a target execution time, we may check our final execution
-            // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Only track execution lag if this isn't our first download: otherwise, it is expected
-                // that execution will have taken longer than our configured interval, for example
-                // when starting up a pageserver and
-                if last_download.is_some() {
-                    // Elapsed time includes any scheduling lag as well as the execution of the job
-                    let elapsed = Instant::now().duration_since(target_time);
-
-                    warn_when_period_overrun(
-                        elapsed,
-                        period,
-                        BackgroundLoopKind::SecondaryDownload,
-                    );
-                }
-            }
-
-            CompleteDownload {
-                    secondary_state,
-                    completed_at: Instant::now(),
-                }
-        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
-    }
-}
-
-/// This type is a convenience to group together the various functions involved in
-/// freshening a secondary tenant.
-struct TenantDownloader<'a> {
-    conf: &'static PageServerConf,
-    remote_storage: &'a GenericRemoteStorage,
-    secondary_state: &'a SecondaryTenant,
-}
-
-/// Errors that may be encountered while updating a tenant
-#[derive(thiserror::Error, Debug)]
-enum UpdateError {
-    #[error("No remote data found")]
-    NoData,
-    #[error("Insufficient local storage space")]
-    NoSpace,
-    #[error("Failed to download")]
-    DownloadError(DownloadError),
-    #[error(transparent)]
-    Deserialize(#[from] serde_json::Error),
-    #[error("Cancelled")]
-    Cancelled,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl From<DownloadError> for UpdateError {
-    fn from(value: DownloadError) -> Self {
-        match &value {
-            DownloadError::Cancelled => Self::Cancelled,
-            DownloadError::NotFound => Self::NoData,
-            _ => Self::DownloadError(value),
-        }
-    }
-}
-
-impl From<std::io::Error> for UpdateError {
-    fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
-            UpdateError::NoSpace
-        } else {
-            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
-            UpdateError::Other(anyhow::anyhow!(value))
-        }
-    }
-}
-
-impl<'a> TenantDownloader<'a> {
-    fn new(
-        conf: &'static PageServerConf,
-        remote_storage: &'a GenericRemoteStorage,
-        secondary_state: &'a SecondaryTenant,
-    ) -> Self {
-        Self {
-            conf,
-            remote_storage,
-            secondary_state,
-        }
-    }
-
-    async fn download(&self) -> Result<(), UpdateError> {
-        debug_assert_current_span_has_tenant_id();
-
-        // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
-        // cover our access to local storage.
-        let Ok(_guard) = self.secondary_state.gate.enter() else {
-            // Shutting down
-            return Ok(());
-        };
-
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // Download the tenant's heatmap
-        let heatmap_bytes = tokio::select!(
-            bytes = self.download_heatmap() => {bytes?},
-            _ = self.secondary_state.cancel.cancelled() => return Ok(())
-        );
-
-        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
-
-        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
-        // layer metadata without having to re-download it.
-        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
-
-        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
-        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
-        let heatmap_path_bg = heatmap_path.clone();
-        tokio::task::spawn_blocking(move || {
-            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
-            })
-        })
-        .await
-        .expect("Blocking task is never aborted")
-        .maybe_fatal_err(&context_msg)?;
-
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
-
-        // Download the layers in the heatmap
-        for timeline in heatmap.timelines {
-            if self.secondary_state.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline)
-                .instrument(tracing::info_span!(
-                    "secondary_download_timeline",
-                    tenant_id=%tenant_shard_id.tenant_id,
-                    shard_id=%tenant_shard_id.shard_slug(),
-                    %timeline_id
-                ))
-                .await?;
-        }
-
-        Ok(())
-    }
-
-    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
-        debug_assert_current_span_has_tenant_id();
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // TODO: make download conditional on ETag having changed since last download
-        // (https://github.com/neondatabase/neon/issues/6199)
-        tracing::debug!("Downloading heatmap for secondary tenant",);
-
-        let heatmap_path = remote_heatmap_path(tenant_shard_id);
-
-        let heatmap_bytes = backoff::retry(
-            || async {
-                let download = self
-                    .remote_storage
-                    .download(&heatmap_path)
-                    .await
-                    .map_err(UpdateError::from)?;
-                let mut heatmap_bytes = Vec::new();
-                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
-                Ok(heatmap_bytes)
-            },
-            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "download heatmap",
-            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
-                UpdateError::Cancelled
-            }),
-        )
-        .await?;
-
-        SECONDARY_MODE.download_heatmap.inc();
-
-        Ok(heatmap_bytes)
-    }
-
-    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        let timeline_path = self
-            .conf
-            .timeline_path(tenant_shard_id, &timeline.timeline_id);
-
-        // Accumulate updates to the state
-        let mut touched = Vec::new();
-
-        // Clone a view of what layers already exist on disk
-        let timeline_state = self
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                let timeline_state =
-                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
-
-                // Re-acquire detail lock now that we're done with async load from local FS
-                self.secondary_state
-                    .detail
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(timeline.timeline_id, timeline_state.clone());
-                timeline_state
-            }
-        };
-
-        let layers_in_heatmap = timeline
-            .layers
-            .iter()
-            .map(|l| &l.name)
-            .collect::<HashSet<_>>();
-        let layers_on_disk = timeline_state
-            .on_disk_layers
-            .iter()
-            .map(|l| l.0)
-            .collect::<HashSet<_>>();
-
-        // Remove on-disk layers that are no longer present in heatmap
-        for layer in layers_on_disk.difference(&layers_in_heatmap) {
-            let local_path = timeline_path.join(layer.to_string());
-            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
-            tokio::fs::remove_file(&local_path)
-                .await
-                .or_else(fs_ext::ignore_not_found)
-                .maybe_fatal_err("Removing secondary layer")?;
-        }
-
-        // Download heatmap layers that are not present on local disk, or update their
-        // access time if they are already present.
-        for layer in timeline.layers {
-            if self.secondary_state.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            // Existing on-disk layers: just update their access time.
-            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
-                tracing::debug!("Layer {} is already on disk", layer.name);
-                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
-                    // We already have this layer on disk.  Update its access time.
-                    tracing::debug!(
-                        "Access time updated for layer {}: {} -> {}",
-                        layer.name,
-                        strftime(&on_disk.access_time),
-                        strftime(&layer.access_time)
-                    );
-                    touched.push(layer);
-                }
-                continue;
-            } else {
-                tracing::debug!("Layer {} not present on disk yet", layer.name);
-            }
-
-            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
-            // recently than it was evicted.
-            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
-                if &layer.access_time > evicted_at {
-                    tracing::info!(
-                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                } else {
-                    tracing::trace!(
-                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                    continue;
-                }
-            }
-
-            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-            let downloaded_bytes = match download_layer_file(
-                self.conf,
-                self.remote_storage,
-                *tenant_shard_id,
-                timeline.timeline_id,
-                &layer.name,
-                &LayerFileMetadata::from(&layer.metadata),
-                &self.secondary_state.cancel,
-            )
-            .await
-            {
-                Ok(bytes) => bytes,
-                Err(e) => {
-                    if let DownloadError::NotFound = e {
-                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                        // This is harmless: continue to download the next layer. It is expected during compaction
-                        // GC.
-                        tracing::debug!(
-                            "Skipped downloading missing layer {}, raced with compaction/gc?",
-                            layer.name
-                        );
-                        continue;
-                    } else {
-                        return Err(e.into());
-                    }
-                }
-            };
-
-            if downloaded_bytes != layer.metadata.file_size {
-                let local_path = timeline_path.join(layer.name.to_string());
-
-                tracing::warn!(
-                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
-                    layer.name,
-                    downloaded_bytes,
-                    layer.metadata.file_size
-                );
-
-                tokio::fs::remove_file(&local_path)
-                    .await
-                    .or_else(fs_ext::ignore_not_found)?;
-            }
-
-            SECONDARY_MODE.download_layer.inc();
-            touched.push(layer)
-        }
-
-        // Write updates to state to record layers we just downloaded or touched.
-        {
-            let mut detail = self.secondary_state.detail.lock().unwrap();
-            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
-
-            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
-
-            for t in touched {
-                use std::collections::hash_map::Entry;
-                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
-                    Entry::Occupied(mut v) => {
-                        v.get_mut().access_time = t.access_time;
-                    }
-                    Entry::Vacant(e) => {
-                        e.insert(OnDiskState::new(
-                            self.conf,
-                            tenant_shard_id,
-                            &timeline.timeline_id,
-                            t.name,
-                            LayerFileMetadata::from(&t.metadata),
-                            t.access_time,
-                        ));
-                    }
-                }
-            }
-        }
-
-        Ok(())
-    }
-}
-
-/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
-async fn init_timeline_state(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    heatmap: &HeatMapTimeline,
-) -> SecondaryDetailTimeline {
-    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
-    let mut detail = SecondaryDetailTimeline::default();
-
-    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
-        Ok(d) => d,
-        Err(e) => {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                let context = format!("Creating timeline directory {timeline_path}");
-                tracing::info!("{}", context);
-                tokio::fs::create_dir_all(&timeline_path)
-                    .await
-                    .fatal_err(&context);
-
-                // No entries to report: drop out.
-                return detail;
-            } else {
-                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
-            }
-        }
-    };
-
-    // As we iterate through layers found on disk, we will look up their metadata from this map.
-    // Layers not present in metadata will be discarded.
-    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
-        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
-
-    while let Some(dentry) = dir
-        .next_entry()
-        .await
-        .fatal_err(&format!("Listing {timeline_path}"))
-    {
-        let dentry_file_name = dentry.file_name();
-        let file_name = dentry_file_name.to_string_lossy();
-        let local_meta = dentry.metadata().await.fatal_err(&format!(
-            "Read metadata on {}",
-            dentry.path().to_string_lossy()
-        ));
-
-        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
-        if file_name == METADATA_FILE_NAME {
-            continue;
-        }
-
-        match LayerFileName::from_str(&file_name) {
-            Ok(name) => {
-                let remote_meta = heatmap_metadata.get(&name);
-                match remote_meta {
-                    Some(remote_meta) => {
-                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                        if local_meta.len() != remote_meta.metadata.file_size {
-                            // This should not happen, because we do crashsafe write-then-rename when downloading
-                            // layers, and layers in remote storage are immutable.  Remove the local file because
-                            // we cannot trust it.
-                            tracing::warn!(
-                                "Removing local layer {name} with unexpected local size {} != {}",
-                                local_meta.len(),
-                                remote_meta.metadata.file_size
-                            );
-                        } else {
-                            // We expect the access time to be initialized immediately afterwards, when
-                            // the latest heatmap is applied to the state.
-                            detail.on_disk_layers.insert(
-                                name.clone(),
-                                OnDiskState::new(
-                                    conf,
-                                    tenant_shard_id,
-                                    &heatmap.timeline_id,
-                                    name,
-                                    LayerFileMetadata::from(&remote_meta.metadata),
-                                    remote_meta.access_time,
-                                ),
-                            );
-                        }
-                    }
-                    None => {
-                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
-                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
-                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
-                        tracing::info!(
-                            "Removing secondary local layer {} because it's absent in heatmap",
-                            name
-                        );
-                        tokio::fs::remove_file(&dentry.path())
-                            .await
-                            .or_else(fs_ext::ignore_not_found)
-                            .fatal_err(&format!(
-                                "Removing layer {}",
-                                dentry.path().to_string_lossy()
-                            ));
-                    }
-                }
-            }
-            Err(_) => {
-                // Ignore it.
-                tracing::warn!("Unexpected file in timeline directory: {file_name}");
-            }
-        }
-    }
-
-    detail
-}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,6 +1,5 @@
 use std::{
    collections::HashMap,
-    pin::Pin,
    sync::{Arc, Weak},
    time::{Duration, Instant},
 };
@@ -8,86 +7,35 @@ use std::{
 use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
-        config::AttachmentMode,
-        mgr::TenantManager,
-        remote_timeline_client::remote_heatmap_path,
-        span::debug_assert_current_span_has_tenant_id,
-        tasks::{warn_when_period_overrun, BackgroundLoopKind},
-        Tenant,
+        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
+        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
    },
 };

-use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
-use rand::Rng;
 use remote_storage::GenericRemoteStorage;

-use super::{
-    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
-    CommandRequest,
-};
+use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
-use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
+use tracing::instrument;
+use utils::{backoff, completion::Barrier};

-use super::{heatmap::HeatMapTenant, UploadCommand};
+use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};

-pub(super) async fn heatmap_uploader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) {
-    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
-
-    let generator = HeatmapUploader {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-        tenants: HashMap::new(),
-    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
-
-    scheduler
-        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_uploader"))
-        .await
-}
-
-/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
-/// handling loop and mutates it as needed: there are no locks here, because that event loop
-/// can hold &mut references to this type throughout.
-struct HeatmapUploader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    cancel: CancellationToken,
-
-    tenants: HashMap<TenantShardId, UploaderTenantState>,
-}
+/// Period between heatmap uploader walking Tenants to look for work to do.
+/// If any tenant has a heatmap upload period lower than this, the scheduling interval
+/// is adjusted downward to match, but never below `MIN_SCHEDULING_INTERVAL`.
+const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);

 struct WriteInProgress {
    barrier: Barrier,
 }

-impl RunningJob for WriteInProgress {
-    fn get_barrier(&self) -> Barrier {
-        self.barrier.clone()
-    }
-}
-
 struct UploadPending {
    tenant: Arc<Tenant>,
    last_digest: Option<md5::Digest>,
-    target_time: Option<Instant>,
-    period: Option<Duration>,
-}
-
-impl scheduler::PendingJob for UploadPending {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.tenant.get_tenant_shard_id()
-    }
 }

 struct WriteComplete {
@@ -97,12 +45,6 @@ struct WriteComplete {
    next_upload: Option<Instant>,
 }

-impl scheduler::Completion for WriteComplete {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        &self.tenant_shard_id
-    }
-}
-
 /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
@@ -126,110 +68,267 @@ struct UploaderTenantState {
    next_upload: Option<Instant>,
 }

-type Scheduler = TenantBackgroundJobs<
-    HeatmapUploader,
-    UploadPending,
-    WriteInProgress,
-    WriteComplete,
-    UploadCommand,
->;
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,

-impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
-    for HeatmapUploader
-{
-    async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+
+    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
+    /// limits permit it.
+    tenants_pending: std::collections::VecDeque<UploadPending>,
+
+    /// Tenants for which a task in `tasks` has been spawned.
+    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
+
+    tasks: JoinSet<()>,
+
+    /// Channel for our child tasks to send results to: we use a channel for results rather than
+    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
+    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
+    /// behavior.
+    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
+    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
+
+    concurrent_uploads: usize,
+
+    scheduling_interval: Duration,
+}
+
+/// The uploader task runs a loop that periodically wakes up and schedules tasks for
+/// tenants that require an upload, or handles any commands that have been sent into
+/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
+/// spawn.
+///
+/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
+/// all tenants that require an upload, and in between scheduling iterations we will
+/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
+///
+/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
+/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
+/// we might block waiting on a Tenant.
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+    let mut uploader = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tasks: JoinSet::new(),
+        tenants: HashMap::new(),
+        tenants_pending: std::collections::VecDeque::new(),
+        tenants_uploading: HashMap::new(),
+        task_result_tx: result_tx,
+        task_result_rx: result_rx,
+        concurrent_uploads,
+        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        // Look for new work: this is relatively expensive because we have to go acquire the lock on
+        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+        // require an upload.
+        uploader.schedule_iteration().await?;
+
+        // Between scheduling iterations, we will:
+        //  - Drain any complete tasks and spawn pending tasks
+        //  - Handle incoming administrative commands
+        //  - Check our cancellation token
+        let next_scheduling_iteration = Instant::now()
+            .checked_add(uploader.scheduling_interval)
+            .unwrap_or_else(|| {
+                tracing::warn!(
+                    "Scheduling interval invalid ({}s), running immediately!",
+                    uploader.scheduling_interval.as_secs_f64()
+                );
+                Instant::now()
+            });
+        loop {
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                    tracing::info!("Heatmap uploader joining tasks");
+                    while let Some(_r) = uploader.tasks.join_next().await {};
+                    tracing::info!("Heatmap uploader terminating");
+
+                    break;
+                },
+                _ = tokio::time::sleep(next_scheduling_iteration.saturating_duration_since(Instant::now())) => {
+                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
+                    break;},
+                cmd = command_queue.recv() => {
+                    tracing::debug!("heatmap_uploader_task: woke for command queue");
+                    let cmd = match cmd {
+                        Some(c) =>c,
+                        None => {
+                            // SecondaryController was destroyed, and this has raced with
+                            // our CancellationToken
+                            tracing::info!("Heatmap uploader terminating");
+                            cancel.cancel();
+                            break;
+                        }
+                    };
+
+                    let CommandRequest{
+                        response_tx,
+                        payload
+                    } = cmd;
+                    uploader.handle_command(payload, response_tx);
+                },
+                _ = uploader.process_next_completion() => {
+                    if !cancel.is_cancelled() {
+                        uploader.spawn_pending();
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+impl HeatmapUploader {
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
        // Cull any entries in self.tenants whose Arc<Tenant> is gone
        self.tenants
            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.tenants_pending.clear();
+
+        // Use a fixed 'now' throughout the following loop, for efficiency and fairness.
        let now = Instant::now();

-        let mut result = SchedulingResult {
-            jobs: Vec::new(),
-            want_interval: None,
-        };
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        const YIELD_ITERATIONS: usize = 1000;

+        // Iterate over tenants looking for work to do.
        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-
-        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
-            let period = match tenant.get_heatmap_period() {
-                None => {
-                    // Heatmaps are disabled for this tenant
-                    return;
-                }
-                Some(period) => {
-                    // If any tenant has asked for uploads more frequent than our scheduling interval,
-                    // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                    // we may set rather short intervals.
-                    result.want_interval = match result.want_interval {
-                        None => Some(period),
-                        Some(existing) => Some(std::cmp::min(period, existing)),
-                    };
-
-                    period
-                }
-            };
-
-            // Stale attachments do not upload anything: if we are in this state, there is probably some
-            // other attachment in mode Single or Multi running on another pageserver, and we don't
-            // want to thrash and overwrite their heatmap uploads.
-            if tenant.get_attach_mode() == AttachmentMode::Stale {
-                return;
+        for (i, tenant) in tenants.into_iter().enumerate() {
+            // Process is shutting down, drop out
+            if self.cancel.is_cancelled() {
+                return Ok(());
            }

-            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-            // with the completion time in on_completion.
-            let state = self
-                .tenants
-                .entry(*tenant.get_tenant_shard_id())
-                .or_insert_with(|| {
-                    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
-
-                    UploaderTenantState {
-                        tenant: Arc::downgrade(&tenant),
-                        last_upload: None,
-                        next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
-                        last_digest: None,
-                    }
-                });
-
-            // Decline to do the upload if insufficient time has passed
-            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
-                return;
+            // Skip tenants that already have a write in flight
+            if self
+                .tenants_uploading
+                .contains_key(tenant.get_tenant_shard_id())
+            {
+                continue;
            }

-            let last_digest = state.last_digest;
-            result.jobs.push(UploadPending {
-                tenant,
-                last_digest,
-                target_time: state.next_upload,
-                period: Some(period),
-            });
-        })
-        .await
-        .ok();
+            self.maybe_schedule_upload(&now, tenant);

-        result
+            if (i + 1) % YIELD_ITERATIONS == 0 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Spawn tasks for as many of our pending tenants as we can.
+        self.spawn_pending();
+
+        Ok(())
    }

-    fn spawn(
-        &mut self,
-        job: UploadPending,
-    ) -> (
-        WriteInProgress,
-        Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
-    ) {
-        let UploadPending {
+    /// Wait for the next result sent by an upload task and pass it to `on_completion`.
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) {
+        match self.task_result_rx.recv().await {
+            Some(r) => {
+                self.on_completion(r);
+            }
+            None => {
+                unreachable!("Result sender is stored on Self");
+            }
+        }
+    }
+
+    /// The 'maybe' refers to the tenant's state: whether it is configured
+    /// for heatmap uploads at all, and whether sufficient time has passed
+    /// since the last upload.
+    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
+        match tenant.get_heatmap_period() {
+            None => {
+                // Heatmaps are disabled for this tenant
+                return;
+            }
+            Some(period) => {
+                // If any tenant has asked for uploads more frequent than our scheduling interval,
+                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                // we may set rather short intervals.
+                if period < self.scheduling_interval {
+                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
+                }
+            }
+        }
+
+        // Stale attachments do not upload anything: if we are in this state, there is probably some
+        // other attachment in mode Single or Multi running on another pageserver, and we don't
+        // want to thrash and overwrite their heatmap uploads.
+        if tenant.get_attach_mode() == AttachmentMode::Stale {
+            return;
+        }
+
+        // Create an entry in self.tenants if one doesn't already exist. It is due at the pass's
+        // fixed `now` (so it is scheduled right away) and is later updated in on_completion.
+        let state = self
+            .tenants
+            .entry(*tenant.get_tenant_shard_id())
+            .or_insert_with(|| UploaderTenantState {
+                tenant: Arc::downgrade(&tenant),
+                last_upload: None,
+                next_upload: Some(*now),
+                last_digest: None,
+            });
+
+        // Decline to do the upload if insufficient time has passed
+        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
+            return;
+        }
+
+        let last_digest = state.last_digest;
+        self.tenants_pending.push_back(UploadPending {
            tenant,
            last_digest,
-            target_time,
-            period,
-        } = job;
+        })
+    }

+    /// Spawn uploads for queued tenants, oldest first, while concurrency allows.
+    fn spawn_pending(&mut self) {
+        while self.tenants_uploading.len() < self.concurrent_uploads {
+            match self.tenants_pending.pop_front() {
+                Some(job) => self.spawn_upload(job.tenant, job.last_digest),
+                None => break,
+            }
+        }
+    }
+
+    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
        let remote_storage = self.remote_storage.clone();
-        let (completion, barrier) = utils::completion::channel();
        let tenant_shard_id = *tenant.get_tenant_shard_id();
-        (WriteInProgress { barrier }, Box::pin(async move {
+        let (completion, barrier) = utils::completion::channel();
+        let result_tx = self.task_result_tx.clone();
+        self.tasks.spawn(async move {
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

@@ -263,47 +362,22 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            };

            let now = Instant::now();
-
-            // If the job had a target execution time, we may check our final execution
-            // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Elapsed time includes any scheduling lag as well as the execution of the job
-                let elapsed = now.duration_since(target_time);
-
-                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
-            }
-
            let next_upload = tenant
                .get_heatmap_period()
                .and_then(|period| now.checked_add(period));

-            WriteComplete {
+            result_tx
+                .send(WriteComplete {
                    tenant_shard_id: *tenant.get_tenant_shard_id(),
                    completed_at: now,
                    digest,
                    next_upload,
-                }
-        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
-    }
+                })
+                .ok();
+        });

-    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
-        let tenant_shard_id = command.get_tenant_shard_id();
-
-        tracing::info!(
-            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-            "Starting heatmap write on command");
-        let tenant = self
-            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        Ok(UploadPending {
-            // Ignore our state for last digest: this forces an upload even if nothing has changed
-            last_digest: None,
-            tenant,
-            target_time: None,
-            period: None,
-        })
+        self.tenants_uploading
+            .insert(tenant_shard_id, WriteInProgress { barrier });
    }

    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -315,6 +389,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            digest,
            next_upload,
        } = completion;
+        self.tenants_uploading.remove(&tenant_shard_id);
        use std::collections::hash_map::Entry;
        match self.tenants.entry(tenant_shard_id) {
            Entry::Vacant(_) => {
@@ -327,6 +402,69 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            }
        }
    }
+
+    fn handle_command(
+        &mut self,
+        command: UploadCommand,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        match command {
+            UploadCommand::Upload(tenant_shard_id) => {
+                // If an upload was ongoing for this tenant, let it finish first.
+                let barrier = if let Some(writing_state) =
+                    self.tenants_uploading.get(&tenant_shard_id)
+                {
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap write to complete");
+                    writing_state.barrier.clone()
+                } else {
+                    // Spawn the upload right away, bypassing the pending queue.  The wait for it
+                    // happens in the task spawned below: this function does not block on the upload.
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Starting heatmap write on command");
+                    let tenant = match self
+                        .tenant_manager
+                        .get_attached_tenant_shard(tenant_shard_id, true)
+                    {
+                        Ok(t) => t,
+                        Err(e) => {
+                            // Drop result of send: we don't care if caller dropped their receiver
+                            drop(response_tx.send(CommandResponse {
+                                result: Err(e.into()),
+                            }));
+                            return;
+                        }
+                    };
+                    self.spawn_upload(tenant, None);
+                    let writing_state = self
+                        .tenants_uploading
+                        .get(&tenant_shard_id)
+                        .expect("We just inserted this");
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap upload to complete");
+
+                    writing_state.barrier.clone()
+                };
+
+                // This task does no I/O: it only listens for a barrier's completion and then
+                // sends to the command response channel.  It is therefore safe to spawn this without
+                // any gates/task_mgr hooks.
+                tokio::task::spawn(async move {
+                    barrier.wait().await;
+
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Heatmap upload complete");
+
+                    // Drop result of send: we don't care if caller dropped their receiver
+                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
+                });
+            }
+        }
+    }
 }

 enum UploadHeatmapOutcome {
@@ -349,6 +487,7 @@ enum UploadHeatmapError {

 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
+#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -1,359 +0,0 @@
-use futures::Future;
-use std::{
-    collections::HashMap,
-    marker::PhantomData,
-    pin::Pin,
-    time::{Duration, Instant},
-};
-
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
-use utils::{completion::Barrier, yielding_loop::yielding_loop};
-
-use super::{CommandRequest, CommandResponse};
-
-/// Scheduling interval is the time between calls to JobGenerator::schedule.
-/// When we schedule jobs, the job generator may provide a hint of its preferred
-/// interval, which we will respect within these intervals.
-const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
-
-/// Scheduling helper for background work across many tenants.
-///
-/// Systems that need to run background work across many tenants may use this type
-/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
-/// implementation to provide the work to execute.  This is a simple scheduler that just
-/// polls the generator for outstanding work, replacing its queue of pending work with
-/// what the generator yields on each call: the job generator can change its mind about
-/// the order of jobs between calls.  The job generator is notified when jobs complete,
-/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
-/// admin APIs).
-///
-/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
-///
-/// G: A JobGenerator that this scheduler will poll to find pending jobs
-/// PJ: 'Pending Job': type for job descriptors that are ready to run
-/// RJ: 'Running Job' type' for jobs that have been spawned
-/// C : 'Completion' type that spawned jobs will send when they finish
-/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
-pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
-where
-    G: JobGenerator<PJ, RJ, C, CMD>,
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-{
-    generator: G,
-
-    /// Ready to run.  Will progress to `running` once concurrent limit is satisfied, or
-    /// be removed on next scheduling pass.
-    pending: std::collections::VecDeque<PJ>,
-
-    /// Tasks currently running in Self::tasks for these tenants.  Check this map
-    /// before pushing more work into pending for the same tenant.
-    running: HashMap<TenantShardId, RJ>,
-
-    tasks: JoinSet<C>,
-
-    concurrency: usize,
-
-    /// How often we would like schedule_interval to be called.
-    pub(super) scheduling_interval: Duration,
-
-    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
-}
-
-pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
-where
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-{
-    /// Called at each scheduling interval.  Return a list of jobs to run, most urgent first.
-    ///
-    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
-    /// Implementations should take care to yield the executor periodically if running
-    /// very long loops.
-    ///
-    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
-    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
-    /// and re-generated.
-    async fn schedule(&mut self) -> SchedulingResult<PJ>;
-
-    /// Called when a pending job is ready to be run.
-    ///
-    /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
-    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
-
-    /// Called when a job previously spawned with spawn() transmits its completion
-    fn on_completion(&mut self, completion: C);
-
-    /// Called when a command is received.  A job will be spawned immediately if the return
-    /// value is Some, ignoring concurrency limits and the pending queue.
-    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
-}
-
-/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
-pub(super) struct SchedulingResult<PJ> {
-    pub(super) jobs: Vec<PJ>,
-    /// The job generator would like to be called again this soon
-    pub(super) want_interval: Option<Duration>,
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait PendingJob {
-    fn get_tenant_shard_id(&self) -> &TenantShardId;
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait Completion: Send + 'static {
-    fn get_tenant_shard_id(&self) -> &TenantShardId;
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait RunningJob {
-    fn get_barrier(&self) -> Barrier;
-}
-
-impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
-where
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-    G: JobGenerator<PJ, RJ, C, CMD>,
-{
-    pub(super) fn new(generator: G, concurrency: usize) -> Self {
-        Self {
-            generator,
-            pending: std::collections::VecDeque::new(),
-            running: HashMap::new(),
-            tasks: JoinSet::new(),
-            concurrency,
-            scheduling_interval: MAX_SCHEDULING_INTERVAL,
-            _phantom: PhantomData,
-        }
-    }
-
-    pub(super) async fn run(
-        &mut self,
-        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
-        background_jobs_can_start: Barrier,
-        cancel: CancellationToken,
-    ) {
-        tracing::info!("Waiting for background_jobs_can start...");
-        background_jobs_can_start.wait().await;
-        tracing::info!("background_jobs_can is ready, proceeding.");
-
-        while !cancel.is_cancelled() {
-            // Look for new work: this is relatively expensive because we have to go acquire the lock on
-            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-            // require an upload.
-            self.schedule_iteration(&cancel).await;
-
-            if cancel.is_cancelled() {
-                return;
-            }
-
-            // Schedule some work, if concurrency limit permits it
-            self.spawn_pending();
-
-            // Between scheduling iterations, we will:
-            //  - Drain any complete tasks and spawn pending tasks
-            //  - Handle incoming administrative commands
-            //  - Check our cancellation token
-            let next_scheduling_iteration = Instant::now()
-                .checked_add(self.scheduling_interval)
-                .unwrap_or_else(|| {
-                    tracing::warn!(
-                        "Scheduling interval invalid ({}s)",
-                        self.scheduling_interval.as_secs_f64()
-                    );
-                    // unwrap(): this constant is small, cannot fail to add to time unless
-                    // we are close to the end of the universe.
-                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
-                });
-            loop {
-                tokio::select! {
-                    _ = cancel.cancelled() => {
-                        tracing::info!("joining tasks");
-                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                        // It is the callers responsibility to make sure that the tasks they scheduled
-                        // respect an appropriate cancellation token, to shut down promptly.  It is only
-                        // safe to wait on joining these tasks because we can see the cancellation token
-                        // has been set.
-                        while let Some(_r) = self.tasks.join_next().await {}
-                        tracing::info!("terminating on cancellation token.");
-
-                        break;
-                    },
-                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                        tracing::debug!("woke for scheduling interval");
-                        break;},
-                    cmd = command_queue.recv() => {
-                        tracing::debug!("woke for command queue");
-                        let cmd = match cmd {
-                            Some(c) =>c,
-                            None => {
-                                // SecondaryController was destroyed, and this has raced with
-                                // our CancellationToken
-                                tracing::info!("terminating on command queue destruction");
-                                cancel.cancel();
-                                break;
-                            }
-                        };
-
-                        let CommandRequest{
-                            response_tx,
-                            payload
-                        } = cmd;
-                        self.handle_command(payload, response_tx);
-                    },
-                    _ = async {
-                        let completion = self.process_next_completion().await;
-                        match completion {
-                            Some(c) => {
-                                self.generator.on_completion(c);
-                                if !cancel.is_cancelled() {
-                                    self.spawn_pending();
-                                }
-                            },
-                            None => {
-                                // Nothing is running, so just wait: expect that this future
-                                // will be dropped when something in the outer select! fires.
-                                cancel.cancelled().await;
-                            }
-                        }
-
-                     } => {}
-                }
-            }
-        }
-    }
-
-    fn do_spawn(&mut self, job: PJ) {
-        let tenant_shard_id = *job.get_tenant_shard_id();
-        let (in_progress, fut) = self.generator.spawn(job);
-
-        self.tasks.spawn(fut);
-
-        self.running.insert(tenant_shard_id, in_progress);
-    }
-
-    /// For all pending tenants that are elegible for execution, spawn their task.
-    ///
-    /// Caller provides the spawn operation, we track the resulting execution.
-    fn spawn_pending(&mut self) {
-        while !self.pending.is_empty() && self.running.len() < self.concurrency {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.pending.pop_front().unwrap();
-            self.do_spawn(pending);
-        }
-    }
-
-    /// For administrative commands: skip the pending queue, ignore concurrency limits
-    fn spawn_now(&mut self, job: PJ) -> &RJ {
-        let tenant_shard_id = *job.get_tenant_shard_id();
-        self.do_spawn(job);
-        self.running
-            .get(&tenant_shard_id)
-            .expect("We just inserted this")
-    }
-
-    /// Wait until the next task completes, and handle its completion
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) -> Option<C> {
-        match self.tasks.join_next().await {
-            Some(r) => {
-                // We use a channel to drive completions, but also
-                // need to drain the JoinSet to avoid completed tasks
-                // accumulating.  These calls are 1:1 because every task
-                // we spawn into this joinset submits is result to the channel.
-                let completion = r.expect("Panic in background task");
-
-                self.running.remove(completion.get_tenant_shard_id());
-                Some(completion)
-            }
-            None => {
-                // Nothing is running, so we have nothing to wait for.  We may drop out: the
-                // main even loop will call us again after the next time it has run something.
-                None
-            }
-        }
-    }
-
-    /// Convert the command into a pending job, spawn it, and when the spawned
-    /// job completes, send the result down `response_tx`.
-    fn handle_command(
-        &mut self,
-        cmd: CMD,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        let job = match self.generator.on_command(cmd) {
-            Ok(j) => j,
-            Err(e) => {
-                response_tx.send(CommandResponse { result: Err(e) }).ok();
-                return;
-            }
-        };
-
-        let tenant_shard_id = job.get_tenant_shard_id();
-        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            barrier
-        } else {
-            let running = self.spawn_now(job);
-            running.get_barrier().clone()
-        };
-
-        // This task does no I/O: it only listens for a barrier's completion and then
-        // sends to the command response channel.  It is therefore safe to spawn this without
-        // any gates/task_mgr hooks.
-        tokio::task::spawn(async move {
-            barrier.wait().await;
-
-            response_tx.send(CommandResponse { result: Ok(()) }).ok();
-        });
-    }
-
-    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
-        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
-    }
-
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    ///
-    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
-    ///
-    /// This function resets the pending list: it is assumed that the caller may change their mind about
-    /// which tenants need work between calls to schedule_iteration.
-    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
-        let SchedulingResult {
-            jobs,
-            want_interval,
-        } = self.generator.schedule().await;
-
-        // Adjust interval based on feedback from the job generator
-        if let Some(want_interval) = want_interval {
-            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
-            self.scheduling_interval = Duration::from_secs(std::cmp::min(
-                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
-                MAX_SCHEDULING_INTERVAL.as_secs(),
-            ));
-        }
-
-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.pending.clear();
-
-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
-            // Skip tenants that already have a write in flight
-            if !self.running.contains_key(job.get_tenant_shard_id()) {
-                self.pending.push_back(job);
-            }
-        })
-        .await
-        .ok();
-    }
-}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -320,8 +320,8 @@ impl DeltaLayer {
            .metadata()
            .context("get file metadata to determine size")?;

-        // This function is never used for constructing layers in a running pageserver,
-        // so it does not need an accurate TenantShardId.
+        // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
+        // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);

        Ok(DeltaLayer {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -278,8 +278,8 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;

-        // This function is never used for constructing layers in a running pageserver,
-        // so it does not need an accurate TenantShardId.
+        // TODO(sharding): we should get TenantShardId from path.
+        // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);

        Ok(ImageLayer {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;

 use super::{DeltaLayerWriter, ResidentLayer};

@@ -246,43 +246,16 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub(crate) async fn put_value(
+    pub async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
        val: &Value,
        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (key, vals) in values {
-            for (lsn, val) in vals {
-                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
-                    .await?;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+        let inner: &mut _ = &mut *self.inner.write().await;
+        self.assert_writable();

        let off = {
            // Avoid doing allocations for "small" values.
@@ -291,7 +264,7 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            locked_inner
+            inner
                .file
                .write_blob(
                    &buf,
@@ -302,7 +275,7 @@ impl InMemoryLayer {
                .await?
        };

-        let vec_map = locked_inner.index.entry(key).or_default();
+        let vec_map = inner.index.entry(key).or_default();
        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -312,11 +285,13 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
+
        Ok(())
    }

+    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -945,18 +945,8 @@ impl LayerInner {
            Ok((Err(e), _permit)) => {
                // sleep already happened in the spawned task, if it was not cancelled
                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
-
-                match e.downcast_ref::<remote_storage::DownloadError>() {
-                    // If the download failed due to its cancellation token,
-                    // propagate the cancellation error upstream.
-                    Some(remote_storage::DownloadError::Cancelled) => {
-                        Err(DownloadError::DownloadCancelled)
-                    }
-                    _ => {
-                        tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
-                        Err(DownloadError::DownloadFailed)
-                    }
-                }
+                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+                Err(DownloadError::DownloadFailed)
            }
            Err(_gone) => Err(DownloadError::DownloadCancelled),
        }
@@ -1128,7 +1118,6 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
-                timeline.metrics.evictions.inc();
                timeline
                    .metrics
                    .resident_physical_size_sub(self.desc.file_size);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -45,8 +45,6 @@ pub(crate) enum BackgroundLoopKind {
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
    InitialLogicalSizeCalculation,
-    HeatmapUpload,
-    SecondaryDownload,
 }

 impl BackgroundLoopKind {
@@ -65,11 +63,6 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
        .with_label_values(&[loop_kind.as_static_str()])
        .guard();

-    pausable_failpoint!(
-        "initial-size-calculation-permit-pause",
-        loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
-    );
-
    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
        Ok(permit) => permit,
        Err(_closed) => unreachable!("we never close the semaphore"),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -373,20 +373,15 @@ pub struct GcInfo {
 }

 /// An error happened in a get() operation.
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum PageReconstructError {
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),

-    #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(#[from] WaitLsnError),
-
    /// The operation was cancelled
-    #[error("Cancelled")]
    Cancelled,

    /// The ancestor of this is being stopped
-    #[error("ancestor timeline {0} is being stopped")]
    AncestorStopping(TimelineId),

    /// An error happened replaying WAL records
@@ -407,6 +402,32 @@ enum FlushLayerError {
    Other(#[from] anyhow::Error),
 }

+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+            Self::Cancelled => f.write_str("cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+            Self::Cancelled => f.write_str("cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+        }
+    }
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
    Initial,
@@ -431,21 +452,6 @@ impl std::fmt::Debug for Timeline {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum WaitLsnError {
-    // Called on a timeline which is shutting down
-    #[error("Shutdown")]
-    Shutdown,
-
-    // Called on an timeline not in active state or shutting down
-    #[error("Bad state (not active)")]
-    BadState,
-
-    // Timeout expired while waiting for LSN to catch up with goal.
-    #[error("{0}")]
-    Timeout(String),
-}
-
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -480,7 +486,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn get(
+    pub async fn get(
        &self,
        key: Key,
        lsn: Lsn,
@@ -490,11 +496,6 @@ impl Timeline {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

-        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
-        // already checked the key against the shard_identity when looking up the Timeline from
-        // page_service.
-        debug_assert!(!self.shard_identity.is_key_disposable(&key));
-
        // XXX: structured stats collection for layer eviction here.
        trace!(
            "get page request for {}@{} from task kind {:?}",
@@ -628,28 +629,24 @@ impl Timeline {
    /// You should call this before any of the other get_* or list_* functions. Calling
    /// those functions with an LSN that has been processed yet is an error.
    ///
-    pub(crate) async fn wait_lsn(
+    pub async fn wait_lsn(
        &self,
        lsn: Lsn,
        _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> Result<(), WaitLsnError> {
-        if self.cancel.is_cancelled() {
-            return Err(WaitLsnError::Shutdown);
-        } else if !self.is_active() {
-            return Err(WaitLsnError::BadState);
-        }
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");

        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -663,22 +660,18 @@ impl Timeline {
        {
            Ok(()) => Ok(()),
            Err(e) => {
-                use utils::seqwait::SeqWaitError::*;
-                match e {
-                    Shutdown => Err(WaitLsnError::Shutdown),
-                    Timeout => {
-                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                        drop(_timer);
-                        let walreceiver_status = self.walreceiver_status();
-                        Err(WaitLsnError::Timeout(format!(
+                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                drop(_timer);
+                let walreceiver_status = self.walreceiver_status();
+                Err(anyhow::Error::new(e).context({
+                    format!(
                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                        lsn,
                        self.get_last_record_lsn(),
                        self.get_disk_consistent_lsn(),
                        walreceiver_status,
-                    )))
-                    }
-                }
+                    )
+                }))
            }
        }
    }
@@ -1466,7 +1459,6 @@ impl Timeline {
                max_lsn_wal_lag,
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
-                ingest_batch_size: self.conf.ingest_batch_size,
            },
            broker_client,
            ctx,
@@ -2231,13 +2223,13 @@ impl Timeline {
                    return Err(layer_traversal_error(
                        if cfg!(test) {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
+                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                            )
                        } else {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
+                                "could not find data for key {} at LSN {}, for request at LSN {}",
+                                key, cont_lsn, request_lsn
                            )
                        },
                        traversal_path,
@@ -2297,12 +2289,11 @@ impl Timeline {
                ancestor
                    .wait_lsn(timeline.ancestor_lsn, ctx)
                    .await
-                    .map_err(|e| match e {
-                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
-                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
-                        e @ WaitLsnError::BadState => {
-                            PageReconstructError::Other(anyhow::anyhow!(e))
-                        }
+                    .with_context(|| {
+                        format!(
+                            "wait for lsn {} on ancestor timeline_id={}",
+                            timeline.ancestor_lsn, ancestor.timeline_id
+                        )
                    })?;

                timeline_owned = ancestor;
@@ -2480,27 +2471,9 @@ impl Timeline {
        Ok(())
    }

-    async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Pick the first LSN in the batch to get the layer to write to.
-        for lsns in values.values() {
-            if let Some((lsn, _)) = lsns.first() {
-                let layer = self.get_layer_for_write(*lsn).await?;
-                layer.put_values(values, ctx).await?;
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = tombstones.first() {
-            let layer = self.get_layer_for_write(*lsn).await?;
-            layer.put_tombstones(tombstones).await?;
-        }
+    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_tombstone(key_range, lsn).await?;
        Ok(())
    }

@@ -3062,15 +3035,6 @@ impl Timeline {
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
-                        if self.shard_identity.is_key_disposable(&key) {
-                            debug!(
-                                "Dropping key {} during compaction (it belongs on shard {:?})",
-                                key,
-                                self.shard_identity.get_shard_number(&key)
-                            );
-                            key = key.next();
-                            continue;
-                        }
                        let img = match self.get(key, lsn, ctx).await {
                            Ok(img) => img,
                            Err(err) => {
@@ -3097,7 +3061,6 @@ impl Timeline {
                                }
                            }
                        };
-
                        image_layer_writer.put_image(key, &img).await?;
                        key = key.next();
                    }
@@ -3131,13 +3094,11 @@ impl Timeline {
            .await
            .context("fsync of newly created layer files")?;

-        if !all_paths.is_empty() {
-            par_fsync::par_fsync_async(&[self
-                .conf
-                .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
-            .await
-            .context("fsync of timeline dir")?;
-        }
+        par_fsync::par_fsync_async(&[self
+            .conf
+            .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
+        .await
+        .context("fsync of timeline dir")?;

        let mut guard = self.layers.write().await;

@@ -3670,15 +3631,7 @@ impl Timeline {
                )))
            });

-            if !self.shard_identity.is_key_disposable(&key) {
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
-            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
-            }
+            writer.as_mut().unwrap().put_value(key, lsn, value).await?;

            if !new_layers.is_empty() {
                fail_point!("after-timeline-compacted-first-L1");
@@ -4233,7 +4186,7 @@ impl Timeline {
                    .context("Failed to reconstruct a page image:")
                {
                    Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
+                    Err(e) => return Err(PageReconstructError::from(e)),
                };

                if img.len() == page_cache::PAGE_SZ {
@@ -4576,16 +4529,8 @@ impl<'a> TimelineWriter<'a> {
        self.tl.put_value(key, lsn, value, ctx).await
    }

-    pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
-    }
-
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        self.tl.put_tombstone(key_range, lsn).await
    }

    /// Track the end of the latest digested WAL record.
@@ -4596,11 +4541,11 @@ impl<'a> TimelineWriter<'a> {
    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
-    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
+    pub fn finish_write(&self, new_lsn: Lsn) {
        self.tl.finish_write(new_lsn);
    }

-    pub(crate) fn update_current_logical_size(&self, delta: i64) {
+    pub fn update_current_logical_size(&self, delta: i64) {
        self.tl.update_current_logical_size(delta)
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -58,7 +58,6 @@ pub struct WalReceiverConf {
    pub max_lsn_wal_lag: NonZeroU64,
    pub auth_token: Option<Arc<String>>,
    pub availability_zone: Option<String>,
-    pub ingest_batch_size: u64,
 }

 pub struct WalReceiver {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -411,7 +411,6 @@ impl ConnectionManagerState {

        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
-        let ingest_batch_size = self.conf.ingest_batch_size;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
            TaskKind::WalReceiverConnectionHandler,
@@ -431,7 +430,6 @@ impl ConnectionManagerState {
                    connect_timeout,
                    ctx,
                    node_id,
-                    ingest_batch_size,
                )
                .await;

@@ -1337,7 +1335,7 @@ mod tests {

        ConnectionManagerState {
            id: TenantTimelineId {
-                tenant_id: harness.tenant_shard_id.tenant_id,
+                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
            },
            timeline,
@@ -1347,7 +1345,6 @@ mod tests {
                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                auth_token: None,
                availability_zone: None,
-                ingest_batch_size: 1,
            },
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
@@ -106,7 +106,6 @@ impl From<WalDecodeError> for WalReceiverError {

 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
-#[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
    timeline: Arc<Timeline>,
    wal_source_connconf: PgConnectionConfig,
@@ -115,7 +114,6 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-    ingest_batch_size: u64,
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -307,9 +305,7 @@ pub(super) async fn handle_walreceiver_connection(

                {
                    let mut decoded = DecodedWALRecord::default();
-                    let mut modification = timeline.begin_modification(startlsn);
-                    let mut uncommitted_records = 0;
-                    let mut filtered_records = 0;
+                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
@@ -318,40 +314,14 @@ pub(super) async fn handle_walreceiver_connection(
                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                        }

-                        // Ingest the records without immediately committing them.
-                        let ingested = walingest
+                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
                            .await
                            .with_context(|| format!("could not ingest record at {lsn}"))?;
-                        if !ingested {
-                            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-                            WAL_INGEST.records_filtered.inc();
-                            filtered_records += 1;
-                        }

                        fail_point!("walreceiver-after-ingest");

                        last_rec_lsn = lsn;
-
-                        // Commit every ingest_batch_size records. Even if we filtered out
-                        // all records, we still need to call commit to advance the LSN.
-                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
-                        }
-                    }
-
-                    // Commit the remaining records.
-                    if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
                    }
                }

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -18,8 +18,7 @@ use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::time::Instant;
+use std::sync::{RwLock, RwLockWriteGuard};
 use utils::fs_ext;

 ///
@@ -112,7 +111,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -144,7 +143,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().await;
+                slot_guard = slot.inner.write().unwrap();
                index = next;
                break;
            }
@@ -251,29 +250,6 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
    }
 }

-/// Observe duration for the given storage I/O operation
-///
-/// Unlike `observe_closure_duration`, this supports async,
-/// where "support" means that we measure wall clock time.
-macro_rules! observe_duration {
-    ($op:expr, $($body:tt)*) => {{
-        let instant = Instant::now();
-        let result = $($body)*;
-        let elapsed = instant.elapsed().as_secs_f64();
-        STORAGE_IO_TIME_METRIC
-            .get($op)
-            .observe(elapsed);
-        result
-    }}
-}
-
-macro_rules! with_file {
-    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
-        let $ident = $this.lock_file().await?;
-        observe_duration!($op, $($body)*)
-    }};
-}
-
 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -310,12 +286,14 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
        // where our caller doesn't get to use the returned VirtualFile before its
        // slot gets re-used by someone else.
-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
        //
@@ -388,24 +366,22 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
+            .await?
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
+            .await?
    }

-    /// Helper function internal to `VirtualFile` that looks up the underlying File,
-    /// opens it and evicts some other File if necessary. The passed parameter is
-    /// assumed to be a function available for the physical `File`.
-    ///
-    /// We are doing it via a macro as Rust doesn't support async closures that
-    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    /// Helper function that looks up the underlying File for this VirtualFile,
+    /// opening it and evicting some other File if necessary. It calls 'func'
+    /// with the physical File.
+    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
+    where
+        F: FnMut(&File) -> R,
+    {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -415,23 +391,27 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().await;
+            let mut handle = *self.handle.read().unwrap();
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().await;
-                    if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
-                        // Found a cached file descriptor.
-                        slot.recently_used.store(true, Ordering::Relaxed);
-                        return Ok(FileGuard { slot_guard });
+                    let slot_guard = slot.inner.read().unwrap();
+                    if slot_guard.tag == handle.tag {
+                        if let Some(file) = &slot_guard.file {
+                            // Found a cached file descriptor.
+                            slot.recently_used.store(true, Ordering::Relaxed);
+                            return Ok(STORAGE_IO_TIME_METRIC
+                                .get(op)
+                                .observe_closure_duration(|| func(file)));
+                        }
                    }
                }

                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().await;
+                let handle_guard = self.handle.write().unwrap();

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -445,16 +425,20 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot().await;
+        let (handle, mut slot_guard) = open_files.find_victim_slot();

        // Re-open the physical file.
        // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
        // case from StorageIoOperation::Open. This helps with identifying thrashing
        // of the virtual file descriptor cache.
-        let file = observe_duration!(
-            StorageIoOperation::OpenAfterReplace,
-            self.open_options.open(&self.path)
-        )?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::OpenAfterReplace)
+            .observe_closure_duration(|| self.open_options.open(&self.path))?;
+
+        // Perform the requested operation on it
+        let result = STORAGE_IO_TIME_METRIC
+            .get(op)
+            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -462,9 +446,7 @@ impl VirtualFile {

        *handle_guard = handle;

-        return Ok(FileGuard {
-            slot_guard: slot_guard.downgrade(),
-        });
+        Ok(result)
    }

    pub fn remove(self) {
@@ -479,9 +461,11 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = self
+                    .with_file(StorageIoOperation::Seek, |mut file| {
+                        file.seek(SeekFrom::End(offset))
+                    })
+                    .await??
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
@@ -569,9 +553,9 @@ impl VirtualFile {
    }

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -581,9 +565,9 @@ impl VirtualFile {
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -593,18 +577,6 @@ impl VirtualFile {
    }
 }

-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
-}
-
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
-        // This unwrap is safe because we only create `FileGuard`s
-        // if we know that the file is Some.
-        self.slot_guard.file.as_ref().unwrap()
-    }
-}
-
 #[cfg(test)]
 impl VirtualFile {
    pub(crate) async fn read_blk(
@@ -637,41 +609,22 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut();
+        let handle = self.handle.get_mut().unwrap();

-        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
-            if slot_guard.tag == tag {
-                slot.recently_used.store(false, Ordering::Relaxed);
-                // there is also operation "close-by-replace" for closes done on eviction for
-                // comparison.
-                if let Some(fd) = slot_guard.file.take() {
-                    STORAGE_IO_TIME_METRIC
-                        .get(StorageIoOperation::Close)
-                        .observe_closure_duration(|| drop(fd));
-                }
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
+        let slot = &get_open_files().slots[handle.index];
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            // there is also operation "close-by-replace" for closes done on eviction for
+            // comparison.
+            if let Some(fd) = slot_guard.file.take() {
+                STORAGE_IO_TIME_METRIC
+                    .get(StorageIoOperation::Close)
+                    .observe_closure_duration(|| drop(fd));
            }
        }
-
-        // We don't have async drop so we cannot directly await the lock here.
-        // Instead, first do a best-effort attempt at closing the underlying
-        // file descriptor by using `try_write`, and if that fails, spawn
-        // a tokio task to do it asynchronously: we just want it to be
-        // cleaned up eventually.
-        // Most of the time, the `try_lock` should succeed though,
-        // as we have `&mut self` access. In other words, if the slot
-        // is still occupied by our file, there should be no access from
-        // other I/O operations; the only other possible place to lock
-        // the slot is the lock algorithm looking for free slots.
-        let slot = &get_open_files().slots[handle.index];
-        if let Ok(slot_guard) = slot.inner.try_write() {
-            clean_slot(slot, slot_guard, handle.tag);
-        } else {
-            let tag = handle.tag;
-            tokio::spawn(async move {
-                let slot_guard = slot.inner.write().await;
-                clean_slot(slot, slot_guard, tag);
-            });
-        };
    }
 }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -29,7 +29,6 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
-use utils::failpoint_support;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
@@ -48,18 +47,20 @@ use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

-pub struct WalIngest {
+pub struct WalIngest<'a> {
    shard: ShardIdentity,
+    timeline: &'a Timeline,
+
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
 }

-impl WalIngest {
+impl<'a> WalIngest<'a> {
    pub async fn new(
-        timeline: &Timeline,
+        timeline: &'a Timeline,
        startpoint: Lsn,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+        ctx: &'_ RequestContext,
+    ) -> anyhow::Result<WalIngest<'a>> {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -68,6 +69,7 @@ impl WalIngest {

        Ok(WalIngest {
            shard: *timeline.get_shard_identity(),
+            timeline,
            checkpoint,
            checkpoint_modified: false,
        })
@@ -81,8 +83,6 @@ impl WalIngest {
    /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
    /// relations/pages that the record affects.
    ///
-    /// This function returns `true` if the record was ingested, and `false` if it was filtered out
-    ///
    pub async fn ingest_record(
        &mut self,
        recdata: Bytes,
@@ -90,13 +90,11 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> anyhow::Result<()> {
        WAL_INGEST.records_received.inc();
-        let pg_version = modification.tline.pg_version;
-        let prev_len = modification.len();

-        modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, pg_version)?;
+        modification.lsn = lsn;
+        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;

        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);
@@ -133,9 +131,9 @@ impl WalIngest {
            }
            pg_constants::RM_DBASE_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                debug!(%info, %pg_version, "handle RM_DBASE_ID");
+                debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");

-                if pg_version == 14 {
+                if self.timeline.pg_version == 14 {
                    if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
                        let createdb = XlCreateDatabase::decode(&mut buf);
                        debug!("XLOG_DBASE_CREATE v14");
@@ -151,7 +149,7 @@ impl WalIngest {
                                .await?;
                        }
                    }
-                } else if pg_version == 15 {
+                } else if self.timeline.pg_version == 15 {
                    if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                    } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -171,7 +169,7 @@ impl WalIngest {
                                .await?;
                        }
                    }
-                } else if pg_version == 16 {
+                } else if self.timeline.pg_version == 16 {
                    if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                    } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -346,7 +344,9 @@ impl WalIngest {
                        // particular point in the WAL. For more fine-grained control,
                        // we could peek into the message and only pause if it contains
                        // a particular string, for example, but this is enough for now.
-                        failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
+                        crate::failpoint_support::sleep_millis_async!(
+                            "wal-ingest-logical-message-sleep"
+                        );
                    } else if let Some(path) = prefix.strip_prefix("neon-file:") {
                        modification.put_file(path, message, ctx).await?;
                    }
@@ -400,11 +400,19 @@ impl WalIngest {
            self.checkpoint_modified = false;
        }

-        // Note that at this point this record is only cached in the modification
-        // until commit() is called to flush the data into the repository and update
-        // the latest LSN.
+        if modification.is_empty() {
+            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+            WAL_INGEST.records_filtered.inc();
+            modification.tline.finish_write(lsn);
+        } else {
+            WAL_INGEST.records_committed.inc();
+            modification.commit(ctx).await?;
+        }

-        Ok(modification.len() > prev_len)
+        // At this point the record has been fully handled, including updating the
+        // checkpoint data, and the timeline was advanced to this LSN by the branch above.
+
+        Ok(())
    }

    /// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -451,7 +459,7 @@ impl WalIngest {
            && (decoded.xl_info == pg_constants::XLOG_FPI
                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
            // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
            // do not materialize null pages because them most likely be soon replaced with real data
            && blk.bimg_len != 0
        {
@@ -504,7 +512,7 @@ impl WalIngest {
        let mut old_heap_blkno: Option<u32> = None;
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

-        match modification.tline.pg_version {
+        match self.timeline.pg_version {
            14 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -728,7 +736,7 @@ impl WalIngest {
            // replaying it would fail to find the previous image of the page, because
            // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
            // record if it doesn't.
-            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
            if let Some(blknum) = new_vm_blk {
                if blknum >= vm_size {
                    new_vm_blk = None;
@@ -809,11 +817,10 @@ impl WalIngest {
        let mut new_heap_blkno: Option<u32> = None;
        let mut old_heap_blkno: Option<u32> = None;
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
-        let pg_version = modification.tline.pg_version;

        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

-        match pg_version {
+        match self.timeline.pg_version {
            16 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -876,7 +883,7 @@ impl WalIngest {
            }
            _ => bail!(
                "Neon RMGR has no known compatibility with PostgreSQL version {}",
-                pg_version
+                self.timeline.pg_version
            ),
        }

@@ -899,7 +906,7 @@ impl WalIngest {
            // replaying it would fail to find the previous image of the page, because
            // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
            // record if it doesn't.
-            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
            if let Some(blknum) = new_vm_blk {
                if blknum >= vm_size {
                    new_vm_blk = None;
@@ -977,14 +984,16 @@ impl WalIngest {
        let src_db_id = rec.src_db_id;
        let src_tablespace_id = rec.src_tablespace_id;

+        // Creating a database is implemented by copying the template (a.k.a. source) database.
+        // To copy all the relations, we need to ask for the state as of the same LSN, but we
+        // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
+        // the last valid LSN to advance up to it. So we use the previous record's LSN in the
+        // get calls instead.
+        let req_lsn = modification.tline.get_last_record_lsn();
+
        let rels = modification
            .tline
-            .list_rels(
-                src_tablespace_id,
-                src_db_id,
-                Version::Modified(modification),
-                ctx,
-            )
+            .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
            .await?;

        debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -992,12 +1001,7 @@ impl WalIngest {
        // Copy relfilemap
        let filemap = modification
            .tline
-            .get_relmap_file(
-                src_tablespace_id,
-                src_db_id,
-                Version::Modified(modification),
-                ctx,
-            )
+            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
            .await?;
        modification
            .put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1011,7 +1015,7 @@ impl WalIngest {

            let nblocks = modification
                .tline
-                .get_rel_size(src_rel, Version::Modified(modification), true, ctx)
+                .get_rel_size(src_rel, req_lsn, true, ctx)
                .await?;
            let dst_rel = RelTag {
                spcnode: tablespace_id,
@@ -1029,13 +1033,7 @@ impl WalIngest {

                let content = modification
                    .tline
-                    .get_rel_page_at_lsn(
-                        src_rel,
-                        blknum,
-                        Version::Modified(modification),
-                        true,
-                        ctx,
-                    )
+                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
                    .await?;
                modification.put_rel_page_image(dst_rel, blknum, content)?;
                num_blocks_copied += 1;
@@ -1106,7 +1104,7 @@ impl WalIngest {
                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                fsm_physical_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
            if nblocks > fsm_physical_page_no {
                // check if something to do: FSM is larger than truncate position
                self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1128,7 +1126,7 @@ impl WalIngest {
                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                vm_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
            if nblocks > vm_page_no {
                // check if something to do: VM is larger than truncate position
                self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1201,9 +1199,10 @@ impl WalIngest {
                    dbnode: xnode.dbnode,
                    relnode: xnode.relnode,
                };
+                let last_lsn = self.timeline.get_last_record_lsn();
                if modification
                    .tline
-                    .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+                    .get_rel_exists(rel, last_lsn, true, ctx)
                    .await?
                {
                    self.put_rel_drop(modification, rel, ctx).await?;
@@ -1257,9 +1256,10 @@ impl WalIngest {
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
+        let req_lsn = modification.tline.get_last_record_lsn();
        for segno in modification
            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+            .list_slru_segments(SlruKind::Clog, req_lsn, ctx)
            .await?
        {
            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1471,6 +1471,20 @@ impl WalIngest {
        Ok(())
    }

+    async fn get_relsize(
+        &mut self,
+        rel: RelTag,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<BlockNumber> {
+        let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
+            0
+        } else {
+            self.timeline.get_rel_size(rel, lsn, true, ctx).await?
+        };
+        Ok(nblocks)
+    }
+
    async fn handle_rel_extend(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -1482,6 +1496,7 @@ impl WalIngest {
        // Check if the relation exists. We implicitly create relations on first
        // record.
        // TODO: would be nice if to be more explicit about it
+        let last_lsn = modification.lsn;

        // Get current size and put rel creation if rel doesn't exist
        //
@@ -1489,14 +1504,11 @@ impl WalIngest {
        //       check the cache too. This is because eagerly checking the cache results in
        //       less work overall and 10% better performance. It's more work on cache miss
        //       but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = modification
-            .tline
-            .get_cached_rel_size(&rel, modification.get_lsn())
-        {
+        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
            nblocks
-        } else if !modification
-            .tline
-            .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+        } else if !self
+            .timeline
+            .get_rel_exists(rel, last_lsn, true, ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
@@ -1506,10 +1518,7 @@ impl WalIngest {
                .context("Relation Error")?;
            0
        } else {
-            modification
-                .tline
-                .get_rel_size(rel, Version::Modified(modification), true, ctx)
-                .await?
+            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
        };

        if new_nblocks > old_nblocks {
@@ -1562,9 +1571,10 @@ impl WalIngest {
        // Check if the relation exists. We implicitly create relations on first
        // record.
        // TODO: would be nice if to be more explicit about it
-        let old_nblocks = if !modification
-            .tline
-            .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
+        let last_lsn = self.timeline.get_last_record_lsn();
+        let old_nblocks = if !self
+            .timeline
+            .get_slru_segment_exists(kind, segno, last_lsn, ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
@@ -1573,9 +1583,8 @@ impl WalIngest {
                .await?;
            0
        } else {
-            modification
-                .tline
-                .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
+            self.timeline
+                .get_slru_segment_size(kind, segno, last_lsn, ctx)
                .await?
        };

@@ -1598,26 +1607,6 @@ impl WalIngest {
    }
 }

-async fn get_relsize(
-    modification: &DatadirModification<'_>,
-    rel: RelTag,
-    ctx: &RequestContext,
-) -> anyhow::Result<BlockNumber> {
-    let nblocks = if !modification
-        .tline
-        .get_rel_exists(rel, Version::Modified(modification), true, ctx)
-        .await?
-    {
-        0
-    } else {
-        modification
-            .tline
-            .get_rel_size(rel, Version::Modified(modification), true, ctx)
-            .await?
-    };
-    Ok(nblocks)
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
@@ -1644,7 +1633,10 @@ mod tests {

    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

-    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
+    async fn init_walingest_test<'a>(
+        tline: &'a Timeline,
+        ctx: &RequestContext,
+    ) -> Result<WalIngest<'a>> {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1689,29 +1681,29 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
            .await
            .is_err());
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            3
        );
@@ -1719,46 +1711,46 @@ mod tests {
        // Check page contents at each LSN
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 2")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 2 at 5")
        );
@@ -1774,19 +1766,19 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );
@@ -1794,13 +1786,13 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            3
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 2 at 5")
        );
@@ -1813,7 +1805,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
                .await?,
            0
        );
@@ -1826,19 +1818,19 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
                .await?,
            ZERO_PAGE
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1")
        );
@@ -1851,21 +1843,21 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            1501
        );
        for blk in 2..1500 {
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
                    .await?,
                ZERO_PAGE
            );
        }
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1500")
        );
@@ -1892,13 +1884,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            1
        );
@@ -1911,7 +1903,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
                .await?,
            false
        );
@@ -1929,13 +1921,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
                .await?,
            1
        );
@@ -1968,24 +1960,24 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
            .await
            .is_err());

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            relsize
        );
@@ -1996,7 +1988,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2013,7 +2005,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
                .await?,
            1
        );
@@ -2023,7 +2015,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2032,7 +2024,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            relsize
        );
@@ -2041,7 +2033,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2061,13 +2053,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            relsize
        );
@@ -2077,7 +2069,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2110,9 +2102,7 @@ mod tests {
        assert_current_logical_size(&tline, Lsn(lsn));

        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE + 1
        );

@@ -2124,9 +2114,7 @@ mod tests {
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
        );
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -2139,9 +2127,7 @@ mod tests {
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
        );
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -2157,9 +2143,7 @@ mod tests {
                .await?;
            m.commit(&ctx).await?;
            assert_eq!(
-                tline
-                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                    .await?,
+                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
            );

@@ -2196,7 +2180,7 @@ mod tests {
        let wal_segment_path = format!("{path}/000000010000000000000001.zst");
        let source_initdb_path = format!("{path}/{INITDB_PATH}");
        let startpoint = Lsn::from_hex("14AEC08").unwrap();
-        let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
+        let endpoint = Lsn::from_hex("1FFFF98").unwrap();

        let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
        let (tenant, ctx) = harness.load().await;
@@ -2238,7 +2222,7 @@ mod tests {
        let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
            .await
            .unwrap();
-        let mut modification = tline.begin_modification(startpoint);
+        let mut modification = tline.begin_modification(endpoint);
        let mut decoded = DecodedWALRecord::default();
        println!("decoding {} bytes", bytes.len() - xlogoff);

@@ -2252,7 +2236,6 @@ mod tests {
                    .await
                    .unwrap();
            }
-            modification.commit(&ctx).await.unwrap();
        }

        let duration = started_at.elapsed();
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,6 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
-use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -36,11 +35,14 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

+#[cfg(feature = "testing")]
+use pageserver_api::shard::TenantShardId;
+
 use crate::config::PageServerConf;
 use crate::metrics::{
    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -90,7 +92,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -184,13 +186,10 @@ impl PostgresRedoManager {
    ///
    /// Create a new PostgresRedoManager.
    ///
-    pub fn new(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-    ) -> PostgresRedoManager {
+    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
-            tenant_shard_id,
+            tenant_id,
            conf,
            last_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
@@ -245,12 +244,8 @@ impl PostgresRedoManager {
                                let timer =
                                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                let proc = Arc::new(
-                                    WalRedoProcess::launch(
-                                        self.conf,
-                                        self.tenant_shard_id,
-                                        pg_version,
-                                    )
-                                    .context("launch walredo process")?,
+                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
+                                        .context("launch walredo process")?,
                                );
                                timer.observe_duration();
                                *proc_guard = Some(Arc::clone(&proc));
@@ -643,7 +638,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
@@ -657,10 +652,10 @@ impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
    fn launch(
        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -685,7 +680,7 @@ impl WalRedoProcess {
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
            .close_fds()
-            .spawn_no_leak_child(tenant_shard_id)
+            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
@@ -746,12 +741,12 @@ impl WalRedoProcess {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
        );

        Ok(Self {
            conf,
-            tenant_shard_id,
+            tenant_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
                stdin,
@@ -777,7 +772,7 @@ impl WalRedoProcess {
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
    fn apply_wal_records(
        &self,
        tag: BufferTag,
@@ -971,7 +966,11 @@ impl WalRedoProcess {
        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
+        let path = self
+            .conf
+            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
+            .join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
@@ -1005,7 +1004,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantShardId,
+    tenant_id: TenantId,
    child: Option<Child>,
 }

@@ -1024,7 +1023,7 @@ impl DerefMut for NoLeakChild {
 }

 impl NoLeakChild {
-    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
        let child = command.spawn()?;
        Ok(NoLeakChild {
            tenant_id,
@@ -1079,7 +1078,7 @@ impl Drop for NoLeakChild {
            Some(child) => child,
            None => return,
        };
-        let tenant_shard_id = self.tenant_id;
+        let tenant_id = self.tenant_id;
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1087,11 +1086,7 @@ impl Drop for NoLeakChild {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread is going to outlive our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
+                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
@@ -1101,11 +1096,11 @@ impl Drop for NoLeakChild {
 }

 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
 }

 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
        NoLeakChild::spawn(tenant_id, self)
    }
 }
@@ -1160,7 +1155,6 @@ mod tests {
    use crate::repository::Key;
    use crate::{config::PageServerConf, walrecord::NeonWalRecord};
    use bytes::Bytes;
-    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
    use utils::{id::TenantId, lsn::Lsn};

@@ -1270,9 +1264,9 @@ mod tests {
            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
-            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+            let tenant_id = TenantId::generate();

-            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+            let manager = PostgresRedoManager::new(conf, tenant_id);

            Ok(RedoHarness {
                _repo_dir: repo_dir,
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,8 +35,7 @@

 #define PageStoreTrace DEBUG5

-#define MIN_RECONNECT_INTERVAL_USEC 1000
-#define MAX_RECONNECT_INTERVAL_USEC 1000000
+#define RECONNECT_INTERVAL_USEC 1000000

 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;
@@ -134,11 +133,6 @@ pageserver_connect(int elevel)
 	const char *values[3];
 	int			n;

-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-        uint64_t us_since_last_connect;
-
 	Assert(!connected);

 	if (CheckConnstringUpdated())
@@ -146,22 +140,6 @@ pageserver_connect(int elevel)
 		ReloadConnstring();
 	}

-	now = GetCurrentTimestamp();
-        us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < delay_us)
-	{
-		pg_usleep(delay_us - us_since_last_connect);
-		delay_us *= 2;
-		if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
-			delay_us = MAX_RECONNECT_INTERVAL_USEC;
-		last_connect_time = GetCurrentTimestamp();
-	}
-	else
-	{
-		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-		last_connect_time = now;
-	}
-
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -355,6 +333,7 @@ pageserver_send(NeonRequest *request)
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
+			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
 		n_reconnect_attempts = 0;
 	}
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		port = strchr(host, ':');
 		if (port == NULL)
 		{
-			wp_log(FATAL, "port is not specified");
+			walprop_log(FATAL, "port is not specified");
 		}
 		*port++ = '\0';
 		sep = strchr(port, ',');
@@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 			*sep++ = '\0';
 		if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
 		{
-			wp_log(FATAL, "too many safekeepers");
+			walprop_log(FATAL, "Too many safekeepers");
 		}
 		wp->safekeeper[wp->n_safekeepers].host = host;
 		wp->safekeeper[wp->n_safekeepers].port = port;
@@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
 							   sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
 			if (written > MAXCONNINFO || written < 0)
-				wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
+				walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}

 		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
@@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	}
 	if (wp->n_safekeepers < 1)
 	{
-		wp_log(FATAL, "safekeepers addresses are not specified");
+		walprop_log(FATAL, "Safekeepers addresses are not specified");
 	}
 	wp->quorum = wp->n_safekeepers / 2 + 1;

@@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
 	wp->greetRequest.systemId = wp->config->systemId;
 	if (!wp->config->neon_timeline)
-		wp_log(FATAL, "neon.timeline_id is not provided");
+		walprop_log(FATAL, "neon.timeline_id is not provided");
 	if (*wp->config->neon_timeline != '\0' &&
 		!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
-		wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
+		walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
 	if (!wp->config->neon_tenant)
-		wp_log(FATAL, "neon.tenant_id is not provided");
+		walprop_log(FATAL, "neon.tenant_id is not provided");
 	if (*wp->config->neon_tenant != '\0' &&
 		!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
-		wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
+		walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);

 	wp->greetRequest.timeline = wp->config->pgTimeline;
 	wp->greetRequest.walSegSize = wp->config->wal_segment_size;
@@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wp->config->safekeeper_connection_timeout))
 				{
-					wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-						   sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
+					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
+								sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk)
 		 *
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
-		wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s",
-			   sk->host, sk->port, wp->api.conn_error_message(sk));
+		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
+					sk->host, sk->port, wp->api.conn_error_message(sk));

 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk)
 	 * (see libpqrcv_connect, defined in
 	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
 	 */
-	wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
+	walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);

 	sk->state = SS_CONNECTING_WRITE;
 	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
@@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
-#ifdef WALPROPOSER_LIB			/* wp_log needs wp in lib build */
+#ifdef WALPROPOSER_LIB			/* walprop_log needs wp in lib build */
 	WalProposer *wp = sk->wp;
 #endif

@@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * ResetConnection
 			 */
 		case SS_OFFLINE:
-			wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
-				   sk->host, sk->port);
+			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+						sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */

@@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * requests.
 			 */
 		case SS_VOTING:
-			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				   sk->port, FormatSafekeeperState(sk));
+			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+						sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;

@@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * Idle state for waiting votes from quorum.
 			 */
 		case SS_IDLE:
-			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				   sk->port, FormatSafekeeperState(sk));
+			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+						sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;

@@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk)
 	switch (result)
 	{
 		case WP_CONN_POLLING_OK:
-			wp_log(LOG, "connected with node %s:%s", sk->host,
-				   sk->port);
+			walprop_log(LOG, "connected with node %s:%s", sk->host,
+						sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 			/*
@@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk)
 			break;

 		case WP_CONN_POLLING_FAILED:
-			wp_log(WARNING, "failed to connect to node '%s:%s': %s",
-				   sk->host, sk->port, wp->api.conn_error_message(sk));
+			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
+						sk->host, sk->port, wp->api.conn_error_message(sk));

 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk)

 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
-		wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-			   sk->host, sk->port, wp->api.conn_error_message(sk));
+		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
+					sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 			break;

 		case WP_EXEC_FAILED:
-			wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
-				   sk->host, sk->port, wp->api.conn_error_message(sk));
+			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+						sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;

@@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 * wrong"
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
-			wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
-				   sk->host, sk->port);
+			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
+						sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
 		return;

-	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+	walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);

 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
@@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		if (wp->n_connected == wp->quorum)
 		{
 			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
+			walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

 			wp->voteRequest = (VoteRequest)
 			{
@@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	else if (sk->greetResponse.term > wp->propTerm)
 	{
 		/* Another compute with higher term is running. */
-		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			   sk->host, sk->port,
-			   sk->greetResponse.term, wp->propTerm);
+		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+					sk->host, sk->port,
+					sk->greetResponse.term, wp->propTerm);
 	}

 	/*
@@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk)
 	WalProposer *wp = sk->wp;

 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
+	walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
 		return;
@@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
 		return;

-	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-		   LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-		   LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+	walprop_log(LOG,
+				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk)
 	if ((!sk->voteResponse.voteGiven) &&
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
-		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			   sk->host, sk->port,
-			   sk->voteResponse.term, wp->propTerm);
+		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+					sk->host, sk->port,
+					sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);

@@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp)
 	 */
 	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
 	{
-		wp_log(FATAL, "failed to download WAL for logical replicaiton");
+		walprop_log(FATAL, "failed to download WAL for logical replicaiton");
 	}

 	if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
@@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp)
 				if (wp->timelineStartLsn != InvalidXLogRecPtr &&
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
-					wp_log(WARNING,
-						   "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-						   LSN_FORMAT_ARGS(wp->timelineStartLsn),
-						   LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+					walprop_log(WARNING,
+								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+								LSN_FORMAT_ARGS(wp->timelineStartLsn),
+								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -969,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 		{
 			wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
 		}
-		wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+		walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 	}

 	/*
@@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
-		   wp->propTerm,
-		   LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-		   LSN_FORMAT_ARGS(wp->truncateLsn));
+	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+				wp->quorum,
+				wp->propTerm,
+				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+				LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp)
 				 * scenario.
 				 */
 				disable_core_dump();
-				wp_log(PANIC,
-					   "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-					   LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-					   LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+				walprop_log(PANIC,
+							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
@@ -1091,10 +1091,34 @@ SendProposerElected(Safekeeper *sk)
 	{
 		/* safekeeper is empty or no common point, start from the beginning */
 		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
-		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
-		 	 sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
-		/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
-		Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
+
+		if (sk->startStreamingAt < wp->truncateLsn)
+		{
+			/*
+			 * There's a gap between the WAL starting point and truncateLsn,
+			 * which can't appear in a normally working cluster. That gap means
+			 * that all safekeepers previously reported having persisted WAL up
+			 * to truncateLsn, but the current safekeeper now reports
+			 * otherwise.
+			 *
+			 * Also we have a special condition here, which is empty
+			 * safekeeper with no history. In combination with a gap, that can
+			 * happen when we introduce a new safekeeper to the cluster. This
+			 * is a rare case, which is triggered manually for now, and should
+			 * be treated with care.
+			 */
+
+			/*
+			 * truncateLsn will not change without ack from current
+			 * safekeeper, and it's aligned to the WAL record, so we can
+			 * safely start streaming from this point.
+			 */
+			sk->startStreamingAt = wp->truncateLsn;
+
+			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
+						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+						LSN_FORMAT_ARGS(sk->startStreamingAt));
+		}
 	}
 	else
 	{
@@ -1117,7 +1141,7 @@ SendProposerElected(Safekeeper *sk)
 		}
 	}

-	Assert(sk->startStreamingAt <= wp->availableLsn);
+	Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);

 	msg.tag = 'e';
 	msg.term = wp->propTerm;
@@ -1126,9 +1150,9 @@ SendProposerElected(Safekeeper *sk)
 	msg.timelineStartLsn = wp->timelineStartLsn;

 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
-	wp_log(LOG,
-		   "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-		   sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+	walprop_log(LOG,
+				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1237,8 +1261,8 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 	/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
 	if (events & WL_SOCKET_CLOSED)
 	{
-		wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
-			   sk->host, sk->port);
+		walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
+					sk->host, sk->port);
 		ShutdownConnection(sk);
 		return;
 	}
@@ -1299,12 +1323,12 @@ SendAppendRequests(Safekeeper *sk)
 			req = &sk->appendRequest;
 			PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

-			wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-				   req->endLsn - req->beginLsn,
-				   LSN_FORMAT_ARGS(req->beginLsn),
-				   LSN_FORMAT_ARGS(req->endLsn),
-				   LSN_FORMAT_ARGS(req->commitLsn),
-				   LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+			walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+						req->endLsn - req->beginLsn,
+						LSN_FORMAT_ARGS(req->beginLsn),
+						LSN_FORMAT_ARGS(req->endLsn),
+						LSN_FORMAT_ARGS(req->commitLsn),
+						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

 			resetStringInfo(&sk->outbuf);

@@ -1331,8 +1355,8 @@ SendAppendRequests(Safekeeper *sk)
 				case NEON_WALREAD_WOULDBLOCK:
 					return true;
 				case NEON_WALREAD_ERROR:
-					wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
-						   sk->host, sk->port, errmsg);
+					walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
+								sk->host, sk->port, errmsg);
 					ShutdownConnection(sk);
 					return false;
 				default:
@@ -1364,9 +1388,9 @@ SendAppendRequests(Safekeeper *sk)
 					return true;

 				case PG_ASYNC_WRITE_FAIL:
-					wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
-						   sk->host, sk->port, FormatSafekeeperState(sk),
-						   wp->api.conn_error_message(sk));
+					walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+								sk->host, sk->port, FormatSafekeeperState(sk),
+								wp->api.conn_error_message(sk));
 					ShutdownConnection(sk);
 					return false;
 				default:
@@ -1405,11 +1429,11 @@ RecvAppendResponses(Safekeeper *sk)
 		if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
 			break;

-		wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-			   sk->appendResponse.term,
-			   LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-			   LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-			   sk->host, sk->port);
+		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
+					sk->appendResponse.term,
+					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+					sk->host, sk->port);

 		if (sk->appendResponse.term > wp->propTerm)
 		{
@@ -1419,9 +1443,9 @@ RecvAppendResponses(Safekeeper *sk)
 			 * core as this is kinda expected scenario.
 			 */
 			disable_core_dump();
-			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-				   sk->host, sk->port,
-				   sk->appendResponse.term, wp->propTerm);
+			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
+						sk->host, sk->port,
+						sk->appendResponse.term, wp->propTerm);
 		}

 		readAnything = true;
@@ -1465,32 +1489,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-				   rf->currentClusterSize);
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
+						rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->last_received_lsn));
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+						LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1502,8 +1526,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese

 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-				wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-					   rf->replytime, replyTimeStr);
+				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+							rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
 			}
@@ -1517,7 +1541,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			 * Skip unknown keys to support backward-compatible protocol
 			 * changes
 			 */
-			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
+			walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 			pq_getmsgbytes(reply_message, len);
 		};
 	}
@@ -1582,7 +1606,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)

 	if (wp->n_votes < wp->quorum)
 	{
-		wp_log(WARNING, "GetDonor called before elections are won");
+		walprop_log(WARNING, "GetDonor called before elections are won");
 		return NULL;
 	}

@@ -1710,9 +1734,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 			return false;

 		case PG_ASYNC_READ_FAIL:
-			wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host,
-				   sk->port, FormatSafekeeperState(sk),
-				   wp->api.conn_error_message(sk));
+			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
+						sk->port, FormatSafekeeperState(sk),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1750,8 +1774,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	tag = pq_getmsgint64_le(&s);
 	if (tag != anymsg->tag)
 	{
-		wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-			   sk->port, FormatSafekeeperState(sk));
+		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+					sk->port, FormatSafekeeperState(sk));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1827,9 +1851,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes

 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
-		wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
-			   sk->host, sk->port, FormatSafekeeperState(sk),
-			   wp->api.conn_error_message(sk));
+		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
+					sk->host, sk->port, FormatSafekeeperState(sk),
+					wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1880,9 +1904,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
-			wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
-				   sk->host, sk->port, FormatSafekeeperState(sk),
-				   wp->api.conn_error_message(sk));
+			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
+						sk->host, sk->port, FormatSafekeeperState(sk),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1919,9 +1943,9 @@ AsyncFlush(Safekeeper *sk)
 			/* Nothing to do; try again when the socket's ready */
 			return false;
 		case -1:
-			wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s",
-				   sk->host, sk->port, FormatSafekeeperState(sk),
-				   wp->api.conn_error_message(sk));
+			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
+						sk->host, sk->port, FormatSafekeeperState(sk),
+						wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -1950,11 +1974,11 @@ CompareLsn(const void *a, const void *b)
 *
 * The strings are intended to be used as a prefix to "state", e.g.:
 *
- *   wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
+ *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
 *
 * If this sort of phrasing doesn't fit the message, instead use something like:
 *
- *   wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
+ *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
 */
 static char *
 FormatSafekeeperState(Safekeeper *sk)
@@ -2035,8 +2059,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * To give a descriptive message in the case of failure, we use elog
 		 * and then an assertion that's guaranteed to fail.
 		 */
-		wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			   FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
+		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2175,8 +2199,8 @@ FormatEvents(WalProposer *wp, uint32 events)

 	if (events & (~all_flags))
 	{
-		wp_log(WARNING, "event formatting found unexpected component %d",
-			   events & (~all_flags));
+		walprop_log(WARNING, "Event formatting found unexpected component %d",
+					events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -707,23 +707,11 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
 #define WPEVENT		1337		/* special log level for walproposer internal
 								 * events */

-#define WP_LOG_PREFIX "[WP] "
-
-/*
- * wp_log is used in pure wp code (walproposer.c), allowing API callback to
- * catch logging.
- */
 #ifdef WALPROPOSER_LIB
 extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
-#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
+#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
-#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
+#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
 #endif

-/*
- * And wpg_log is used all other (postgres specific) walproposer code, just
- * adding prefix.
- */
-#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
-
 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
 {
 	StartReplicationCmd cmd;

-	wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
-			LSN_FORMAT_ARGS(startpos));
+	elog(LOG, "WAL proposer starts streaming at %X/%X",
+		 LSN_FORMAT_ARGS(startpos));
 	cmd.slotname = WAL_PROPOSER_SLOT_NAME;
 	cmd.timeline = wp->greetRequest.timeline;
 	cmd.startpoint = startpos;
@@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void)
 {
 	load_file("libpqwalreceiver", false);
 	if (WalReceiverFunctions == NULL)
-		wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
+		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
 }

 /* Helper function */
@@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo)
 	 * PGconn structure"
 	 */
 	if (!pg_conn)
-		wpg_log(FATAL, "failed to allocate new PGconn object");
+		elog(FATAL, "failed to allocate new PGconn object");

 	/*
 	 * And in theory this allocation can fail as well, but it's incredibly
@@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
 			 * unused. We'll expect it's never returned.
 			 */
 		case PGRES_POLLING_ACTIVE:
-			wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
+			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");

 			/*
 			 * This return is never actually reached, but it's here to make
@@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	 */
 	if (!result)
 	{
-		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
+		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
 		return WP_EXEC_UNEXPECTED_SUCCESS;
 	}

@@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	}

 	if (unexpected_success)
-		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
+		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);

 	return return_val;
 }
@@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));

 				if (status != PGRES_FATAL_ERROR)
-					wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);

 				/*
 				 * If there was actually an error, it'll be properly reported
@@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
+			elog(FATAL, "invalid return %d from PQputCopyData", result);
 	}

 	/*
@@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			wpg_log(FATAL, "invalid return %d from PQflush", result);
+			elog(FATAL, "invalid return %d from PQflush", result);
 	}
 }

@@ -1237,6 +1237,19 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 		return true;			/* recovery not needed */
 	endpos = wp->propEpochStartLsn;

+	/*
+	 * If we need to download more than a max_slot_wal_keep_size, cap to it to
+	 * avoid risk of exploding pg_wal. Logical replication won't work until
+	 * recreated, but at least compute would start; this also follows
+	 * max_slot_wal_keep_size semantics.
+	 */
+	download_range_mb = (endpos - startpos) / 1024 / 1024;
+	if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
+	{
+		startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
+		walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
+					LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
+	}
 	timeline = wp->greetRequest.timeline;

 	if (!neon_auth_token)
@@ -1249,7 +1262,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)

 		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
 		if (written > MAXCONNINFO || written < 0)
-			wpg_log(FATAL, "could not append password to the safekeeper connection string");
+			elog(FATAL, "could not append password to the safekeeper connection string");
 	}

 #if PG_MAJORVERSION_NUM < 16
@@ -1266,11 +1279,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 						err)));
 		return false;
 	}
-	wpg_log(LOG,
-			"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
-			"%d",
-			sk->host, sk->port, (uint32) (startpos >> 32),
-			(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
+	elog(LOG,
+		 "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
+		 "%d",
+		 sk->host, sk->port, (uint32) (startpos >> 32),
+		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);

 	options.logical = false;
 	options.startpoint = startpos;
@@ -1468,11 +1481,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
 	char		log_prefix[64];

-	snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
+	snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
 	Assert(!sk->xlogreader);
 	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
 	if (sk->xlogreader == NULL)
-		wpg_log(FATAL, "failed to allocate xlog reader");
+		elog(FATAL, "Failed to allocate xlog reader");
 }

 static NeonWALReadResult
@@ -1536,7 +1549,7 @@ static void
 walprop_pg_init_event_set(WalProposer *wp)
 {
 	if (waitEvents)
-		wpg_log(FATAL, "double-initialization of event set");
+		elog(FATAL, "double-initialization of event set");

 	/* for each sk, we have socket plus potentially socket for neon walreader */
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
@@ -1568,7 +1581,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
 	Assert(sk->nwrEventPos == -1);
 	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
 	sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
-	wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
+	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }

 static void
@@ -1667,8 +1680,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
 {
 	WalProposer *wp = to_remove->wp;

-	wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
-			to_remove->host, to_remove->port, is_sk);
+	elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
+		 to_remove->host, to_remove->port, is_sk);

 	/*
 	 * Shortpath for exiting if have nothing to do. We never call this
@@ -1822,13 +1835,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
 	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
 	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;

-	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-			" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
-			rf->currentClusterSize,
-			LSN_FORMAT_ARGS(rf->last_received_lsn),
-			LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
-			LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
-			rf->replytime);
+	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
+		 " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
+		 rf->currentClusterSize,
+		 LSN_FORMAT_ARGS(rf->last_received_lsn),
+		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
+		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
+		 rf->replytime);
 }

 /*
@@ -1974,7 +1987,7 @@ GetLogRepRestartLSN(WalProposer *wp)
 		{
 			uint64		download_range_mb;

-			wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+			elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));

 			/*
 			 * If we need to download more than a max_slot_wal_keep_size,
@@ -1986,8 +1999,8 @@ GetLogRepRestartLSN(WalProposer *wp)
 			download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
 			if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
 			{
-				wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
-						LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
+				walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
+							LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
 				return InvalidXLogRecPtr;
 			}

--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -158,28 +158,6 @@ files = [
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"

-[[package]]
-name = "anyio"
-version = "4.2.0"
-description = "High level compatibility layer for multiple asynchronous event loop implementations"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
-    {file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
-]
-
-[package.dependencies]
-exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
-idna = ">=2.8"
-sniffio = ">=1.1"
-typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
-
-[package.extras]
-doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
-test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
-trio = ["trio (>=0.23)"]
-
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -311,20 +289,69 @@ files = [
 ]

 [[package]]
-name = "boto3"
-version = "1.34.11"
-description = "The AWS SDK for Python"
+name = "black"
+version = "23.3.0"
+description = "The uncompromising code formatter."
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
-    {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
+    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
+    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
+    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
+    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
+    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
+    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
+    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
+    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
+    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
+    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
+    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
+    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
+    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
 ]

 [package.dependencies]
-botocore = ">=1.34.11,<1.35.0"
+click = ">=8.0.0"
+mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
+pathspec = ">=0.9.0"
+platformdirs = ">=2"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
+
+[package.extras]
+colorama = ["colorama (>=0.4.3)"]
+d = ["aiohttp (>=3.7.4)"]
+jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
+uvloop = ["uvloop (>=0.15.2)"]
+
+[[package]]
+name = "boto3"
+version = "1.26.16"
+description = "The AWS SDK for Python"
+optional = false
+python-versions = ">= 3.7"
+files = [
+    {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
+    {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
+]
+
+[package.dependencies]
+botocore = ">=1.29.16,<1.30.0"
 jmespath = ">=0.7.1,<2.0.0"
-s3transfer = ">=0.10.0,<0.11.0"
+s3transfer = ">=0.6.0,<0.7.0"

 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -675,25 +702,22 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]

 [[package]]
 name = "botocore"
-version = "1.34.11"
+version = "1.29.16"
 description = "Low-level, data-driven core of boto 3."
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">= 3.7"
 files = [
-    {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
-    {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
+    {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
+    {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
 ]

 [package.dependencies]
 jmespath = ">=0.7.1,<2.0.0"
 python-dateutil = ">=2.1,<3.0.0"
-urllib3 = [
-    {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
-    {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
-]
+urllib3 = ">=1.25.4,<1.27"

 [package.extras]
-crt = ["awscrt (==0.19.19)"]
+crt = ["awscrt (==0.14.0)"]

 [[package]]
 name = "botocore-stubs"
@@ -1086,100 +1110,6 @@ files = [
    {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
 ]

-[[package]]
-name = "h11"
-version = "0.14.0"
-description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
-    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
-]
-
-[[package]]
-name = "h2"
-version = "4.1.0"
-description = "HTTP/2 State-Machine based protocol implementation"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
-    {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
-]
-
-[package.dependencies]
-hpack = ">=4.0,<5"
-hyperframe = ">=6.0,<7"
-
-[[package]]
-name = "hpack"
-version = "4.0.0"
-description = "Pure-Python HPACK header compression"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
-    {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
-]
-
-[[package]]
-name = "httpcore"
-version = "1.0.2"
-description = "A minimal low-level HTTP client."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "httpcore-1.0.2-py3-none-any.whl", hash = "sha256:096cc05bca73b8e459a1fc3dcf585148f63e534eae4339559c9b8a8d6399acc7"},
-    {file = "httpcore-1.0.2.tar.gz", hash = "sha256:9fc092e4799b26174648e54b74ed5f683132a464e95643b226e00c2ed2fa6535"},
-]
-
-[package.dependencies]
-certifi = "*"
-h11 = ">=0.13,<0.15"
-
-[package.extras]
-asyncio = ["anyio (>=4.0,<5.0)"]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-trio = ["trio (>=0.22.0,<0.23.0)"]
-
-[[package]]
-name = "httpx"
-version = "0.26.0"
-description = "The next generation HTTP client."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
-    {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
-]
-
-[package.dependencies]
-anyio = "*"
-certifi = "*"
-h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
-httpcore = "==1.*"
-idna = "*"
-sniffio = "*"
-
-[package.extras]
-brotli = ["brotli", "brotlicffi"]
-cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-
-[[package]]
-name = "hyperframe"
-version = "6.0.1"
-description = "HTTP/2 framing layer for Python"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
-    {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
-]
-
 [[package]]
 name = "idna"
 version = "3.3"
@@ -1234,13 +1164,13 @@ files = [

 [[package]]
 name = "jinja2"
-version = "3.1.3"
+version = "3.1.2"
 description = "A very fast and expressive template engine."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
-    {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
+    {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"},
+    {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
 ]

 [package.dependencies]
@@ -1694,6 +1624,17 @@ files = [
    {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]

+[[package]]
+name = "pathspec"
+version = "0.9.0"
+description = "Utility library for gitignore style pattern matching of file paths."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
+files = [
+    {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"},
+    {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"},
+]
+
 [[package]]
 name = "pbr"
 version = "5.9.0"
@@ -1705,6 +1646,21 @@ files = [
    {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"},
 ]

+[[package]]
+name = "platformdirs"
+version = "2.5.2"
+description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
+    {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"},
+]
+
+[package.extras]
+docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
+test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1933,13 +1889,13 @@ files = [

 [[package]]
 name = "pytest"
-version = "7.4.4"
+version = "7.3.1"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
-    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
+    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
+    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
 ]

 [package.dependencies]
@@ -1951,7 +1907,7 @@ pluggy = ">=0.12,<2.0"
 tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}

 [package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]

 [[package]]
 name = "pytest-asyncio"
@@ -2248,46 +2204,46 @@ pyasn1 = ">=0.1.3"

 [[package]]
 name = "ruff"
-version = "0.1.11"
-description = "An extremely fast Python linter and code formatter, written in Rust."
+version = "0.0.269"
+description = "An extremely fast Python linter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
-    {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
-    {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
-    {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
-    {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
+    {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
+    {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
+    {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
+    {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
+    {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
+    {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
 ]

 [[package]]
 name = "s3transfer"
-version = "0.10.0"
+version = "0.6.0"
 description = "An Amazon S3 Transfer Manager"
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">= 3.7"
 files = [
-    {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
-    {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
+    {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
+    {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
 ]

 [package.dependencies]
-botocore = ">=1.33.2,<2.0a.0"
+botocore = ">=1.12.36,<2.0a.0"

 [package.extras]
-crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
+crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]

 [[package]]
 name = "sarif-om"
@@ -2331,17 +2287,6 @@ files = [
    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

-[[package]]
-name = "sniffio"
-version = "1.3.0"
-description = "Sniff out which async library your code is running under"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
-    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
-]
-
 [[package]]
 name = "sshpubkeys"
 version = "3.3.1"
@@ -2505,87 +2450,6 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
 optional = ["python-socks", "wsaccel"]
 test = ["websockets"]

-[[package]]
-name = "websockets"
-version = "12.0"
-description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
-    {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
-    {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
-    {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
-    {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
-    {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
-    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
-    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
-    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
-    {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
-    {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
-    {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
-    {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
-    {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
-    {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
-    {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
-    {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
-    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
-    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
-    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
-    {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
-    {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
-    {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
-    {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
-    {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
-    {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
-    {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
-    {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
-    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
-    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
-    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
-    {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
-    {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
-    {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
-    {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
-    {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
-    {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
-    {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
-    {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
-    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
-    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
-    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
-    {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
-    {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
-    {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
-    {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
-    {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
-    {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
-    {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
-    {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
-    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
-    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
-    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
-    {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
-    {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
-    {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
-    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
-    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
-    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
-    {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
-    {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
-    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
-    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
-    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
-    {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
-    {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
-    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
-    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
-    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
-    {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
-    {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
-    {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
-]
-
 [[package]]
 name = "werkzeug"
 version = "3.0.1"
@@ -2629,6 +2493,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2866,4 +2740,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "f750bd06f1937f0614204e0ffe9a293eb61a0d7d675a80d5849f40a22745b5f9"
+content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
    return cmd


-def ruff_check(fix_inplace: bool) -> str:
-    cmd = "poetry run ruff check"
-    if fix_inplace:
-        cmd += " --fix"
+def black(fix_inplace: bool) -> str:
+    cmd = "poetry run black"
+    if not fix_inplace:
+        cmd += " --diff --check"
    return cmd


-def ruff_format(fix_inplace: bool) -> str:
-    cmd = "poetry run ruff format"
-    if not fix_inplace:
-        cmd += " --diff --check"
+def ruff(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff"
+    if fix_inplace:
+        cmd += " --fix"
    return cmd


@@ -109,16 +109,16 @@ if __name__ == "__main__":
        no_color=args.no_color,
    )
    check(
-        name="ruff check",
+        name="black",
        suffix=".py",
-        cmd=ruff_check(fix_inplace=args.fix_inplace),
+        cmd=black(fix_inplace=args.fix_inplace),
        changed_files=files,
        no_color=args.no_color,
    )
    check(
-        name="ruff format",
+        name="ruff",
        suffix=".py",
-        cmd=ruff_format(fix_inplace=args.fix_inplace),
+        cmd=ruff(fix_inplace=args.fix_inplace),
        changed_files=files,
        no_color=args.no_color,
    )
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 license.workspace = true

 [features]
-default = ["testing"]
+default = []
 testing = []

 [dependencies]
@@ -14,7 +14,6 @@ async-trait.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
-camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
@@ -27,6 +26,7 @@ hex.workspace = true
 hmac.workspace = true
 hostname.workspace = true
 humantime.workspace = true
+hyper-tungstenite.workspace = true
 hyper.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
@@ -35,8 +35,6 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
-parquet.workspace = true
-parquet_derive.workspace = true
 pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -44,7 +42,6 @@ pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
-remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 reqwest = { workspace = true, features = ["json"] }
 reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
@@ -65,13 +62,11 @@ tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
-tokio-tungstenite.workspace = true
 tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
-tungstenite.workspace = true
 url.workspace = true
 utils.workspace = true
 uuid.workspace = true
@@ -80,13 +75,11 @@ x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
 postgres-protocol.workspace = true
-redis.workspace = true
 smol_str.workspace = true

 workspace_hack.workspace = true

 [dev-dependencies]
-camino-tempfile.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,7 +4,7 @@ pub mod backend;
 pub use backend::BackendType;

 mod credentials;
-pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
+pub use credentials::{check_peer_addr_is_in_list, ClientCredentials};

 mod password_hack;
 pub use password_hack::parse_endpoint_param;
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -8,27 +8,26 @@ use tokio_postgres::config::AuthKeys;

 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
-use crate::cache::Cached;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::AuthSecret;
-use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
-use crate::proxy::NeonOptions;
 use crate::scram;
 use crate::stream::Stream;
 use crate::{
-    auth::{self, ComputeUserInfoMaybeEndpoint},
+    auth::{self, ClientCredentials},
    config::AuthenticationConfig,
    console::{
        self,
-        provider::{CachedAllowedIps, CachedNodeInfo},
+        provider::{CachedNodeInfo, ConsoleReqExtra},
        Api,
    },
+    metrics::LatencyTimer,
    stream, url,
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::net::IpAddr;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -39,7 +38,7 @@ use tracing::{error, info, warn};
 /// * When `T` is `()`, it's just a regular auth backend selector
 ///   which we use in [`crate::config::ProxyConfig`].
 ///
-/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
+/// * However, when we substitute `T` with [`ClientCredentials`],
 ///   this helps us provide the credentials only to those auth
 ///   backends which require them for the authentication process.
 pub enum BackendType<'a, T> {
@@ -57,7 +56,7 @@ pub enum BackendType<'a, T> {

 pub trait TestBackend: Send + Sync + 'static {
    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
+    fn get_allowed_ips(&self) -> Result<Arc<Vec<String>>, console::errors::GetAuthInfoError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -128,23 +127,15 @@ pub struct ComputeCredentials<T> {
    pub keys: T,
 }

-#[derive(Debug, Clone)]
 pub struct ComputeUserInfoNoEndpoint {
    pub user: SmolStr,
-    pub options: NeonOptions,
+    pub peer_addr: IpAddr,
+    pub cache_key: SmolStr,
 }

-#[derive(Debug, Clone)]
 pub struct ComputeUserInfo {
    pub endpoint: SmolStr,
-    pub user: SmolStr,
-    pub options: NeonOptions,
-}
-
-impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> SmolStr {
-        self.options.get_cache_key(&self.endpoint)
-    }
+    pub inner: ComputeUserInfoNoEndpoint,
 }

 pub enum ComputeCredentialKeys {
@@ -153,21 +144,19 @@ pub enum ComputeCredentialKeys {
    AuthKeys(AuthKeys),
 }

-impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
+impl TryFrom<ClientCredentials> for ComputeUserInfo {
    // user name
    type Error = ComputeUserInfoNoEndpoint;

-    fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
-        match user_info.project {
-            None => Err(ComputeUserInfoNoEndpoint {
-                user: user_info.user,
-                options: user_info.options,
-            }),
-            Some(endpoint) => Ok(ComputeUserInfo {
-                endpoint,
-                user: user_info.user,
-                options: user_info.options,
-            }),
+    fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
+        let inner = ComputeUserInfoNoEndpoint {
+            user: creds.user,
+            peer_addr: creds.peer_addr,
+            cache_key: creds.cache_key,
+        };
+        match creds.project {
+            None => Err(inner),
+            Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
        }
    }
 }
@@ -177,53 +166,49 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
-    user_info: ComputeUserInfoMaybeEndpoint,
+    extra: &ConsoleReqExtra,
+    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
-    let (info, unauthenticated_password) = match user_info.try_into() {
+    let (info, unauthenticated_password) = match creds.try_into() {
        Err(info) => {
-            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
-                .await?;
-            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
+            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
            (res.info, Some(res.keys))
        }
        Ok(info) => (info, None),
    };

    info!("fetching user's authentication info");
-    let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
+    let allowed_ips = api.get_allowed_ips(extra, &info).await?;

    // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
+    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
-    let maybe_secret = api.get_role_secret(ctx, &info).await?;
+    let cached_secret = api.get_role_secret(extra, &info).await?;

-    let cached_secret = maybe_secret.unwrap_or_else(|| {
+    let secret = cached_secret.clone().unwrap_or_else(|| {
        // If we don't have an authentication secret, we mock one to
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
-        Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock(
-            &info.user,
-            rand::random(),
-        )))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
    });
    match authenticate_with_secret(
-        ctx,
-        cached_secret.value.clone(),
+        secret,
        info,
        client,
        unauthenticated_password,
        allow_cleartext,
        config,
+        latency_timer,
    )
    .await
    {
@@ -239,13 +224,13 @@ async fn auth_quirks(
 }

 async fn authenticate_with_secret(
-    ctx: &mut RequestMonitoring,
    secret: AuthSecret,
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    unauthenticated_password: Option<Vec<u8>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    if let Some(password) = unauthenticated_password {
        let auth_outcome = validate_password_and_exchange(&password, secret)?;
@@ -253,7 +238,7 @@ async fn authenticate_with_secret(
            crate::sasl::Outcome::Success(key) => key,
            crate::sasl::Outcome::Failure(reason) => {
                info!("auth backend failed with an error: {reason}");
-                return Err(auth::AuthError::auth_failed(&*info.user));
+                return Err(auth::AuthError::auth_failed(&*info.inner.user));
            }
        };

@@ -268,29 +253,38 @@ async fn authenticate_with_secret(
    // Perform cleartext auth if we're allowed to do that.
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
-        return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
+        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
+    classic::authenticate(info, client, config, latency_timer, secret).await
 }

 /// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
 /// only if authentication was successfuly.
 async fn auth_and_wake_compute(
-    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
-    user_info: ComputeUserInfoMaybeEndpoint,
+    extra: &ConsoleReqExtra,
+    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials =
-        auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
+    let compute_credentials = auth_quirks(
+        api,
+        extra,
+        creds,
+        client,
+        allow_cleartext,
+        config,
+        latency_timer,
+    )
+    .await?;

    let mut num_retries = 0;
    let mut node = loop {
-        let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
+        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -307,8 +301,6 @@ async fn auth_and_wake_compute(
        tokio::time::sleep(wait_duration).await;
    };

-    ctx.set_project(node.aux.clone());
-
    match compute_credentials.keys {
        #[cfg(feature = "testing")]
        ComputeCredentialKeys::Password(password) => node.config.password(password),
@@ -318,15 +310,15 @@ async fn auth_and_wake_compute(
    Ok((node, compute_credentials.info))
 }

-impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
+impl<'a> BackendType<'a, ClientCredentials> {
    /// Get compute endpoint name from the credentials.
    pub fn get_endpoint(&self) -> Option<SmolStr> {
        use BackendType::*;

        match self {
-            Console(_, user_info) => user_info.project.clone(),
+            Console(_, creds) => creds.project.clone(),
            #[cfg(feature = "testing")]
-            Postgres(_, user_info) => user_info.project.clone(),
+            Postgres(_, creds) => creds.project.clone(),
            Link(_) => Some("link".into()),
            #[cfg(test)]
            Test(_) => Some("test".into()),
@@ -338,9 +330,9 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
        use BackendType::*;

        match self {
-            Console(_, user_info) => &user_info.user,
+            Console(_, creds) => &creds.user,
            #[cfg(feature = "testing")]
-            Postgres(_, user_info) => &user_info.user,
+            Postgres(_, creds) => &creds.user,
            Link(_) => "link",
            #[cfg(test)]
            Test(_) => "test",
@@ -351,37 +343,52 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
        self,
-        ctx: &mut RequestMonitoring,
+        extra: &ConsoleReqExtra,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
+        latency_timer: &mut LatencyTimer,
    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
        use BackendType::*;

        let res = match self {
-            Console(api, user_info) => {
+            Console(api, creds) => {
                info!(
-                    user = &*user_info.user,
-                    project = user_info.project(),
+                    user = &*creds.user,
+                    project = creds.project(),
                    "performing authentication using the console"
                );

-                let (cache_info, user_info) =
-                    auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
-                        .await?;
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
+                    extra,
+                    creds,
+                    client,
+                    allow_cleartext,
+                    config,
+                    latency_timer,
+                )
+                .await?;
                (cache_info, BackendType::Console(api, user_info))
            }
            #[cfg(feature = "testing")]
-            Postgres(api, user_info) => {
+            Postgres(api, creds) => {
                info!(
-                    user = &*user_info.user,
-                    project = user_info.project(),
+                    user = &*creds.user,
+                    project = creds.project(),
                    "performing authentication using a local postgres instance"
                );

-                let (cache_info, user_info) =
-                    auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
-                        .await?;
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
+                    extra,
+                    creds,
+                    client,
+                    allow_cleartext,
+                    config,
+                    latency_timer,
+                )
+                .await?;
                (cache_info, BackendType::Postgres(api, user_info))
            }
            // NOTE: this auth backend doesn't use client credentials.
@@ -409,16 +416,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 impl BackendType<'_, ComputeUserInfo> {
    pub async fn get_allowed_ips(
        &self,
-        ctx: &mut RequestMonitoring,
-    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+        extra: &ConsoleReqExtra,
+    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        use BackendType::*;
        match self {
-            Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
+            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
            #[cfg(feature = "testing")]
-            Postgres(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
-            Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
+            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
+            Link(_) => Ok(Arc::new(vec![])),
            #[cfg(test)]
-            Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
+            Test(x) => x.get_allowed_ips(),
        }
    }

@@ -426,14 +433,14 @@ impl BackendType<'_, ComputeUserInfo> {
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
    pub async fn wake_compute(
        &self,
-        ctx: &mut RequestMonitoring,
+        extra: &ConsoleReqExtra,
    ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
        use BackendType::*;

        match self {
-            Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
+            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            #[cfg(feature = "testing")]
-            Postgres(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
+            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
            #[cfg(test)]
            Test(x) => x.wake_compute().map(Some),
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -54,7 +54,7 @@ pub(super) async fn authenticate(
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
-                    return Err(auth::AuthError::auth_failed(&*creds.user));
+                    return Err(auth::AuthError::auth_failed(&*creds.inner.user));
                }
            };

--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -36,7 +36,7 @@ pub async fn authenticate_cleartext(
        sasl::Outcome::Success(key) => key,
        sasl::Outcome::Failure(reason) => {
            info!("auth backend failed with an error: {reason}");
-            return Err(auth::AuthError::auth_failed(&*info.user));
+            return Err(auth::AuthError::auth_failed(&*info.inner.user));
        }
    };

@@ -67,8 +67,7 @@ pub async fn password_hack_no_authentication(
    // Report tentative success; compute node will check the password anyway.
    Ok(ComputeCredentials {
        info: ComputeUserInfo {
-            user: info.user,
-            options: info.options,
+            inner: info,
            endpoint: payload.endpoint,
        },
        keys: payload.password,
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,8 +1,8 @@
 //! User credentials used in authentication.

 use crate::{
-    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
+    auth::password_hack::parse_endpoint_param, error::UserFacingError,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -12,7 +12,7 @@ use thiserror::Error;
 use tracing::{info, warn};

 #[derive(Debug, Error, PartialEq, Eq, Clone)]
-pub enum ComputeUserInfoParseError {
+pub enum ClientCredsParseError {
    #[error("Parameter '{0}' is missing in startup packet.")]
    MissingKey(&'static str),

@@ -33,58 +33,39 @@ pub enum ComputeUserInfoParseError {
    MalformedProjectName(SmolStr),
 }

-impl UserFacingError for ComputeUserInfoParseError {}
+impl UserFacingError for ClientCredsParseError {}

 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ComputeUserInfoMaybeEndpoint {
+pub struct ClientCredentials {
    pub user: SmolStr,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<SmolStr>,

-    pub options: NeonOptions,
+    pub cache_key: SmolStr,
+    pub peer_addr: IpAddr,
 }

-impl ComputeUserInfoMaybeEndpoint {
+impl ClientCredentials {
    #[inline]
    pub fn project(&self) -> Option<&str> {
        self.project.as_deref()
    }
 }

-pub fn endpoint_sni<'a>(
-    sni: &'a str,
-    common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
-    let Some((subdomain, common_name)) = sni.split_once('.') else {
-        return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
-    };
-    if !common_names.contains(common_name) {
-        return Err(ComputeUserInfoParseError::UnknownCommonName {
-            cn: common_name.into(),
-        });
-    }
-    Ok(subdomain)
-}
-
-impl ComputeUserInfoMaybeEndpoint {
+impl ClientCredentials {
    pub fn parse(
-        ctx: &mut RequestMonitoring,
        params: &StartupMessageParams,
        sni: Option<&str>,
-        common_names: Option<&HashSet<String>>,
-    ) -> Result<Self, ComputeUserInfoParseError> {
-        use ComputeUserInfoParseError::*;
+        common_names: Option<HashSet<String>>,
+        peer_addr: IpAddr,
+    ) -> Result<Self, ClientCredsParseError> {
+        use ClientCredsParseError::*;

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user: SmolStr = get_param("user")?.into();
-
-        // record the values if we have them
-        ctx.set_application(params.get("application_name").map(SmolStr::from));
-        ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(SmolStr::from));
+        let user = get_param("user")?.into();

        // Project name might be passed via PG's command-line options.
        let project_option = params
@@ -102,7 +83,21 @@ impl ComputeUserInfoMaybeEndpoint {

        let project_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
-                Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
+                let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
+
+                let project = common_name_from_sni
+                    .and_then(|domain| {
+                        if cn.contains(domain) {
+                            subdomain_from_sni(sni_str, domain)
+                        } else {
+                            None
+                        }
+                    })
+                    .ok_or_else(|| UnknownCommonName {
+                        cn: common_name_from_sni.unwrap_or("").into(),
+                    })?;
+
+                Some(project)
            } else {
                None
            }
@@ -141,17 +136,23 @@ impl ComputeUserInfoMaybeEndpoint {
            info!("Connection with password hack");
        }

-        let options = NeonOptions::parse_params(params);
+        let cache_key = format!(
+            "{}{}",
+            project.as_deref().unwrap_or(""),
+            neon_options_str(params)
+        )
+        .into();

        Ok(Self {
            user,
            project,
-            options,
+            cache_key,
+            peer_addr,
        })
    }
 }

-pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
+pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<String>) -> bool {
    if ip_list.is_empty() {
        return true;
    }
@@ -203,19 +204,25 @@ fn project_name_valid(name: &str) -> bool {
    name.chars().all(|c| c.is_alphanumeric() || c == '-')
 }

+fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
+    sni.strip_suffix(common_name)?
+        .strip_suffix('.')
+        .map(SmolStr::from)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
-    use ComputeUserInfoParseError::*;
+    use ClientCredsParseError::*;

    #[test]
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project, None);

        Ok(())
    }
@@ -227,10 +234,10 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project, None);

        Ok(())
    }
@@ -242,12 +249,11 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let mut ctx = RequestMonitoring::test();
-        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("foo"));
-        assert_eq!(user_info.options.get_cache_key("foo"), "foo");
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("foo"));
+        assert_eq!(creds.cache_key, "foo");

        Ok(())
    }
@@ -259,10 +265,10 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));

        Ok(())
    }
@@ -274,10 +280,10 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));

        Ok(())
    }
@@ -292,10 +298,10 @@ mod tests {
            ),
        ]);

-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());

        Ok(())
    }
@@ -307,10 +313,10 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let mut ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
-        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());

        Ok(())
    }
@@ -322,11 +328,10 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let mut ctx = RequestMonitoring::test();
-        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("baz"));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("baz"));

        Ok(())
    }
@@ -337,17 +342,15 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let mut ctx = RequestMonitoring::test();
-        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        assert_eq!(creds.project.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let mut ctx = RequestMonitoring::test();
-        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        assert_eq!(creds.project.as_deref(), Some("p1"));

        Ok(())
    }
@@ -360,10 +363,9 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let mut ctx = RequestMonitoring::test();
-        let err =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
-                .expect_err("should fail");
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+            .expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -380,10 +382,9 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let mut ctx = RequestMonitoring::test();
-        let err =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
-                .expect_err("should fail");
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+            .expect_err("should fail");
        match err {
            UnknownCommonName { cn } => {
                assert_eq!(cn, "localhost");
@@ -401,14 +402,10 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let mut ctx = RequestMonitoring::test();
-        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("project"));
-        assert_eq!(
-            user_info.options.get_cache_key("project"),
-            "project endpoint_type:read_write lsn:0/2"
-        );
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        assert_eq!(creds.project.as_deref(), Some("project"));
+        assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");

        Ok(())
    }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -8,7 +8,6 @@ use std::{net::SocketAddr, sync::Arc};
 use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
-use proxy::context::RequestMonitoring;
 use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;

@@ -171,16 +170,7 @@ async fn task_main(
                    .context("failed to set socket option")?;

                info!(%peer_addr, "serving");
-                let mut ctx =
-                    RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
-                handle_client(
-                    &mut ctx,
-                    dest_suffix,
-                    tls_config,
-                    tls_server_end_point,
-                    socket,
-                )
-                .await
+                handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
            }
            .unwrap_or_else(|e| {
                // Acknowledge that the task has finished with an error.
@@ -246,7 +236,6 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }

 async fn handle_client(
-    ctx: &mut RequestMonitoring,
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
@@ -272,5 +261,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
+    proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -3,14 +3,14 @@ use proxy::auth;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
-use proxy::config::ProjectInfoCacheOptions;
 use proxy::console;
-use proxy::context::parquet::ParquetUploadArgs;
+use proxy::console::provider::AllowedIpsCache;
+use proxy::console::provider::NodeInfoCache;
+use proxy::console::provider::RoleSecretCache;
 use proxy::http;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
-use proxy::redis::notifications;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;

@@ -44,9 +44,6 @@ enum AuthBackend {
 #[derive(Parser)]
 #[command(version = GIT_VERSION, about)]
 struct ProxyCliArgs {
-    /// Name of the region this proxy is deployed in
-    #[clap(long, default_value_t = String::new())]
-    region: String,
    /// listen for incoming client connections on ip:port
    #[clap(short, long, default_value = "127.0.0.1:4432")]
    proxy: String,
@@ -136,15 +133,6 @@ struct ProxyCliArgs {
    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_ip_check_for_http: bool,
-    /// redis url for notifications.
-    #[clap(long)]
-    redis_notifications: Option<String>,
-    /// cache for `project_info` (use `size=0` to disable)
-    #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
-    project_info_cache: String,
-
-    #[clap(flatten)]
-    parquet_upload: ParquetUploadArgs,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -233,11 +221,6 @@ async fn main() -> anyhow::Result<()> {
        ));
    }

-    client_tasks.spawn(proxy::context::parquet::worker(
-        cancellation_token.clone(),
-        args.parquet_upload,
-    ));
-
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
@@ -248,15 +231,6 @@ async fn main() -> anyhow::Result<()> {
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

-    if let auth::BackendType::Console(api, _) = &config.auth_backend {
-        let cache = api.caches.project_info.clone();
-        if let Some(url) = args.redis_notifications {
-            info!("Starting redis notifications listener ({url})");
-            maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
-        }
-        maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-    }
-
    let maintenance = loop {
        // get one complete task
        match futures::future::select(
@@ -322,17 +296,32 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let auth_backend = match &args.auth_backend {
        AuthBackend::Console => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
-            let project_info_cache_config: ProjectInfoCacheOptions =
-                args.project_info_cache.parse()?;
+            let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;
+            let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
-            info!(
-                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
-            );
-            let caches = Box::leak(Box::new(console::caches::ApiCaches::new(
-                wake_compute_cache_config,
-                project_info_cache_config,
-            )));
+            info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
+            info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}");
+            let caches = Box::leak(Box::new(console::caches::ApiCaches {
+                node_info: NodeInfoCache::new(
+                    "node_info_cache",
+                    wake_compute_cache_config.size,
+                    wake_compute_cache_config.ttl,
+                    true,
+                ),
+                allowed_ips: AllowedIpsCache::new(
+                    "allowed_ips_cache",
+                    allowed_ips_cache_config.size,
+                    allowed_ips_cache_config.ttl,
+                    false,
+                ),
+                role_secret: RoleSecretCache::new(
+                    "role_secret_cache",
+                    role_secret_cache_config.size,
+                    role_secret_cache_config.ttl,
+                    false,
+                ),
+            }));

            let config::WakeComputeLockOptions {
                shards,
@@ -391,8 +380,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        require_client_ip: args.require_client_ip,
        disable_ip_check_for_http: args.disable_ip_check_for_http,
        endpoint_rps_limit,
-        // TODO: add this argument
-        region: args.region.clone(),
    }));

    Ok(config)
--- a/proxy/src/cache.rs
+++ b/proxy/src/cache.rs
@@ -1,6 +1,311 @@
-pub mod common;
-pub mod project_info;
-mod timed_lru;
+use std::{
+    borrow::Borrow,
+    hash::Hash,
+    ops::{Deref, DerefMut},
+    time::{Duration, Instant},
+};
+use tracing::debug;
+
+// This seems to make more sense than `lru` or `cached`:
+//
+// * `near/nearcore` ditched `cached` in favor of `lru`
+//   (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed).
+//
+// * `lru` methods use an obscure `KeyRef` type in their constraints (which is deliberately excluded from docs).
+//   This severely hinders its usage both in terms of creating wrappers and supported key types.
+//
+// On the other hand, `hashlink` has good download stats and appears to be maintained.
+use hashlink::{linked_hash_map::RawEntryMut, LruCache};
+
+/// A generic trait which exposes types of cache's key and value,
+/// as well as the notion of cache entry invalidation.
+/// This is useful for [`timed_lru::Cached`].
+pub trait Cache {
+    /// Entry's key.
+    type Key;
+
+    /// Entry's value.
+    type Value;
+
+    /// Used for entry invalidation.
+    type LookupInfo<Key>;
+
+    /// Invalidate an entry using a lookup info.
+    /// We don't have an empty default impl because it's error-prone.
+    fn invalidate(&self, _: &Self::LookupInfo<Self::Key>);
+}
+
+impl<C: Cache> Cache for &C {
+    type Key = C::Key;
+    type Value = C::Value;
+    type LookupInfo<Key> = C::LookupInfo<Key>;
+
+    fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
+        C::invalidate(self, info)
+    }
+}

-pub use common::{Cache, Cached};
 pub use timed_lru::TimedLru;
+pub mod timed_lru {
+    use super::*;
+
+    /// An implementation of timed LRU cache with fixed capacity.
+    /// Key properties:
+    ///
+    /// * Whenever a new entry is inserted, the least recently accessed one is evicted.
+    ///   The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
+    ///
+    /// * When an entry is about to be retrieved, we check its expiration timestamp.
+    ///   If the entry has expired, we remove it from the cache; otherwise we move it to the
+    ///   back of the LRU list and, if `update_ttl_on_retrieval` is `true`, also bump its
+    ///   expiration timestamp (e.g. +5mins) to prolong its existence.
+    ///
+    /// * There's an API for immediate invalidation (removal) of a cache entry;
+    ///   It's useful in case we know for sure that the entry is no longer correct.
+    ///   See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
+    ///
+    /// * Expired entries are kept in the cache until they are evicted by the LRU policy
+    ///   or removed by a lookup of the same key (which then reports a miss).
+    ///   There is no background job to reap the expired records.
+    ///
+    /// * It's possible for an entry that has not yet expired to be evicted
+    ///   before expired items. That's a bit wasteful, but probably fine in practice.
+    pub struct TimedLru<K, V> {
+        /// Cache's name for tracing.
+        name: &'static str,
+
+        /// The underlying cache implementation.
+        cache: parking_lot::Mutex<LruCache<K, Entry<V>>>,
+
+        /// Default time-to-live of a single entry.
+        ttl: Duration,
+
+        update_ttl_on_retrieval: bool,
+    }
+
+    impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
+        type Key = K;
+        type Value = V;
+        type LookupInfo<Key> = LookupInfo<Key>;
+
+        fn invalidate(&self, info: &Self::LookupInfo<K>) {
+            self.invalidate_raw(info)
+        }
+    }
+
+    struct Entry<T> {
+        created_at: Instant,
+        expires_at: Instant,
+        value: T,
+    }
+
+    impl<K: Hash + Eq, V> TimedLru<K, V> {
+        /// Construct a new LRU cache with timed entries.
+        pub fn new(
+            name: &'static str,
+            capacity: usize,
+            ttl: Duration,
+            update_ttl_on_retrieval: bool,
+        ) -> Self {
+            Self {
+                name,
+                cache: LruCache::new(capacity).into(),
+                ttl,
+                update_ttl_on_retrieval,
+            }
+        }
+
+        /// Drop an entry from the cache if it's outdated.
+        #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
+        fn invalidate_raw(&self, info: &LookupInfo<K>) {
+            let now = Instant::now();
+
+            // Do costly things before taking the lock.
+            let mut cache = self.cache.lock();
+            let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
+                RawEntryMut::Vacant(_) => return,
+                RawEntryMut::Occupied(x) => x,
+            };
+
+            // Remove the entry if it was created prior to lookup timestamp.
+            let entry = raw_entry.get();
+            let (created_at, expires_at) = (entry.created_at, entry.expires_at);
+            let should_remove = created_at <= info.created_at || expires_at <= now;
+
+            if should_remove {
+                raw_entry.remove();
+            }
+
+            drop(cache); // drop lock before logging
+            debug!(
+                created_at = format_args!("{created_at:?}"),
+                expires_at = format_args!("{expires_at:?}"),
+                entry_removed = should_remove,
+                "processed a cache entry invalidation event"
+            );
+        }
+
+        /// Try retrieving an entry by its key, then execute `extract` if it exists.
+        #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
+        fn get_raw<Q, R>(&self, key: &Q, extract: impl FnOnce(&K, &Entry<V>) -> R) -> Option<R>
+        where
+            K: Borrow<Q>,
+            Q: Hash + Eq + ?Sized,
+        {
+            let now = Instant::now();
+            let deadline = now.checked_add(self.ttl).expect("time overflow");
+
+            // Do costly things before taking the lock.
+            let mut cache = self.cache.lock();
+            let mut raw_entry = match cache.raw_entry_mut().from_key(key) {
+                RawEntryMut::Vacant(_) => return None,
+                RawEntryMut::Occupied(x) => x,
+            };
+
+            // Immediately drop the entry if it has expired.
+            let entry = raw_entry.get();
+            if entry.expires_at <= now {
+                raw_entry.remove();
+                return None;
+            }
+
+            let value = extract(raw_entry.key(), entry);
+            let (created_at, expires_at) = (entry.created_at, entry.expires_at);
+
+            // Update the deadline and the entry's position in the LRU list.
+            if self.update_ttl_on_retrieval {
+                raw_entry.get_mut().expires_at = deadline;
+            }
+            raw_entry.to_back();
+
+            drop(cache); // drop lock before logging
+            debug!(
+                created_at = format_args!("{created_at:?}"),
+                old_expires_at = format_args!("{expires_at:?}"),
+                new_expires_at = format_args!("{deadline:?}"),
+                "accessed a cache entry"
+            );
+
+            Some(value)
+        }
+
+        /// Insert an entry to the cache. Returns the new entry's creation timestamp
+        /// and, if an entry with the same key already existed, its previous value.
+        #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
+        fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
+            let created_at = Instant::now();
+            let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
+
+            let entry = Entry {
+                created_at,
+                expires_at,
+                value,
+            };
+
+            // Do costly things before taking the lock.
+            let old = self
+                .cache
+                .lock()
+                .insert(key, entry)
+                .map(|entry| entry.value);
+
+            debug!(
+                created_at = format_args!("{created_at:?}"),
+                expires_at = format_args!("{expires_at:?}"),
+                replaced = old.is_some(),
+                "created a cache entry"
+            );
+
+            (created_at, old)
+        }
+    }
+
+    impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
+        pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
+            let (created_at, old) = self.insert_raw(key.clone(), value.clone());
+
+            let cached = Cached {
+                token: Some((self, LookupInfo { created_at, key })),
+                value,
+            };
+
+            (old, cached)
+        }
+    }
+
+    impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
+        /// Retrieve a cached entry in a convenient wrapper.
+        pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
+        where
+            K: Borrow<Q> + Clone,
+            Q: Hash + Eq + ?Sized,
+        {
+            self.get_raw(key, |key, entry| {
+                let info = LookupInfo {
+                    created_at: entry.created_at,
+                    key: key.clone(),
+                };
+
+                Cached {
+                    token: Some((self, info)),
+                    value: entry.value.clone(),
+                }
+            })
+        }
+    }
+
+    /// Lookup information for key invalidation.
+    pub struct LookupInfo<K> {
+        /// Time of creation of a cache [`Entry`].
+        /// We use this during invalidation lookups to prevent eviction of a newer
+        /// entry sharing the same key (it might've been inserted by a different
+        /// task after we got the entry we're trying to invalidate now).
+        created_at: Instant,
+
+        /// Search by this key.
+        key: K,
+    }
+
+    /// Wrapper for convenient entry invalidation.
+    pub struct Cached<C: Cache> {
+        /// Cache + lookup info.
+        token: Option<(C, C::LookupInfo<C::Key>)>,
+
+        /// The value itself.
+        value: C::Value,
+    }
+
+    impl<C: Cache> Cached<C> {
+        /// Place any entry into this wrapper; invalidation will be a no-op.
+        pub fn new_uncached(value: C::Value) -> Self {
+            Self { token: None, value }
+        }
+
+        /// Drop this entry from a cache if it's still there.
+        pub fn invalidate(self) -> C::Value {
+            if let Some((cache, info)) = &self.token {
+                cache.invalidate(info);
+            }
+            self.value
+        }
+
+        /// Tell if this entry is actually cached.
+        pub fn cached(&self) -> bool {
+            self.token.is_some()
+        }
+    }
+
+    impl<C: Cache> Deref for Cached<C> {
+        type Target = C::Value;
+
+        fn deref(&self) -> &Self::Target {
+            &self.value
+        }
+    }
+
+    impl<C: Cache> DerefMut for Cached<C> {
+        fn deref_mut(&mut self) -> &mut Self::Target {
+            &mut self.value
+        }
+    }
+}
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -1,72 +0,0 @@
-use std::ops::{Deref, DerefMut};
-
-/// A generic trait which exposes types of cache's key and value,
-/// as well as the notion of cache entry invalidation.
-/// This is useful for [`Cached`].
-pub trait Cache {
-    /// Entry's key.
-    type Key;
-
-    /// Entry's value.
-    type Value;
-
-    /// Used for entry invalidation.
-    type LookupInfo<Key>;
-
-    /// Invalidate an entry using a lookup info.
-    /// We don't have an empty default impl because it's error-prone.
-    fn invalidate(&self, _: &Self::LookupInfo<Self::Key>);
-}
-
-impl<C: Cache> Cache for &C {
-    type Key = C::Key;
-    type Value = C::Value;
-    type LookupInfo<Key> = C::LookupInfo<Key>;
-
-    fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
-        C::invalidate(self, info)
-    }
-}
-
-/// Wrapper for convenient entry invalidation.
-pub struct Cached<C: Cache, V = <C as Cache>::Value> {
-    /// Cache + lookup info.
-    pub token: Option<(C, C::LookupInfo<C::Key>)>,
-
-    /// The value itself.
-    pub value: V,
-}
-
-impl<C: Cache, V> Cached<C, V> {
-    /// Place any entry into this wrapper; invalidation will be a no-op.
-    pub fn new_uncached(value: V) -> Self {
-        Self { token: None, value }
-    }
-
-    /// Drop this entry from a cache if it's still there.
-    pub fn invalidate(self) -> V {
-        if let Some((cache, info)) = &self.token {
-            cache.invalidate(info);
-        }
-        self.value
-    }
-
-    /// Tell if this entry is actually cached.
-    pub fn cached(&self) -> bool {
-        self.token.is_some()
-    }
-}
-
-impl<C: Cache, V> Deref for Cached<C, V> {
-    type Target = V;
-
-    fn deref(&self) -> &Self::Target {
-        &self.value
-    }
-}
-
-impl<C: Cache, V> DerefMut for Cached<C, V> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.value
-    }
-}
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -1,496 +0,0 @@
-use std::{
-    collections::HashSet,
-    convert::Infallible,
-    sync::{atomic::AtomicU64, Arc},
-    time::Duration,
-};
-
-use dashmap::DashMap;
-use rand::{thread_rng, Rng};
-use smol_str::SmolStr;
-use tokio::time::Instant;
-use tracing::{debug, info};
-
-use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
-
-use super::{Cache, Cached};
-
-pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
-    fn enable_ttl(&self);
-    fn disable_ttl(&self);
-}
-
-struct Entry<T> {
-    created_at: Instant,
-    value: T,
-}
-
-impl<T> Entry<T> {
-    pub fn new(value: T) -> Self {
-        Self {
-            created_at: Instant::now(),
-            value,
-        }
-    }
-}
-
-impl<T> From<T> for Entry<T> {
-    fn from(value: T) -> Self {
-        Self::new(value)
-    }
-}
-
-#[derive(Default)]
-struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<AuthSecret>>,
-    allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
-}
-
-impl EndpointInfo {
-    fn check_ignore_cache(ignore_cache_since: Option<Instant>, created_at: Instant) -> bool {
-        match ignore_cache_since {
-            None => false,
-            Some(t) => t < created_at,
-        }
-    }
-    pub fn get_role_secret(
-        &self,
-        role_name: &SmolStr,
-        valid_since: Instant,
-        ignore_cache_since: Option<Instant>,
-    ) -> Option<(AuthSecret, bool)> {
-        if let Some(secret) = self.secret.get(role_name) {
-            if valid_since < secret.created_at {
-                return Some((
-                    secret.value.clone(),
-                    Self::check_ignore_cache(ignore_cache_since, secret.created_at),
-                ));
-            }
-        }
-        None
-    }
-
-    pub fn get_allowed_ips(
-        &self,
-        valid_since: Instant,
-        ignore_cache_since: Option<Instant>,
-    ) -> Option<(Arc<Vec<SmolStr>>, bool)> {
-        if let Some(allowed_ips) = &self.allowed_ips {
-            if valid_since < allowed_ips.created_at {
-                return Some((
-                    allowed_ips.value.clone(),
-                    Self::check_ignore_cache(ignore_cache_since, allowed_ips.created_at),
-                ));
-            }
-        }
-        None
-    }
-    pub fn invalidate_allowed_ips(&mut self) {
-        self.allowed_ips = None;
-    }
-    pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
-        self.secret.remove(role_name);
-    }
-}
-
-/// Cache for project info.
-/// This is used to cache auth data for endpoints.
-/// Invalidation is done by console notifications or by TTL (if console notifications are disabled).
-///
-/// We also store endpoint-to-project mapping in the cache, to be able to access per-endpoint data.
-/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
-/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
-pub struct ProjectInfoCacheImpl {
-    cache: DashMap<SmolStr, EndpointInfo>,
-
-    project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
-    config: ProjectInfoCacheOptions,
-
-    start_time: Instant,
-    ttl_disabled_since_us: AtomicU64,
-}
-
-impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
-        info!("invalidating allowed ips for project `{}`", project_id);
-        let endpoints = self
-            .project2ep
-            .get(project_id)
-            .map(|kv| kv.value().clone())
-            .unwrap_or_default();
-        for endpoint_id in endpoints {
-            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_allowed_ips();
-            }
-        }
-    }
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
-        info!(
-            "invalidating role secret for project_id `{}` and role_name `{}`",
-            project_id, role_name
-        );
-        let endpoints = self
-            .project2ep
-            .get(project_id)
-            .map(|kv| kv.value().clone())
-            .unwrap_or_default();
-        for endpoint_id in endpoints {
-            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_role_secret(role_name);
-            }
-        }
-    }
-    fn enable_ttl(&self) {
-        self.ttl_disabled_since_us
-            .store(u64::MAX, std::sync::atomic::Ordering::Relaxed);
-    }
-
-    fn disable_ttl(&self) {
-        let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
-        self.ttl_disabled_since_us
-            .store(new_ttl, std::sync::atomic::Ordering::Relaxed);
-    }
-}
-
-impl ProjectInfoCacheImpl {
-    pub fn new(config: ProjectInfoCacheOptions) -> Self {
-        Self {
-            cache: DashMap::new(),
-            project2ep: DashMap::new(),
-            config,
-            ttl_disabled_since_us: AtomicU64::new(u64::MAX),
-            start_time: Instant::now(),
-        }
-    }
-
-    pub fn get_role_secret(
-        &self,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-    ) -> Option<Cached<&Self, AuthSecret>> {
-        let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(endpoint_id)?;
-        let (value, ignore_cache) =
-            endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
-        if !ignore_cache {
-            let cached = Cached {
-                token: Some((
-                    self,
-                    CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()),
-                )),
-                value,
-            };
-            return Some(cached);
-        }
-        Some(Cached::new_uncached(value))
-    }
-    pub fn get_allowed_ips(
-        &self,
-        endpoint_id: &SmolStr,
-    ) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
-        let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(endpoint_id)?;
-        let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
-        let (value, ignore_cache) = value?;
-        if !ignore_cache {
-            let cached = Cached {
-                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))),
-                value,
-            };
-            return Some(cached);
-        }
-        Some(Cached::new_uncached(value))
-    }
-    pub fn insert_role_secret(
-        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-        secret: AuthSecret,
-    ) {
-        if self.cache.len() >= self.config.size {
-            // If there are too many entries, wait until the next gc cycle.
-            return;
-        }
-        self.inser_project2endpoint(project_id, endpoint_id);
-        let mut entry = self.cache.entry(endpoint_id.clone()).or_default();
-        if entry.secret.len() < self.config.max_roles {
-            entry.secret.insert(role_name.clone(), secret.into());
-        }
-    }
-    pub fn insert_allowed_ips(
-        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        allowed_ips: Arc<Vec<SmolStr>>,
-    ) {
-        if self.cache.len() >= self.config.size {
-            // If there are too many entries, wait until the next gc cycle.
-            return;
-        }
-        self.inser_project2endpoint(project_id, endpoint_id);
-        self.cache
-            .entry(endpoint_id.clone())
-            .or_default()
-            .allowed_ips = Some(allowed_ips.into());
-    }
-    fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
-        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
-            endpoints.insert(endpoint_id.clone());
-        } else {
-            self.project2ep
-                .insert(project_id.clone(), HashSet::from([endpoint_id.clone()]));
-        }
-    }
-    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
-        let mut valid_since = Instant::now() - self.config.ttl;
-        // Only ignore cache if ttl is disabled.
-        let ttl_disabled_since_us = self
-            .ttl_disabled_since_us
-            .load(std::sync::atomic::Ordering::Relaxed);
-        let ignore_cache_since = if ttl_disabled_since_us != u64::MAX {
-            let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
-            // We are fine if entry is not older than ttl or was added before we are getting notifications.
-            valid_since = valid_since.min(ignore_cache_since);
-            Some(ignore_cache_since)
-        } else {
-            None
-        };
-        (valid_since, ignore_cache_since)
-    }
-
-    pub async fn gc_worker(&self) -> anyhow::Result<Infallible> {
-        let mut interval =
-            tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-            if self.cache.len() <= self.config.size {
-                // If there are not too many entries, wait until the next gc cycle.
-                continue;
-            }
-            self.gc();
-        }
-    }
-
-    fn gc(&self) {
-        let shard = thread_rng().gen_range(0..self.project2ep.shards().len());
-        debug!(shard, "project_info_cache: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut removed = 0;
-        let shard = self.project2ep.shards()[shard].write();
-        for (_, endpoints) in shard.iter() {
-            for endpoint in endpoints.get().iter() {
-                self.cache.remove(endpoint);
-                removed += 1;
-            }
-        }
-        // We can drop this shard only after making sure that all endpoints are removed.
-        drop(shard);
-        info!("project_info_cache: removed {removed} endpoints");
-    }
-}
-
-/// Lookup info for project info cache.
-/// This is used to invalidate cache entries.
-pub struct CachedLookupInfo {
-    /// Search by this key.
-    endpoint_id: SmolStr,
-    lookup_type: LookupType,
-}
-
-impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
-        Self {
-            endpoint_id,
-            lookup_type: LookupType::RoleSecret(role_name),
-        }
-    }
-    pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
-        Self {
-            endpoint_id,
-            lookup_type: LookupType::AllowedIps,
-        }
-    }
-}
-
-enum LookupType {
-    RoleSecret(SmolStr),
-    AllowedIps,
-}
-
-impl Cache for ProjectInfoCacheImpl {
-    type Key = SmolStr;
-    // Value is not really used here, but we need to specify it.
-    type Value = SmolStr;
-
-    type LookupInfo<Key> = CachedLookupInfo;
-
-    fn invalidate(&self, key: &Self::LookupInfo<SmolStr>) {
-        match &key.lookup_type {
-            LookupType::RoleSecret(role_name) => {
-                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
-                    endpoint_info.invalidate_role_secret(role_name);
-                }
-            }
-            LookupType::AllowedIps => {
-                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
-                    endpoint_info.invalidate_allowed_ips();
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::{console::AuthSecret, scram::ServerSecret};
-    use smol_str::SmolStr;
-    use std::{sync::Arc, time::Duration};
-
-    #[tokio::test]
-    async fn test_project_info_cache_settings() {
-        tokio::time::pause();
-        let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
-            size: 2,
-            max_roles: 2,
-            ttl: Duration::from_secs(1),
-            gc_interval: Duration::from_secs(600),
-        });
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
-
-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert!(cached.cached());
-        assert_eq!(cached.value, secret1);
-        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert!(cached.cached());
-        assert_eq!(cached.value, secret2);
-
-        // Shouldn't add more than 2 roles.
-        let user3: SmolStr = "user3".into();
-        let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32]));
-        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
-        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
-
-        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
-        assert!(cached.cached());
-        assert_eq!(cached.value, allowed_ips);
-
-        tokio::time::advance(Duration::from_secs(2)).await;
-        let cached = cache.get_role_secret(&endpoint_id, &user1);
-        assert!(cached.is_none());
-        let cached = cache.get_role_secret(&endpoint_id, &user2);
-        assert!(cached.is_none());
-        let cached = cache.get_allowed_ips(&endpoint_id);
-        assert!(cached.is_none());
-    }
-
-    #[tokio::test]
-    async fn test_project_info_cache_invalidations() {
-        tokio::time::pause();
-        let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
-            size: 2,
-            max_roles: 2,
-            ttl: Duration::from_secs(1),
-            gc_interval: Duration::from_secs(600),
-        }));
-        cache.clone().disable_ttl();
-        tokio::time::advance(Duration::from_secs(2)).await;
-
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
-
-        tokio::time::advance(Duration::from_secs(2)).await;
-        // Nothing should be invalidated.
-
-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        // TTL is disabled, so it should be impossible to invalidate this value.
-        assert!(!cached.cached());
-        assert_eq!(cached.value, secret1);
-
-        cached.invalidate(); // Shouldn't do anything.
-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert_eq!(cached.value, secret1);
-
-        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert!(!cached.cached());
-        assert_eq!(cached.value, secret2);
-
-        // The only way to invalidate this value is to invalidate via the api.
-        cache.invalidate_role_secret_for_project(&project_id, &user2);
-        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
-
-        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
-        assert!(!cached.cached());
-        assert_eq!(cached.value, allowed_ips);
-    }
-
-    #[tokio::test]
-    async fn test_disable_ttl_invalidate_added_before() {
-        tokio::time::pause();
-        let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
-            size: 2,
-            max_roles: 2,
-            ttl: Duration::from_secs(1),
-            gc_interval: Duration::from_secs(600),
-        }));
-
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
-        cache.clone().disable_ttl();
-        tokio::time::advance(Duration::from_millis(100)).await;
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
-
-        // Added before ttl was disabled + ttl should be still cached.
-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert!(cached.cached());
-        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert!(cached.cached());
-
-        tokio::time::advance(Duration::from_secs(1)).await;
-        // Added before ttl was disabled + ttl should expire.
-        assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
-        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
-
-        // Added after ttl was disabled + ttl should not be cached.
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
-        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
-        assert!(!cached.cached());
-
-        tokio::time::advance(Duration::from_secs(1)).await;
-        // Added before ttl was disabled + ttl still should expire.
-        assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
-        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
-        // Shouldn't be invalidated.
-
-        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
-        assert!(!cached.cached());
-        assert_eq!(cached.value, allowed_ips);
-    }
-}
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -1,258 +0,0 @@
-use std::{
-    borrow::Borrow,
-    hash::Hash,
-    time::{Duration, Instant},
-};
-use tracing::debug;
-
-// This seems to make more sense than `lru` or `cached`:
-//
-// * `near/nearcore` ditched `cached` in favor of `lru`
-//   (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed).
-//
-// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs).
-//   This severely hinders its usage both in terms of creating wrappers and supported key types.
-//
-// On the other hand, `hashlink` has good download stats and appears to be maintained.
-use hashlink::{linked_hash_map::RawEntryMut, LruCache};
-
-use super::{common::Cached, *};
-
-/// An implementation of timed LRU cache with fixed capacity.
-/// Key properties:
-///
-/// * Whenever a new entry is inserted, the least recently accessed one is evicted.
-///   The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
-///
-/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
-///   If the entry has expired, we remove it from the cache; Otherwise we bump the
-///   expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
-///   its existence.
-///
-/// * There's an API for immediate invalidation (removal) of a cache entry;
-///   It's useful in case we know for sure that the entry is no longer correct.
-///   See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
-///
-/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
-///   or by a successful lookup (i.e. the entry hasn't expired yet).
-///   There is no background job to reap the expired records.
-///
-/// * It's possible for an entry that has not yet expired entry to be evicted
-///   before expired items. That's a bit wasteful, but probably fine in practice.
-pub struct TimedLru<K, V> {
-    /// Cache's name for tracing.
-    name: &'static str,
-
-    /// The underlying cache implementation.
-    cache: parking_lot::Mutex<LruCache<K, Entry<V>>>,
-
-    /// Default time-to-live of a single entry.
-    ttl: Duration,
-
-    update_ttl_on_retrieval: bool,
-}
-
-impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
-    type Key = K;
-    type Value = V;
-    type LookupInfo<Key> = LookupInfo<Key>;
-
-    fn invalidate(&self, info: &Self::LookupInfo<K>) {
-        self.invalidate_raw(info)
-    }
-}
-
-struct Entry<T> {
-    created_at: Instant,
-    expires_at: Instant,
-    value: T,
-}
-
-impl<K: Hash + Eq, V> TimedLru<K, V> {
-    /// Construct a new LRU cache with timed entries.
-    pub fn new(
-        name: &'static str,
-        capacity: usize,
-        ttl: Duration,
-        update_ttl_on_retrieval: bool,
-    ) -> Self {
-        Self {
-            name,
-            cache: LruCache::new(capacity).into(),
-            ttl,
-            update_ttl_on_retrieval,
-        }
-    }
-
-    /// Drop an entry from the cache if it's outdated.
-    #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
-    fn invalidate_raw(&self, info: &LookupInfo<K>) {
-        let now = Instant::now();
-
-        // Do costly things before taking the lock.
-        let mut cache = self.cache.lock();
-        let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
-            RawEntryMut::Vacant(_) => return,
-            RawEntryMut::Occupied(x) => x,
-        };
-
-        // Remove the entry if it was created prior to lookup timestamp.
-        let entry = raw_entry.get();
-        let (created_at, expires_at) = (entry.created_at, entry.expires_at);
-        let should_remove = created_at <= info.created_at || expires_at <= now;
-
-        if should_remove {
-            raw_entry.remove();
-        }
-
-        drop(cache); // drop lock before logging
-        debug!(
-            created_at = format_args!("{created_at:?}"),
-            expires_at = format_args!("{expires_at:?}"),
-            entry_removed = should_remove,
-            "processed a cache entry invalidation event"
-        );
-    }
-
-    /// Try retrieving an entry by its key, then execute `extract` if it exists.
-    #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
-    fn get_raw<Q, R>(&self, key: &Q, extract: impl FnOnce(&K, &Entry<V>) -> R) -> Option<R>
-    where
-        K: Borrow<Q>,
-        Q: Hash + Eq + ?Sized,
-    {
-        let now = Instant::now();
-        let deadline = now.checked_add(self.ttl).expect("time overflow");
-
-        // Do costly things before taking the lock.
-        let mut cache = self.cache.lock();
-        let mut raw_entry = match cache.raw_entry_mut().from_key(key) {
-            RawEntryMut::Vacant(_) => return None,
-            RawEntryMut::Occupied(x) => x,
-        };
-
-        // Immeditely drop the entry if it has expired.
-        let entry = raw_entry.get();
-        if entry.expires_at <= now {
-            raw_entry.remove();
-            return None;
-        }
-
-        let value = extract(raw_entry.key(), entry);
-        let (created_at, expires_at) = (entry.created_at, entry.expires_at);
-
-        // Update the deadline and the entry's position in the LRU list.
-        if self.update_ttl_on_retrieval {
-            raw_entry.get_mut().expires_at = deadline;
-        }
-        raw_entry.to_back();
-
-        drop(cache); // drop lock before logging
-        debug!(
-            created_at = format_args!("{created_at:?}"),
-            old_expires_at = format_args!("{expires_at:?}"),
-            new_expires_at = format_args!("{deadline:?}"),
-            "accessed a cache entry"
-        );
-
-        Some(value)
-    }
-
-    /// Insert an entry to the cache. If an entry with the same key already
-    /// existed, return the previous value and its creation timestamp.
-    #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
-    fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
-        let created_at = Instant::now();
-        let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
-
-        let entry = Entry {
-            created_at,
-            expires_at,
-            value,
-        };
-
-        // Do costly things before taking the lock.
-        let old = self
-            .cache
-            .lock()
-            .insert(key, entry)
-            .map(|entry| entry.value);
-
-        debug!(
-            created_at = format_args!("{created_at:?}"),
-            expires_at = format_args!("{expires_at:?}"),
-            replaced = old.is_some(),
-            "created a cache entry"
-        );
-
-        (created_at, old)
-    }
-}
-
-impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
-    pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
-        let (created_at, old) = self.insert_raw(key.clone(), value.clone());
-
-        let cached = Cached {
-            token: Some((self, LookupInfo { created_at, key })),
-            value,
-        };
-
-        (old, cached)
-    }
-}
-
-impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
-    /// Retrieve a cached entry in convenient wrapper.
-    pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
-    where
-        K: Borrow<Q> + Clone,
-        Q: Hash + Eq + ?Sized,
-    {
-        self.get_raw(key, |key, entry| {
-            let info = LookupInfo {
-                created_at: entry.created_at,
-                key: key.clone(),
-            };
-
-            Cached {
-                token: Some((self, info)),
-                value: entry.value.clone(),
-            }
-        })
-    }
-
-    /// Retrieve a cached entry in convenient wrapper, ignoring its TTL.
-    pub fn get_ignoring_ttl<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
-    where
-        K: Borrow<Q>,
-        Q: Hash + Eq + ?Sized,
-    {
-        let mut cache = self.cache.lock();
-        cache
-            .get(key)
-            .map(|entry| Cached::new_uncached(entry.value.clone()))
-    }
-
-    /// Remove an entry from the cache.
-    pub fn remove<Q>(&self, key: &Q) -> Option<V>
-    where
-        K: Borrow<Q> + Clone,
-        Q: Hash + Eq + ?Sized,
-    {
-        let mut cache = self.cache.lock();
-        cache.remove(key).map(|entry| entry.value)
-    }
-}
-
-/// Lookup information for key invalidation.
-pub struct LookupInfo<K> {
-    /// Time of creation of a cache [`Entry`].
-    /// We use this during invalidation lookups to prevent eviction of a newer
-    /// entry sharing the same key (it might've been inserted by a different
-    /// task after we got the entry we're trying to invalidate now).
-    created_at: Instant,
-
-    /// Search by this key.
-    key: K,
-}
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,7 +1,6 @@
 use crate::{
    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
-    proxy::neon_option,
+    error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -39,17 +38,7 @@ impl UserFacingError for ConnectionError {
            // This helps us drop irrelevant library-specific prefixes.
            // TODO: propagate severity level and other parameters.
            Postgres(err) => match err.as_db_error() {
-                Some(err) => {
-                    let msg = err.message();
-
-                    if msg.starts_with("unsupported startup parameter: ")
-                        || msg.starts_with("unsupported startup parameter in options: ")
-                    {
-                        format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter")
-                    } else {
-                        msg.to_owned()
-                    }
-                }
+                Some(err) => err.message().to_owned(),
                None => err.to_string(),
            },
            WakeComputeError(err) => err.to_string_client(),
@@ -243,9 +232,9 @@ impl ConnCfg {
    /// Connect to a corresponding compute node.
    pub async fn connect(
        &self,
-        ctx: &mut RequestMonitoring,
        allow_self_signed_compute: bool,
        timeout: Duration,
+        proto: &'static str,
    ) -> Result<PostgresConnection, ConnectionError> {
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;

@@ -279,9 +268,7 @@ impl ConnCfg {
            stream,
            params,
            cancel_closure,
-            _guage: NUM_DB_CONNECTIONS_GAUGE
-                .with_label_values(&[ctx.protocol])
-                .guard(),
+            _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
        };

        Ok(connection)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Arthur Petukhovsky	f2e98b2b0a	Write WAL bytes to data dir	2023-12-28 18:49:51 +00:00
Arthur Petukhovsky	0ba55719b0	Check bytes in segment intersection	2023-12-28 17:55:04 +00:00
Arthur Petukhovsky	b162b4a9cf	Fix clap arg	2023-12-28 17:10:05 +00:00
Arthur Petukhovsky	bbc8da687e	Add dryrun version of the script	2023-12-28 16:55:21 +00:00