Merge pull request #6057 from neondatabase/vk/patch_timescale_for_production

Revert timescaledb for pg14 and pg15 (#6056)
Revert timescaledb for pg14 and pg15 (#6056)
2026-05-18 05:30:37 +00:00 · 2023-12-06 16:21:16 +01:00 · 2023-12-06 16:14:07 +01:00 · 2023-12-05 17:03:28 +02:00 · 2023-12-04 12:36:19 -08:00 · 2023-12-04 11:41:27 +02:00
178 changed files with 4312 additions and 8224 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -199,10 +199,6 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done

      - name: Checkout
        uses: actions/checkout@v3
@@ -1101,10 +1097,6 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done

      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -142,10 +142,6 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done

      - name: Checkout
        uses: actions/checkout@v4
@@ -242,20 +238,6 @@ jobs:
      options: --init

    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
      - name: Checkout
        uses: actions/checkout@v4
        with:
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,3 @@ test_output/
 *.o
 *.so
 *.Po
-
-# pgindent typedef lists
-*.list
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -44,12 +44,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "allocator-api2"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -184,7 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
 dependencies = [
 "concurrent-queue",
- "event-listener 2.5.3",
+ "event-listener",
 "futures-core",
 ]

@@ -205,13 +199,11 @@ dependencies = [

 [[package]]
 name = "async-lock"
-version = "3.2.0"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
+checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
 dependencies = [
- "event-listener 4.0.0",
- "event-listener-strategy",
- "pin-project-lite",
+ "event-listener",
 ]

 [[package]]
@@ -694,9 +686,9 @@ dependencies = [

 [[package]]
 name = "azure_core"
-version = "0.18.0"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
+checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467"
 dependencies = [
 "async-trait",
 "base64 0.21.1",
@@ -704,10 +696,8 @@ dependencies = [
 "dyn-clone",
 "futures",
 "getrandom 0.2.11",
- "hmac",
 "http-types",
 "log",
- "once_cell",
 "paste",
 "pin-project",
 "quick-xml",
@@ -716,7 +706,6 @@ dependencies = [
 "rustc_version",
 "serde",
 "serde_json",
- "sha2",
 "time",
 "url",
 "uuid",
@@ -724,9 +713,9 @@ dependencies = [

 [[package]]
 name = "azure_identity"
-version = "0.18.1"
+version = "0.16.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
+checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -736,6 +725,7 @@ dependencies = [
 "oauth2",
 "pin-project",
 "serde",
+ "serde_json",
 "time",
 "tz-rs",
 "url",
@@ -744,18 +734,21 @@ dependencies = [

 [[package]]
 name = "azure_storage"
-version = "0.18.0"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
+checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97"
 dependencies = [
 "RustyXML",
- "async-lock",
 "async-trait",
 "azure_core",
 "bytes",
+ "futures",
+ "hmac",
 "log",
 "serde",
 "serde_derive",
+ "serde_json",
+ "sha2",
 "time",
 "url",
 "uuid",
@@ -763,14 +756,13 @@ dependencies = [

 [[package]]
 name = "azure_storage_blobs"
-version = "0.18.0"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
+checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4"
 dependencies = [
 "RustyXML",
 "azure_core",
 "azure_storage",
- "azure_svc_blobstorage",
 "bytes",
 "futures",
 "log",
@@ -782,22 +774,6 @@ dependencies = [
 "uuid",
 ]

-[[package]]
-name = "azure_svc_blobstorage"
-version = "0.18.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
-dependencies = [
- "azure_core",
- "bytes",
- "futures",
- "log",
- "once_cell",
- "serde",
- "serde_json",
- "time",
-]
-
 [[package]]
 name = "backtrace"
 version = "0.3.67"
@@ -914,7 +890,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
 "memchr",
 "once_cell",
- "regex-automata 0.1.10",
+ "regex-automata",
 "serde",
 ]

@@ -1704,27 +1680,6 @@ version = "2.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"

-[[package]]
-name = "event-listener"
-version = "4.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
-dependencies = [
- "concurrent-queue",
- "parking",
- "pin-project-lite",
-]
-
-[[package]]
-name = "event-listener-strategy"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
-dependencies = [
- "event-listener 4.0.0",
- "pin-project-lite",
-]
-
 [[package]]
 name = "fail"
 version = "0.5.1"
@@ -2087,10 +2042,6 @@ name = "hashbrown"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
-dependencies = [
- "ahash",
- "allocator-api2",
-]

 [[package]]
 name = "hashlink"
@@ -2582,7 +2533,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
 dependencies = [
- "regex-automata 0.1.10",
+ "regex-automata",
 ]

 [[package]]
@@ -2608,9 +2559,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "memchr"
-version = "2.6.4"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"

 [[package]]
 name = "memoffset"
@@ -2683,14 +2634,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.10"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
+checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
 dependencies = [
 "libc",
 "log",
 "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.48.0",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -3103,7 +3054,6 @@ dependencies = [
 "humantime-serde",
 "hyper",
 "itertools",
- "md5",
 "metrics",
 "nix 0.26.2",
 "num-traits",
@@ -3694,7 +3644,7 @@ dependencies = [
 "serde_json",
 "sha2",
 "smol_str",
- "socket2 0.5.5",
+ "socket2 0.5.3",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -3718,9 +3668,9 @@ dependencies = [

 [[package]]
 name = "quick-xml"
-version = "0.31.0"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
+checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
 dependencies = [
 "memchr",
 "serde",
@@ -3860,14 +3810,13 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.10.2"
+version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
 dependencies = [
 "aho-corasick",
 "memchr",
- "regex-automata 0.4.3",
- "regex-syntax 0.8.2",
+ "regex-syntax 0.7.2",
 ]

 [[package]]
@@ -3879,17 +3828,6 @@ dependencies = [
 "regex-syntax 0.6.29",
 ]

-[[package]]
-name = "regex-automata"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
-dependencies = [
- "aho-corasick",
- "memchr",
- "regex-syntax 0.8.2",
-]
-
 [[package]]
 name = "regex-syntax"
 version = "0.6.29"
@@ -3898,9 +3836,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"

 [[package]]
 name = "regex-syntax"
-version = "0.8.2"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
+checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"

 [[package]]
 name = "relative-path"
@@ -3926,7 +3864,6 @@ dependencies = [
 "bytes",
 "camino",
 "camino-tempfile",
- "futures",
 "futures-util",
 "http-types",
 "hyper",
@@ -4354,7 +4291,6 @@ dependencies = [
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
- "tokio-util",
 "toml_edit",
 "tracing",
 "url",
@@ -4795,9 +4731,9 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.5"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
+checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
 dependencies = [
 "libc",
 "windows-sys 0.48.0",
@@ -5144,18 +5080,18 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.34.0"
+version = "1.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
+checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
 dependencies = [
- "backtrace",
+ "autocfg",
 "bytes",
 "libc",
 "mio",
 "num_cpus",
 "pin-project-lite",
 "signal-hook-registry",
- "socket2 0.5.5",
+ "socket2 0.4.9",
 "tokio-macros",
 "windows-sys 0.48.0",
 ]
@@ -5172,9 +5108,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.2.0"
+version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
+checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5209,7 +5145,7 @@ dependencies = [
 "pin-project-lite",
 "postgres-protocol",
 "postgres-types",
- "socket2 0.5.5",
+ "socket2 0.5.3",
 "tokio",
 "tokio-util",
 ]
@@ -5278,16 +5214,13 @@ dependencies = [

 [[package]]
 name = "tokio-util"
-version = "0.7.10"
+version = "0.7.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
+checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
 dependencies = [
 "bytes",
 "futures-core",
- "futures-io",
 "futures-sink",
- "futures-util",
- "hashbrown 0.14.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -5765,7 +5698,6 @@ dependencies = [
 "serde",
 "serde_assert",
 "serde_json",
- "serde_path_to_error",
 "serde_with",
 "signal-hook",
 "strum",
@@ -6284,8 +6216,7 @@ dependencies = [
 "prost",
 "rand 0.8.5",
 "regex",
- "regex-automata 0.4.3",
- "regex-syntax 0.8.2",
+ "regex-syntax 0.7.2",
 "reqwest",
 "ring 0.16.20",
 "rustls",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,10 +38,10 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.18"
-azure_identity = "0.18"
-azure_storage = "0.18"
-azure_storage_blobs = "0.18"
+azure_core = "0.16"
+azure_identity = "0.16"
+azure_storage = "0.16"
+azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.10.2"
+regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7.10", features = ["io", "rt"] }
+tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
--- a/Makefile
+++ b/Makefile
@@ -260,44 +260,6 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

-postgres-%-pg-bsd-indent: postgres-%
-	+@echo "Compiling pg_bsd_indent"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
-
-# Create typedef list for the core. Note that generally it should be combined with
-# buildfarm one to cover platform specific stuff.
-# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
-postgres-%-typedefs.list: postgres-%
-	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
-
-# Indent postgres. See src/tools/pgindent/README for details.
-.PHONY: postgres-%-pgindent
-postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
-	+@echo merge with buildfarm typedef to cover all platforms
-	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
-		REL_16_STABLE list misses PGSemaphoreData
-	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
-	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
-	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
-		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
-	+@echo note: you might want to run it on selected files/dirs instead.
-	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
-		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
-		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
-		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	rm -f pg*.BAK
-
-# Indent pxgn/neon.
-.PHONY: pgindent
-neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
-		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
-
-
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,13 +274,7 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            // Notify others that Postgres failed to start. In case of configuring the
-            // empty compute, it's likely that API handler is still waiting for compute
-            // state change. With this we will notify it that compute is in Failed state,
-            // so control plane will know about it earlier and record proper error instead
-            // of timeout.
-            compute.state_changed.notify_all();
-            drop(state); // unlock
+            drop(state);
            delay_exit = true;
            None
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, RemotePath};
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -277,17 +277,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
-    /// Check that compute node has corresponding feature enabled.
-    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
-        let state = self.state.lock().unwrap();
-
-        if let Some(s) = state.pspec.as_ref() {
-            s.spec.features.contains(&feature)
-        } else {
-            false
-        }
-    }
-
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
+        .query(
+            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
+            &[],
+        )?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
+            replication: Some(row.get("rolreplication")),
+            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -252,6 +252,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
+                || !r.bypassrls.unwrap_or(false)
+                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -283,22 +285,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                // This can be run on /every/ role! Not just ones created through the console.
-                // This means that if you add some funny ALTER here that adds a permission,
-                // this will get run even on user-created roles! This will result in different
-                // behavior before and after a spec gets reapplied. The below ALTER as it stands
-                // now only grants LOGIN and changes the password. Please do not allow this branch
-                // to do anything silly.
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
+                let mut query: String =
+                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                // This branch only runs when roles are created through the console, so it is
-                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -201,12 +201,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
        // TODO(sharding): make this shard-aware
        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
-            tracing::info!(
-                "handle_validate: {}(gen {}): valid={valid} (latest {})",
-                req_tenant.id,
-                req_tenant.gen,
-                tenant_state.generation
-            );
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
                valid,
@@ -256,13 +250,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

-    tracing::info!(
-        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
-        attach_req.tenant_id,
-        tenant_state.generation,
-        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
-    );
-
    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -168,7 +168,7 @@ fn print_timelines_tree(
                    info: t.clone(),
                    children: BTreeSet::new(),
                    name: timeline_name_mappings
-                        .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
+                        .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
                },
            )
        })
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -519,7 +519,6 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
-            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -407,7 +407,6 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_feedback' as bool")?,
-            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
        };

        let request = models::TenantCreateRequest {
@@ -505,7 +504,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'gc_feedback' as bool")?,
-                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
            }
        };

--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -165,7 +165,7 @@ pub fn migrate_tenant(
        let found = other_ps_tenants
            .into_iter()
            .map(|t| t.id)
-            .any(|i| i.tenant_id == tenant_id);
+            .any(|i| i == tenant_id);
        if !found {
            continue;
        }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,13 +26,6 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
-
-    /// Compute features to enable. These feature flags are provided, when we
-    /// know all the details about client's compute, so they cannot be used
-    /// to change `Empty` compute behavior.
-    #[serde(default)]
-    pub features: Vec<ComputeFeature>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

-/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
-#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ComputeFeature {
-    // XXX: Add more feature flags here.
-
-    // This is a special feature flag that is used to represent unknown feature flags.
-    // Basically all unknown to enum flags are represented as this one. See unit test
-    // `parse_unknown_features()` for more details.
-    #[serde(other)]
-    UnknownFeature,
-}
-
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -207,6 +187,8 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
+    pub replication: Option<bool>,
+    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

@@ -247,10 +229,7 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
-
-        // Features list defaults to empty vector.
-        assert!(spec.features.is_empty());
+        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }

    #[test]
@@ -262,22 +241,4 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
-
-    #[test]
-    fn parse_unknown_features() {
-        // Test that unknown feature flags do not cause any errors.
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-
-        // Add unknown feature flags.
-        let features = vec!["foo_bar_feature", "baz_feature"];
-        ob.insert("features".into(), features.into());
-
-        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
-
-        assert!(spec.features.len() == 2);
-        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
-        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
-    }
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -3,11 +3,8 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]
-
 use once_cell::sync::Lazy;
-use prometheus::core::{
-    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
-};
+use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
@@ -135,137 +132,3 @@ fn get_rusage_stats() -> libc::rusage {
        rusage.assume_init()
    }
 }
-
-/// Create an [`IntCounterPairVec`] and registers to default registry.
-#[macro_export(local_inner_macros)]
-macro_rules! register_int_counter_pair_vec {
-    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
-        match (
-            $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
-            $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
-        ) {
-            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
-            (Err(e), _) | (_, Err(e)) => Err(e),
-        }
-    }};
-}
-/// Create an [`IntCounterPair`] and registers to default registry.
-#[macro_export(local_inner_macros)]
-macro_rules! register_int_counter_pair {
-    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
-        match (
-            $crate::register_int_counter!($NAME1, $HELP1),
-            $crate::register_int_counter!($NAME2, $HELP2),
-        ) {
-            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
-            (Err(e), _) | (_, Err(e)) => Err(e),
-        }
-    }};
-}
-
-/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
-pub struct GenericCounterPairVec<P: Atomic> {
-    inc: GenericCounterVec<P>,
-    dec: GenericCounterVec<P>,
-}
-
-/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
-pub struct GenericCounterPair<P: Atomic> {
-    inc: GenericCounter<P>,
-    dec: GenericCounter<P>,
-}
-
-impl<P: Atomic> GenericCounterPairVec<P> {
-    pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
-        Self { inc, dec }
-    }
-
-    /// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
-    /// of label values (same order as the VariableLabels in Desc). If that combination of
-    /// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
-    ///
-    /// An error is returned if the number of label values is not the same as the
-    /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
-        Ok(GenericCounterPair {
-            inc: self.inc.get_metric_with_label_values(vals)?,
-            dec: self.dec.get_metric_with_label_values(vals)?,
-        })
-    }
-
-    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
-    /// occurs.
-    pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
-        self.get_metric_with_label_values(vals).unwrap()
-    }
-}
-
-impl<P: Atomic> GenericCounterPair<P> {
-    pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
-        Self { inc, dec }
-    }
-
-    /// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
-    pub fn guard(&self) -> GenericCounterPairGuard<P> {
-        self.inc.inc();
-        GenericCounterPairGuard(self.dec.clone())
-    }
-
-    /// Increment the gauge by n, returning a guard that decrements by n on drop.
-    pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
-        self.inc.inc_by(n);
-        GenericCounterPairGuardBy(self.dec.clone(), n)
-    }
-
-    /// Increase the gauge by 1.
-    #[inline]
-    pub fn inc(&self) {
-        self.inc.inc();
-    }
-
-    /// Decrease the gauge by 1.
-    #[inline]
-    pub fn dec(&self) {
-        self.dec.inc();
-    }
-
-    /// Add the given value to the gauge. (The value can be
-    /// negative, resulting in a decrement of the gauge.)
-    #[inline]
-    pub fn inc_by(&self, v: P::T) {
-        self.inc.inc_by(v);
-    }
-
-    /// Subtract the given value from the gauge. (The value can be
-    /// negative, resulting in an increment of the gauge.)
-    #[inline]
-    pub fn dec_by(&self, v: P::T) {
-        self.dec.inc_by(v);
-    }
-}
-
-/// Guard returned by [`GenericCounterPair::guard`]
-pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
-
-impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
-    fn drop(&mut self) {
-        self.0.inc();
-    }
-}
-/// Guard returned by [`GenericCounterPair::guard_by`]
-pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
-
-impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
-    fn drop(&mut self) {
-        self.0.inc_by(self.1);
-    }
-}
-
-/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
-pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
-
-/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
-pub type IntCounterPair = GenericCounterPair<AtomicU64>;
-
-/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
-pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -140,7 +140,3 @@ impl Key {
        })
    }
 }
-
-pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -237,7 +237,6 @@ pub struct TenantConfig {
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub gc_feedback: Option<bool>,
-    pub heatmap_period: Option<String>,
 }

 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
@@ -324,7 +323,6 @@ impl TenantConfigRequest {

 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
-    #[serde(default)]
    pub config: TenantAttachConfig,
    #[serde(default)]
    pub generation: Option<u32>,
@@ -332,7 +330,7 @@ pub struct TenantAttachRequest {

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize, Default)]
+#[derive(Debug, Serialize, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -358,7 +356,7 @@ pub enum TenantAttachmentStatus {

 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantShardId,
+    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
@@ -370,7 +368,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantShardId,
+    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option<TimelineId>,
@@ -386,9 +384,6 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

-    /// The LSN from the start of the root timeline (never changes)
-    pub initdb_lsn: Lsn,
-
    pub current_logical_size: u64,
    pub current_logical_size_is_accurate: bool,

@@ -827,7 +822,7 @@ mod tests {
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
-            id: TenantShardId::unsharded(TenantId::generate()),
+            id: TenantId::generate(),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
@@ -844,7 +839,7 @@ mod tests {
        });

        let original_broken = TenantInfo {
-            id: TenantShardId::unsharded(TenantId::generate()),
+            id: TenantId::generate(),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,5 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -73,33 +72,19 @@ impl TenantShardId {
        )
    }

-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-}
-
-/// Formatting helper
-struct ShardSlug<'a>(&'a TenantShardId);
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
+    pub fn shard_slug(&self) -> String {
+        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
 }

 impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+            write!(
+                f,
+                "{}-{:02x}{:02x}",
+                self.tenant_id, self.shard_number.0, self.shard_count.0
+            )
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
            // is distinct from the normal single shard case (shard count == 1).
@@ -317,8 +302,6 @@ pub struct ShardStripeSize(pub u32);
 pub struct ShardLayout(u8);

 const LAYOUT_V1: ShardLayout = ShardLayout(1);
-/// ShardIdentity uses a magic layout value to indicate if it is unusable
-const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
@@ -327,10 +310,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
+    pub layout: ShardLayout,
    pub number: ShardNumber,
    pub count: ShardCount,
-    stripe_size: ShardStripeSize,
-    layout: ShardLayout,
+    pub stripe_size: ShardStripeSize,
 }

 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -356,22 +339,6 @@ impl ShardIdentity {
        }
    }

-    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
-    /// which are constructed in code paths that don't have access to proper configuration.
-    ///
-    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
-    /// Enforcement is via assertions, to avoid making our interface fallible for this
-    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
-    /// state, and by extension to avoid trying to do any page->shard resolution.
-    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            number,
-            count,
-            layout: LAYOUT_BROKEN,
-            stripe_size: DEFAULT_STRIPE_SIZE,
-        }
-    }
-
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -398,39 +365,6 @@ impl ShardIdentity {
            })
        }
    }
-
-    fn is_broken(&self) -> bool {
-        self.layout == LAYOUT_BROKEN
-    }
-
-    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
-        assert!(!self.is_broken());
-        key_to_shard_number(self.count, self.stripe_size, key)
-    }
-
-    /// Return true if the key should be ingested by this shard
-    pub fn is_key_local(&self, key: &Key) -> bool {
-        assert!(!self.is_broken());
-        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
-            true
-        } else {
-            key_to_shard_number(self.count, self.stripe_size, key) == self.number
-        }
-    }
-
-    pub fn shard_slug(&self) -> String {
-        if self.count > ShardCount(0) {
-            format!("-{:02x}{:02x}", self.number.0, self.count.0)
-        } else {
-            String::new()
-        }
-    }
-
-    /// Convenience for checking if this identity is the 0th shard in a tenant,
-    /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_zero(&self) -> bool {
-        self.number == ShardNumber(0)
-    }
 }

 impl Serialize for ShardIndex {
@@ -504,65 +438,6 @@ impl<'de> Deserialize<'de> for ShardIndex {
    }
 }

-/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
-/// in order to be able to serve basebackup requests without peer communication).
-fn key_is_shard0(key: &Key) -> bool {
-    // To decide what to shard out to shards >0, we apply a simple rule that only
-    // relation pages are distributed to shards other than shard zero. Everything else gets
-    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
-    // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    // all metadata.
-    // - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn hash_combine(mut a: u32, mut b: u32) -> u32 {
-    b = b.wrapping_add(0x9e3779b9);
-    b = b.wrapping_add(a << 6);
-    b = b.wrapping_add(a >> 2);
-
-    a ^= b;
-    a
-}
-
-/// Where a Key is to be distributed across shards, select the shard.  This function
-/// does not account for keys that should be broadcast across shards.
-///
-/// The hashing in this function must exactly match what we do in postgres smgr
-/// code.  The resulting distribution of pages is intended to preserve locality within
-/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
-/// distributing data pseudo-randomly.
-///
-/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
-/// and will be handled at higher levels when shards are split.
-fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
-    // Fast path for un-sharded tenants or broadcast keys
-    if count < ShardCount(2) || key_is_shard0(key) {
-        return ShardNumber(0);
-    }
-
-    // relNode
-    let mut hash = murmurhash32(key.field4);
-    // blockNum/stripe size
-    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
-
-    ShardNumber((hash % count.0 as u32) as u8)
-}
-
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
@@ -734,29 +609,4 @@ mod tests {

        Ok(())
    }
-
-    // These are only smoke tests to spot check that our implementation doesn't
-    // deviate from a few examples values: not aiming to validate the overall
-    // hashing algorithm.
-    #[test]
-    fn murmur_hash() {
-        assert_eq!(murmurhash32(0), 0);
-
-        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
-    }
-
-    #[test]
-    fn shard_mapping() {
-        let key = Key {
-            field1: 0x00,
-            field2: 0x67f,
-            field3: 0x5,
-            field4: 0x400c,
-            field5: 0x00,
-            field6: 0x7d06,
-        };
-
-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
-        assert_eq!(shard, ShardNumber(8));
-    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,10 +975,4 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
-
-    #[test]
-    fn parse_fe_startup_packet_regression() {
-        let data = [0, 0, 0, 7, 0, 0, 0, 0];
-        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
-    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -16,11 +16,10 @@ aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
-futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util = { workspace = true, features = ["compat"] }
+tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,24 +1,21 @@
 //! Azure Blob Storage wrapper

-use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
-use std::pin::Pin;
 use std::sync::Arc;
+use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
-use bytes::Bytes;
-use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::StatusCode;
+use tokio::io::AsyncRead;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -52,8 +49,7 @@ impl AzureBlobStorage {
            StorageCredentials::token_credential(Arc::new(token_credential))
        };

-        // we have an outer retry
-        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
+        let builder = ClientBuilder::new(account, credentials);

        let client = builder.container_client(azure_config.container_name.to_owned());

@@ -120,8 +116,7 @@ impl AzureBlobStorage {
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
-
-        let mut bufs = Vec::new();
+        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
            if let Some(blob_meta) = part.blob.metadata {
@@ -132,10 +127,10 @@ impl AzureBlobStorage {
                .collect()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;
-            bufs.push(data);
+            buf.extend_from_slice(&data.slice(..));
        }
        Ok(Download {
-            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+            download_stream: Box::pin(Cursor::new(buf)),
            metadata: Some(StorageMetadata(metadata)),
        })
    }
@@ -222,10 +217,9 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(res)
    }
-
    async fn upload(
        &self,
-        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -233,12 +227,13 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Put).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
-            Box::pin(from);
-
-        let from = NonSeekableStream::new(from, data_size_bytes);
-
-        let body = azure_core::Body::SeekableStream(Box::new(from));
+        // TODO FIX THIS UGLY HACK and don't buffer the entire object
+        // into RAM here, but use the streaming interface. For that,
+        // we'd have to change the interface though...
+        // https://github.com/neondatabase/neon/issues/5563
+        let mut buf = Vec::with_capacity(data_size_bytes);
+        tokio::io::copy(&mut from, &mut buf).await?;
+        let body = azure_core::Body::Bytes(buf.into());

        let mut builder = blob_client.put_block_blob(body);

@@ -271,12 +266,17 @@ impl RemoteStorage for AzureBlobStorage {

        let mut builder = blob_client.get();

-        let range: Range = if let Some(end_exclusive) = end_exclusive {
-            (start_inclusive..end_exclusive).into()
+        if let Some(end_exclusive) = end_exclusive {
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        } else {
-            (start_inclusive..).into()
-        };
-        builder = builder.range(range);
+            // Open ranges are not supported by the SDK so we work around
+            // by setting the upper limit extremely high (but low enough
+            // to still be representable by signed 64 bit integers).
+            // TODO remove workaround once the SDK adds open range support
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
+            let end_exclusive = u64::MAX / 4;
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        }

        self.download_for_builder(builder).await
    }
@@ -312,153 +312,3 @@ impl RemoteStorage for AzureBlobStorage {
        Ok(())
    }
 }
-
-pin_project_lite::pin_project! {
-    /// Hack to work around not being able to stream once with azure sdk.
-    ///
-    /// Azure sdk clones streams around with the assumption that they are like
-    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
-    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
-    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
-    /// seekable, but we can also just re-try the request easier.
-    #[project = NonSeekableStreamProj]
-    enum NonSeekableStream<S> {
-        /// A stream wrappers initial form.
-        ///
-        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
-        /// clone before first request, then this must be changed.
-        Initial {
-            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
-            len: usize,
-        },
-        /// The actually readable variant, produced by cloning the Initial variant.
-        ///
-        /// The sdk currently always clones once, even without retry policy.
-        Actual {
-            #[pin]
-            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
-            len: usize,
-            read_any: bool,
-        },
-        /// Most likely unneeded, but left to make life easier, in case more clones are added.
-        Cloned {
-            len_was: usize,
-        }
-    }
-}
-
-impl<S> NonSeekableStream<S>
-where
-    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-{
-    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
-        use tokio_util::compat::TokioAsyncReadCompatExt;
-
-        let inner = tokio_util::io::StreamReader::new(inner).compat();
-        let inner = Some(inner);
-        let inner = std::sync::Mutex::new(inner);
-        NonSeekableStream::Initial { inner, len }
-    }
-}
-
-impl<S> std::fmt::Debug for NonSeekableStream<S> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
-            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
-            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
-        }
-    }
-}
-
-impl<S> futures::io::AsyncRead for NonSeekableStream<S>
-where
-    S: Stream<Item = std::io::Result<Bytes>>,
-{
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut [u8],
-    ) -> std::task::Poll<std::io::Result<usize>> {
-        match self.project() {
-            NonSeekableStreamProj::Actual {
-                inner, read_any, ..
-            } => {
-                *read_any = true;
-                inner.poll_read(cx, buf)
-            }
-            // NonSeekableStream::Initial does not support reading because it is just much easier
-            // to have the mutex in place where one does not poll the contents, or that's how it
-            // seemed originally. If there is a version upgrade which changes the cloning, then
-            // that support needs to be hacked in.
-            //
-            // including {self:?} into the message would be useful, but unsure how to unproject.
-            _ => std::task::Poll::Ready(Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "cloned or initial values cannot be read",
-            ))),
-        }
-    }
-}
-
-impl<S> Clone for NonSeekableStream<S> {
-    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
-    /// request, see type documentation.
-    fn clone(&self) -> Self {
-        use NonSeekableStream::*;
-
-        match self {
-            Initial { inner, len } => {
-                if let Some(inner) = inner.lock().unwrap().take() {
-                    Actual {
-                        inner,
-                        len: *len,
-                        read_any: false,
-                    }
-                } else {
-                    Self::Cloned { len_was: *len }
-                }
-            }
-            Actual { len, .. } => Cloned { len_was: *len },
-            Cloned { len_was } => Cloned { len_was: *len_was },
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl<S> azure_core::SeekableStream for NonSeekableStream<S>
-where
-    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
-{
-    async fn reset(&mut self) -> azure_core::error::Result<()> {
-        use NonSeekableStream::*;
-
-        let msg = match self {
-            Initial { inner, .. } => {
-                if inner.get_mut().unwrap().is_some() {
-                    return Ok(());
-                } else {
-                    "reset after first clone is not supported"
-                }
-            }
-            Actual { read_any, .. } if !*read_any => return Ok(()),
-            Actual { .. } => "reset after reading is not supported",
-            Cloned { .. } => "reset after second clone is not supported",
-        };
-        Err(azure_core::error::Error::new(
-            azure_core::error::ErrorKind::Io,
-            std::io::Error::new(std::io::ErrorKind::Other, msg),
-        ))
-    }
-
-    // Note: it is not documented if this should be the total or remaining length, total passes the
-    // tests.
-    fn len(&self) -> usize {
-        use NonSeekableStream::*;
-        match self {
-            Initial { len, .. } => *len,
-            Actual { len, .. } => *len,
-            Cloned { len_was, .. } => *len_was,
-        }
-    }
-}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,10 +19,8 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

-use bytes::Bytes;
-use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::sync::Semaphore;
+use tokio::{io, sync::Semaphore};
 use toml_edit::Item;
 use tracing::info;

@@ -181,7 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -208,7 +206,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -302,7 +300,7 @@ impl GenericRemoteStorage {

    pub async fn upload(
        &self,
-        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -400,7 +398,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,14 +7,11 @@
 use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
-use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
-use futures::stream::Stream;
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -102,35 +99,27 @@ impl LocalFs {
        };

        // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
+        // Otherwise, we must go up to the parent directory.  This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-        loop {
-            // Did we make it to the root?
-            if initial_dir.parent().is_none() {
-                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
-            }
-
-            match fs::metadata(initial_dir.clone()).await {
-                Ok(meta) if meta.is_dir() => {
-                    // We found a directory, break
-                    break;
-                }
-                Ok(_meta) => {
+        match fs::metadata(full_path.clone()).await {
+            Ok(meta) => {
+                if !meta.is_dir() {
                    // It's not a directory: strip back to the parent
                    initial_dir.pop();
                }
-                Err(e) if e.kind() == ErrorKind::NotFound => {
-                    // It's not a file that exists: strip the prefix back to the parent directory
-                    initial_dir.pop();
-                }
-                Err(e) => {
-                    // Unexpected I/O error
-                    anyhow::bail!(e)
-                }
+            }
+            Err(e) if e.kind() == ErrorKind::NotFound => {
+                // It's not a file that exists: strip the prefix back to the parent directory
+                initial_dir.pop();
+            }
+            Err(e) => {
+                // Unexpected I/O error
+                anyhow::bail!(e)
            }
        }
+
        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
@@ -222,7 +211,7 @@ impl RemoteStorage for LocalFs {

    async fn upload(
        &self,
-        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
+        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -255,12 +244,9 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = data_size_bytes as u64;
-        let data = tokio_util::io::StreamReader::new(data);
-        let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
-        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
+        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
@@ -314,7 +300,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = ReaderStream::new(
+            let source = io::BufReader::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -354,14 +340,16 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = tokio::fs::OpenOptions::new()
-                .read(true)
-                .open(&target_path)
-                .await
-                .with_context(|| {
-                    format!("Failed to open source file {target_path:?} to use in the download")
-                })
-                .map_err(DownloadError::Other)?;
+            let mut source = io::BufReader::new(
+                fs::OpenOptions::new()
+                    .read(true)
+                    .open(&target_path)
+                    .await
+                    .with_context(|| {
+                        format!("Failed to open source file {target_path:?} to use in the download")
+                    })
+                    .map_err(DownloadError::Other)?,
+            );
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -375,13 +363,11 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                    download_stream: Box::pin(ReaderStream::new(
-                        source.take(end_exclusive - start_inclusive),
-                    )),
+                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
                },
                None => Download {
                    metadata,
-                    download_stream: Box::pin(ReaderStream::new(source)),
+                    download_stream: Box::pin(source),
                },
            })
        } else {
@@ -481,9 +467,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

-    use bytes::Bytes;
    use camino_tempfile::tempdir;
-    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_assert_remote_file_contents(
@@ -493,7 +477,7 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-        let download = storage
+        let mut download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -502,9 +486,13 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-        let contents = aggregate(download.download_stream).await?;
-
-        String::from_utf8(contents).map_err(anyhow::Error::new)
+        let mut contents = String::new();
+        download
+            .download_stream
+            .read_to_string(&mut contents)
+            .await
+            .context("Failed to read remote file contents into string")?;
+        Ok(contents)
    }

    #[tokio::test]
@@ -533,26 +521,25 @@ mod fs_tests {
        let storage = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
-        let content = Bytes::from_static(b"12345");
-        let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
+        let content = std::io::Cursor::new(b"12345");

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-            .upload(content(), 0, &id, None)
+            .upload(Box::new(content.clone()), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-            .upload(content(), 4, &id, None)
+            .upload(Box::new(content.clone()), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-            .upload(content(), 6, &id, None)
+            .upload(Box::new(content.clone()), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-        storage.upload(content(), 5, &id, None).await?;
+        storage.upload(Box::new(content), 5, &id, None).await?;

        Ok(())
    }
@@ -600,7 +587,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let first_part_download = storage
+        let mut first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -608,13 +595,21 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let first_part_remote = aggregate(first_part_download.download_stream).await?;
+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut first_part_download.download_stream,
+            &mut first_part_remote,
+        )
+        .await?;
+        first_part_remote.flush().await?;
+        let first_part_remote = first_part_remote.into_inner().into_inner();
        assert_eq!(
-            first_part_local, first_part_remote,
+            first_part_local,
+            first_part_remote.as_slice(),
            "First part bytes should be returned when requested"
        );

-        let second_part_download = storage
+        let mut second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -626,9 +621,17 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let second_part_remote = aggregate(second_part_download.download_stream).await?;
+        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut second_part_download.download_stream,
+            &mut second_part_remote,
+        )
+        .await?;
+        second_part_remote.flush().await?;
+        let second_part_remote = second_part_remote.into_inner().into_inner();
        assert_eq!(
-            second_part_local, second_part_remote,
+            second_part_local,
+            second_part_remote.as_slice(),
            "Second part bytes should be returned when requested"
        );

@@ -718,10 +721,17 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let partial_download_with_metadata = storage
+        let mut partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut partial_download_with_metadata.download_stream,
+            &mut first_part_remote,
+        )
+        .await?;
+        first_part_remote.flush().await?;
+        let first_part_remote = first_part_remote.into_inner().into_inner();
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -797,16 +807,16 @@ mod fs_tests {
                )
            })?;

-        let file = tokio_util::io::ReaderStream::new(file);
-
-        storage.upload(file, size, &relative_path, metadata).await?;
+        storage
+            .upload(Box::new(file), size, &relative_path, metadata)
+            .await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
        path: &Utf8Path,
        contents: &str,
-    ) -> anyhow::Result<(fs::File, usize)> {
+    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -816,7 +826,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-            fs::OpenOptions::new().read(true).open(&path).await?,
+            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
            file_size,
        ))
    }
@@ -830,16 +840,4 @@ mod fs_tests {
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
-
-    async fn aggregate(
-        stream: impl Stream<Item = std::io::Result<Bytes>>,
-    ) -> anyhow::Result<Vec<u8>> {
-        use futures::stream::StreamExt;
-        let mut out = Vec::new();
-        let mut stream = std::pin::pin!(stream);
-        while let Some(res) = stream.next().await {
-            out.extend_from_slice(&res?[..]);
-        }
-        Ok(out)
-    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,14 +4,9 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{
-    borrow::Cow,
-    pin::Pin,
-    sync::Arc,
-    task::{Context, Poll},
-};
+use std::{borrow::Cow, sync::Arc};

-use anyhow::Context as _;
+use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
@@ -33,10 +28,11 @@ use aws_smithy_async::rt::sleep::TokioSleep;

 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
-use bytes::Bytes;
-use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
+use tokio::io::{self, AsyncRead};
+use tokio_util::io::ReaderStream;
+use tracing::debug;

 use super::StorageMetadata;
 use crate::{
@@ -67,7 +63,7 @@ struct GetObjectRequest {
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        tracing::debug!(
+        debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );
@@ -229,15 +225,12 @@ impl S3Bucket {
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
-
-                let body = object_output.body;
-                let body = ByteStreamAsStream::from(body);
-                let body = PermitCarrying::new(permit, body);
-                let body = TimedDownload::new(started_at, body);
-
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(body),
+                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
+                        started_at,
+                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
+                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -250,55 +243,29 @@ impl S3Bucket {
    }
 }

-pin_project_lite::pin_project! {
-    struct ByteStreamAsStream {
-        #[pin]
-        inner: aws_smithy_types::byte_stream::ByteStream
-    }
-}
-
-impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
-    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
-        ByteStreamAsStream { inner }
-    }
-}
-
-impl Stream for ByteStreamAsStream {
-    type Item = std::io::Result<Bytes>;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        // this does the std::io::ErrorKind::Other conversion
-        self.project().inner.poll_next(cx).map_err(|x| x.into())
-    }
-
-    // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
-    // sense and Stream::size_hint does not really
-}
-
 pin_project_lite::pin_project! {
    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct PermitCarrying<S> {
+    struct RatelimitedAsyncRead<S> {
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
 }

-impl<S> PermitCarrying<S> {
+impl<S: AsyncRead> RatelimitedAsyncRead<S> {
    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        Self { permit, inner }
+        RatelimitedAsyncRead { permit, inner }
    }
 }

-impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
-    type Item = <S as Stream>::Item;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        self.project().inner.poll_next(cx)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        this.inner.poll_read(cx, buf)
    }
 }

@@ -318,7 +285,7 @@ pin_project_lite::pin_project! {
    }
 }

-impl<S> TimedDownload<S> {
+impl<S: AsyncRead> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
@@ -328,26 +295,25 @@ impl<S> TimedDownload<S> {
    }
 }

-impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
-    type Item = <S as Stream>::Item;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        use std::task::ready;
-
+impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
        let this = self.project();
+        let before = buf.filled().len();
+        let read = std::task::ready!(this.inner.poll_read(cx, buf));

-        let res = ready!(this.inner.poll_next(cx));
-        match &res {
-            Some(Ok(_)) => {}
-            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
-            None => *this.outcome = metrics::AttemptOutcome::Ok,
+        let read_eof = buf.filled().len() == before;
+
+        match read {
+            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
+            Ok(()) => { /* still in progress */ }
+            Err(_) => *this.outcome = AttemptOutcome::Err,
        }

-        Poll::Ready(res)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+        std::task::Poll::Ready(read)
    }
 }

@@ -412,7 +378,7 @@ impl RemoteStorage for S3Bucket {
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
@@ -437,7 +403,7 @@ impl RemoteStorage for S3Bucket {

    async fn upload(
        &self,
-        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -447,7 +413,7 @@ impl RemoteStorage for S3Bucket {

        let started_at = start_measuring_requests(kind);

-        let body = Body::wrap_stream(from);
+        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let res = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,8 +1,6 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
-use bytes::Bytes;
-use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -110,7 +108,7 @@ impl RemoteStorage for UnreliableWrapper {

    async fn upload(
        &self,
-        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -7,9 +7,7 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
-use bytes::Bytes;
 use camino::Utf8Path;
-use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -182,14 +180,23 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    let data1 = "remote blob data1".as_bytes();
+    let data1_len = data1.len();
+    let data2 = "remote blob data2".as_bytes();
+    let data2_len = data2.len();
+    let data3 = "remote blob data3".as_bytes();
+    let data3_len = data3.len();
+    ctx.client
+        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
+        .await?;

-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+    ctx.client
+        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
+        .await?;

-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    ctx.client
+        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
+        .await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -212,56 +219,53 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+    let data = "remote blob data here".as_bytes();
+    let data_len = data.len() as u64;

-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
-        let mut buf = Vec::new();
-        tokio::io::copy_buf(
-            &mut tokio_util::io::StreamReader::new(dl.download_stream),
-            &mut buf,
-        )
+    ctx.client
+        .upload(std::io::Cursor::new(data), data.len(), &path, None)
        .await?;
+
+    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+        let mut buf = Vec::new();
+        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
        Ok(buf)
    }
    // Normal download request
    let dl = ctx.client.download(&path).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig);
+    assert_eq!(buf, data);

    // Full range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, Some(len as u64))
+        .download_byte_range(&path, 0, Some(data_len))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig);
+    assert_eq!(buf, data);

    // partial range (end specified)
    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
+    assert_eq!(buf, data[4..10]);

    // partial range (end beyond real end)
    let dl = ctx
        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .download_byte_range(&path, 8, Some(data_len * 100))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
+    assert_eq!(buf, data[8..]);

    // Partial range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
+    assert_eq!(buf, data[4..]);

    // Full range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(&buf, &orig);
+    assert_eq!(buf, data);

    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
@@ -500,8 +504,11 @@ async fn upload_azure_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -582,8 +589,11 @@ async fn upload_simple_azure_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -612,32 +622,3 @@ async fn upload_simple_azure_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
-
-// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
-// to binary
-fn upload_stream(
-    content: std::borrow::Cow<'static, [u8]>,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    use std::borrow::Cow;
-
-    let content = match content {
-        Cow::Borrowed(x) => Bytes::from_static(x),
-        Cow::Owned(vec) => Bytes::from(vec),
-    };
-    wrap_stream(content)
-}
-
-fn wrap_stream(
-    content: bytes::Bytes,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    let len = content.len();
-    let content = futures::future::ready(Ok(content));
-
-    (futures::stream::once(content), len)
-}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -7,9 +7,7 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
-use bytes::Bytes;
 use camino::Utf8Path;
-use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -178,14 +176,23 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    let data1 = "remote blob data1".as_bytes();
+    let data1_len = data1.len();
+    let data2 = "remote blob data2".as_bytes();
+    let data2_len = data2.len();
+    let data3 = "remote blob data3".as_bytes();
+    let data3_len = data3.len();
+    ctx.client
+        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
+        .await?;

-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+    ctx.client
+        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
+        .await?;

-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    ctx.client
+        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
+        .await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -425,9 +432,11 @@ async fn upload_s3_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let (data, data_len) =
-                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -508,9 +517,11 @@ async fn upload_simple_s3_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let (data, data_len) =
-                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -539,30 +550,3 @@ async fn upload_simple_s3_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
-
-fn upload_stream(
-    content: std::borrow::Cow<'static, [u8]>,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    use std::borrow::Cow;
-
-    let content = match content {
-        Cow::Borrowed(x) => Bytes::from_static(x),
-        Cow::Owned(vec) => Bytes::from(vec),
-    };
-    wrap_stream(content)
-}
-
-fn wrap_stream(
-    content: bytes::Bytes,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    let len = content.len();
-    let content = futures::future::ready(Ok(content));
-
-    (futures::stream::once(content), len)
-}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -50,8 +50,6 @@ const_format.workspace = true
 # why is it only here? no other crate should use it, streams are rarely needed.
 tokio-stream = { version = "0.1.14" }

-serde_path_to_error.workspace = true
-
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,14 +1,16 @@
-use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
+use std::sync::Arc;
+
+use tokio::sync::{mpsc, Mutex};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(TaskTrackerToken);
+pub struct Completion(mpsc::Sender<()>);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
-pub struct Barrier(TaskTracker);
+pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

 impl Default for Barrier {
    fn default() -> Self {
@@ -19,7 +21,7 @@ impl Default for Barrier {

 impl Barrier {
    pub async fn wait(self) {
-        self.0.wait().await;
+        self.0.lock().await.recv().await;
    }

    pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -31,7 +33,8 @@ impl Barrier {

 impl PartialEq for Barrier {
    fn eq(&self, other: &Self) -> bool {
-        TaskTracker::ptr_eq(&self.0, &other.0)
+        // the Arc holds a concrete type (no `dyn`), so comparing pointers is a reliable identity check
+        Arc::ptr_eq(&self.0, &other.0)
    }
 }

@@ -39,10 +42,8 @@ impl Eq for Barrier {}

 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
-    let tracker = TaskTracker::new();
-    // otherwise wait never exits
-    tracker.close();
-
-    let token = tracker.token();
-    (Completion(token), Barrier(tracker))
+    let (tx, rx) = mpsc::channel::<()>(1);
+    let rx = Mutex::new(rx);
+    let rx = Arc::new(rx);
+    (Completion(tx), Barrier(rx))
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,16 +152,3 @@ impl Debug for Generation {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn generation_gt() {
-        // Important that a None generation compares less than a valid one, during upgrades from
-        // pre-generation systems.
-        assert!(Generation::none() < Generation::new(0));
-        assert!(Generation::none() < Generation::new(1));
-    }
-}
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -25,12 +25,8 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    if body.remaining() == 0 {
        return Ok(None);
    }
-
-    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
-
-    serde_path_to_error::deserialize(&mut deser)
-        // intentionally stringify because the debug version is not helpful in python logs
-        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
+    serde_json::from_reader(body.reader())
+        .context("Failed to parse json request")
        .map(Some)
        .map_err(ApiError::BadRequest)
 }
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,7 +1,6 @@
 use std::str::FromStr;

 use anyhow::Context;
-use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};

@@ -25,48 +24,16 @@ impl LogFormat {
    }
 }

-struct TracingEventCountMetric {
-    error: IntCounter,
-    warn: IntCounter,
-    info: IntCounter,
-    debug: IntCounter,
-    trace: IntCounter,
-}
-
-static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
-    let vec = metrics::register_int_counter_vec!(
+static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+    metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
        &["level"]
    )
-    .expect("failed to define metric");
-    TracingEventCountMetric::new(vec)
+    .expect("failed to define metric")
 });

-impl TracingEventCountMetric {
-    fn new(vec: IntCounterVec) -> Self {
-        Self {
-            error: vec.with_label_values(&["error"]),
-            warn: vec.with_label_values(&["warn"]),
-            info: vec.with_label_values(&["info"]),
-            debug: vec.with_label_values(&["debug"]),
-            trace: vec.with_label_values(&["trace"]),
-        }
-    }
-
-    fn inc_for_level(&self, level: tracing::Level) {
-        let counter = match level {
-            tracing::Level::ERROR => &self.error,
-            tracing::Level::WARN => &self.warn,
-            tracing::Level::INFO => &self.info,
-            tracing::Level::DEBUG => &self.debug,
-            tracing::Level::TRACE => &self.trace,
-        };
-        counter.inc();
-    }
-}
-
-struct TracingEventCountLayer(&'static TracingEventCountMetric);
+struct TracingEventCountLayer(&'static metrics::IntCounterVec);

 impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
 where
@@ -77,7 +44,15 @@ where
        event: &tracing::Event<'_>,
        _ctx: tracing_subscriber::layer::Context<'_, S>,
    ) {
-        self.0.inc_for_level(*event.metadata().level());
+        let level = event.metadata().level();
+        let level = match *level {
+            tracing::Level::ERROR => "error",
+            tracing::Level::WARN => "warn",
+            tracing::Level::INFO => "info",
+            tracing::Level::DEBUG => "debug",
+            tracing::Level::TRACE => "trace",
+        };
+        self.0.with_label_values(&[level]).inc();
    }
 }

@@ -131,9 +106,7 @@ pub fn init(
        };
        log_layer.with_filter(rust_log_env_filter())
    });
-    let r = r.with(
-        TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
-    );
+    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
    match tracing_error_layer_enablement {
        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -284,14 +257,14 @@ impl std::fmt::Debug for SecretString {
 mod tests {
    use metrics::{core::Opts, IntCounterVec};

-    use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
+    use super::TracingEventCountLayer;

    #[test]
    fn tracing_event_count_metric() {
        let counter_vec =
            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
-        let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
-        let layer = TracingEventCountLayer(metric);
+        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
+        let layer = TracingEventCountLayer(counter_vec);
        use tracing_subscriber::prelude::*;

        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new value without blocking
-//! readers. When you update the value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finished, so that on return, no one sees the old value anymore.
+//! finished, so that no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,7 +26,6 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
-//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -37,17 +36,15 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait().await;
-//! # }
+//! waitlist.wait();
 //! ```
 //!
 #![warn(missing_docs)]

 use std::ops::Deref;
+use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{RwLock, RwLockWriteGuard};
-
-use tokio::sync::watch;
+use std::sync::{Mutex, RwLock, RwLockWriteGuard};

 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -71,21 +68,22 @@ struct RcuCell<V> {
    value: V,

    /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
+    /// that when the RcuCell is dropped, any cloned Senders will be notified
    /// that the channel is closed. Updaters can use this to wait out until the
    /// RcuCell has been dropped, i.e. until the old value is no longer in use.
    ///
-    /// We never send anything to this, we just need to hold onto it so that the
-    /// Receivers will be notified when it's dropped.
-    watch: watch::Sender<()>,
+    /// We never do anything with the receiver, we just need to hold onto it so
+    /// that the Senders will be notified when it's dropped. But because it's
+    /// not Sync, we need a Mutex on it.
+    watch: (SyncSender<()>, Mutex<Receiver<()>>),
 }

 impl<V> RcuCell<V> {
    fn new(value: V) -> Self {
-        let (watch_sender, _) = watch::channel(());
+        let (watch_sender, watch_receiver) = sync_channel(0);
        RcuCell {
            value,
-            watch: watch_sender,
+            watch: (watch_sender, Mutex::new(watch_receiver)),
        }
    }
 }
@@ -143,10 +141,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
-/// held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
+/// it should only be held for a short duration!
 ///
-/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
+/// Calling `store_and_unlock` consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -181,7 +179,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
            // the watches for any that do.
            self.inner.old_cells.retain(|weak| {
                if let Some(cell) = weak.upgrade() {
-                    watches.push(cell.watch.subscribe());
+                    watches.push(cell.watch.0.clone());
                    true
                } else {
                    false
@@ -195,20 +193,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<watch::Receiver<()>>);
+pub struct RcuWaitList(Vec<SyncSender<()>>);

 impl RcuWaitList {
    ///
    /// Wait for old readers to finish.
    ///
-    pub async fn wait(mut self) {
+    pub fn wait(mut self) {
        // after all the old_cells are no longer in use, we're done
        for w in self.0.iter_mut() {
            // This will block until the Receiver is closed. That happens when
            // the RcuCell is dropped.
            #[allow(clippy::single_match)]
-            match w.changed().await {
-                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
+            match w.send(()) {
+                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
                Err(_) => {
                    // closed, which means that the cell has been dropped, and
                    // its value is no longer in use
@@ -222,10 +220,11 @@ impl RcuWaitList {
 mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};
+    use std::thread::{sleep, spawn};
    use std::time::Duration;

-    #[tokio::test]
-    async fn two_writers() {
+    #[test]
+    fn two_writers() {
        let rcu = Rcu::new(1);

        let read1 = rcu.read();
@@ -249,35 +248,33 @@ mod tests {
        assert_eq!(*read1, 1);

        let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate tasks.
+        // Wait for the old readers to finish in separate threads.
        let log_clone = Arc::clone(&log);
-        let task2 = tokio::spawn(async move {
-            wait2.wait().await;
+        let thread2 = spawn(move || {
+            wait2.wait();
            log_clone.lock().unwrap().push("wait2 done");
        });
        let log_clone = Arc::clone(&log);
-        let task3 = tokio::spawn(async move {
-            wait3.wait().await;
+        let thread3 = spawn(move || {
+            wait3.wait();
            log_clone.lock().unwrap().push("wait3 done");
        });

        // without this sleep the test can pass on accident if the writer is slow
-        tokio::time::sleep(Duration::from_millis(100)).await;
+        sleep(Duration::from_millis(500));

        // Release first reader. This allows first write to finish, but calling
-        // wait() on the 'task3' would still block.
+        // wait() on the second one would still block.
        log.lock().unwrap().push("dropping read1");
        drop(read1);
-        task2.await.unwrap();
+        thread2.join().unwrap();

-        assert!(!task3.is_finished());
-
-        tokio::time::sleep(Duration::from_millis(100)).await;
+        sleep(Duration::from_millis(500));

        // Release second reader, and finish second writer.
        log.lock().unwrap().push("dropping read2");
        drop(read2);
-        task3.await.unwrap();
+        thread3.join().unwrap();

        assert_eq!(
            log.lock().unwrap().as_slice(),
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -30,32 +30,18 @@ async fn warn_if_stuck<Fut: std::future::Future>(

    let mut fut = std::pin::pin!(fut);

-    let mut warned = false;
-    let ret = loop {
+    loop {
        match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => break ret,
+            Ok(ret) => return ret,
            Err(_) => {
                tracing::warn!(
                    gate = name,
                    elapsed_ms = started.elapsed().as_millis(),
                    "still waiting, taking longer than expected..."
                );
-                warned = true;
            }
        }
-    };
-
-    // If we emitted a warning for slowness, also emit a message when we complete, so that
-    // someone debugging a shutdown can know for sure whether we have moved past this operation.
-    if warned {
-        tracing::info!(
-            gate = name,
-            elapsed_ms = started.elapsed().as_millis(),
-            "completed, after taking longer than expected"
-        )
    }
-
-    ret
 }

 #[derive(Debug)]
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -436,9 +436,9 @@ mod tests {
                event_mask: 0,
            }),
            expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
        // walproposer will panic when it finishes sync_safekeepers
        std::panic::catch_unwind(|| wp.start()).unwrap_err();
        // validate the resulting LSN
-        assert_eq!(receiver.try_recv(), Ok(1337));
+        assert_eq!(receiver.recv()?, 1337);
        Ok(())
        // drop() will free up resources here
    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,7 +36,6 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
-md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::{secondary, TenantSharedResources};
+use pageserver::tenant::TenantSharedResources;
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -402,17 +402,16 @@ fn start_pageserver(
    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
-        warmup_limit: Arc::new(tokio::sync::Semaphore::new(
-            conf.concurrent_tenant_size_logical_size_queries
-                .initial_permits()
-                .get(),
-        )),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
@@ -430,6 +429,7 @@ fn start_pageserver(
    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
+        let init_done_rx = init_done_rx;
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -464,7 +464,7 @@ fn start_pageserver(
            });

            let WaitForPhaseResult {
-                timeout_remaining: _timeout,
+                timeout_remaining: timeout,
                skipped: init_load_skipped,
            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -472,6 +472,26 @@ fn start_pageserver(

            scopeguard::ScopeGuard::into_inner(guard);

+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });
+
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
            // allow background jobs to start: we either completed prior stages, or they reached timeout
            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
            // because things like consumption metrics for billing are blocked by this barrier.
@@ -494,6 +514,9 @@ fn start_pageserver(
            if let Some(f) = init_load_skipped {
                f.await;
            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
            scopeguard::ScopeGuard::into_inner(guard);

            startup_checkpoint(started_startup_at, "complete", "Startup complete");
@@ -509,17 +532,6 @@ fn start_pageserver(
        }
    });

-    let secondary_controller = if let Some(remote_storage) = &remote_storage {
-        secondary::spawn_tasks(
-            tenant_manager.clone(),
-            remote_storage.clone(),
-            background_jobs_barrier.clone(),
-            shutdown_pageserver.clone(),
-        )
-    } else {
-        secondary::null_controller()
-    };
-
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -549,7 +561,6 @@ fn start_pageserver(
                broker_client.clone(),
                disk_usage_eviction_state,
                deletion_queue.new_client(),
-                secondary_controller,
            )
            .context("Failed to initialize router state")?,
        );
@@ -576,6 +587,7 @@ fn start_pageserver(
    }

    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+        let background_jobs_barrier = background_jobs_barrier;
        let metrics_ctx = RequestContext::todo_child(
            TaskKind::MetricsCollection,
            // This task itself shouldn't download anything.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -70,8 +70,6 @@ pub mod defaults {
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

-    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-
    ///
    /// Default built-in configuration file.
    ///
@@ -119,8 +117,6 @@ pub mod defaults {
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false

-#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-
 [remote_storage]

 "#
@@ -219,10 +215,6 @@ pub struct PageServerConf {
    /// If true, pageserver will make best-effort to operate without a control plane: only
    /// for use in major incidents.
    pub control_plane_emergency_mode: bool,
-
-    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
-    /// heatmap uploads vs. other remote storage operations.
-    pub heatmap_upload_concurrency: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -301,8 +293,6 @@ struct PageServerConfigBuilder {
    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
    control_plane_emergency_mode: BuilderValue<bool>,
-
-    heatmap_upload_concurrency: BuilderValue<usize>,
 }

 impl Default for PageServerConfigBuilder {
@@ -371,8 +361,6 @@ impl Default for PageServerConfigBuilder {
            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
            control_plane_emergency_mode: Set(false),
-
-            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
        }
    }
 }
@@ -513,10 +501,6 @@ impl PageServerConfigBuilder {
        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
    }

-    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
-        self.heatmap_upload_concurrency = BuilderValue::Set(value)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -611,10 +595,6 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
-
-            heatmap_upload_concurrency: self
-                .heatmap_upload_concurrency
-                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
        })
    }
 }
@@ -848,9 +828,7 @@ impl PageServerConf {
                },
                "control_plane_emergency_mode" => {
                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-                },
-                "heatmap_upload_concurrency" => {
-                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
+
                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -918,7 +896,6 @@ impl PageServerConf {
            control_plane_api: None,
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
-            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
        }
    }
 }
@@ -1143,8 +1120,7 @@ background_task_maximum_delay = '334 s'
                )?,
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                control_plane_emergency_mode: false
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1201,8 +1177,7 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                control_plane_emergency_mode: false
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -256,6 +256,8 @@ async fn calculate_synthetic_size_worker(
        info!("calculate_synthetic_size_worker stopped");
    };

+    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
    loop {
        let started_at = Instant::now();

@@ -267,25 +269,26 @@ async fn calculate_synthetic_size_worker(
            }
        };

-        for (tenant_shard_id, tenant_state) in tenants {
+        for (tenant_id, tenant_state) in tenants {
            if tenant_state != TenantState::Active {
                continue;
            }

-            if !tenant_shard_id.is_zero() {
-                // We only send consumption metrics from shard 0, so don't waste time calculating
-                // synthetic size on other shards.
-                continue;
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+                // We can put in some prioritization for consumption metrics.
+                // Same for the loop that fetches computed metrics.
+                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+                // which turns out is really handy to understand the system.
+                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
+                    if let Some(PageReconstructError::Cancelled) =
+                        e.downcast_ref::<PageReconstructError>()
+                    {
+                        return Ok(());
+                    }
+                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
+                }
            }
-
-            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
-                continue;
-            };
-
-            // there is never any reason to exit calculate_synthetic_size_worker following any
-            // return value -- we don't need to care about shutdown because no tenant is found when
-            // pageserver is shut down.
-            calculate_and_log(&tenant, cancel, ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
@@ -296,7 +299,7 @@ async fn calculate_synthetic_size_worker(

        let res = tokio::time::timeout_at(
            started_at + synthetic_size_calculation_interval,
-            cancel.cancelled(),
+            task_mgr::shutdown_token().cancelled(),
        )
        .await;
        if res.is_ok() {
@@ -304,31 +307,3 @@ async fn calculate_synthetic_size_worker(
        }
    }
 }
-
-async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
-    const CAUSE: LogicalSizeCalculationCause =
-        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
-    // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-    // We can put in some prioritization for consumption metrics.
-    // Same for the loop that fetches computed metrics.
-    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-    // which turns out is really handy to understand the system.
-    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
-        return;
-    };
-
-    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate. we do not need any checks
-    // in this function because `mgr::get_tenant` will error out after shutdown has
-    // progressed to shutting down tenants.
-    let shutting_down = matches!(
-        e.downcast_ref::<PageReconstructError>(),
-        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
-    );
-
-    if !shutting_down {
-        let tenant_shard_id = tenant.tenant_shard_id();
-        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
-    }
-}
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -197,12 +197,12 @@ pub(super) async fn collect_all_metrics(
    };

    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-        if state != TenantState::Active || !id.is_zero() {
+        if state != TenantState::Active {
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
                .ok()
-                .map(|tenant| (id.tenant_id, tenant))
+                .map(|tenant| (id, tenant))
        }
    });

@@ -351,12 +351,7 @@ impl TimelineSnapshot {

            let current_exact_logical_size = {
                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-                let size = span.in_scope(|| {
-                    t.get_current_logical_size(
-                        crate::tenant::timeline::GetLogicalSizePriority::Background,
-                        ctx,
-                    )
-                });
+                let size = span.in_scope(|| t.get_current_logical_size(ctx));
                match size {
                    // Only send timeline logical size when it is fully calculated.
                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -312,18 +312,7 @@ impl ListWriter {
                for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
                    if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                        if attached_gen.previous() == tenant_list.generation {
-                            info!(
-                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
-                                shard_id=%tenant_shard_id.shard_slug(),
-                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
-                                "Updating gen on recovered list");
                            tenant_list.generation = *attached_gen;
-                        } else {
-                            info!(
-                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
-                                shard_id=%tenant_shard_id.shard_slug(),
-                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
-                                "Encountered stale generation on recovered list");
                        }
                    }
                }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -42,6 +42,7 @@
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

 use std::{
+    collections::HashMap,
    sync::Arc,
    time::{Duration, SystemTime},
 };
@@ -124,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -148,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -186,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -274,9 +268,8 @@ struct LayerCount {
    count: usize,
 }

-pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    _storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -328,16 +321,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Walk through the list of candidates, until we have accumulated enough layers to get
    // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
    // how much disk space would be used after evicting all the layers up to the current
-    // point in the list.
+    // point in the list. The layers are collected in 'batched', grouped per timeline.
    //
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
-    let mut evicted_amount = 0;
-
-    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+    let mut max_batch_size = 0;
+    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
                no_candidates_evicted = i,
@@ -346,13 +339,25 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            break;
        }

-        if partition == &MinResidentSizePartition::Below && warned.is_none() {
+        if partition == MinResidentSizePartition::Below && warned.is_none() {
            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
            warned = Some(usage_planned);
        }

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
-        evicted_amount += 1;
+
+        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
+        // tasks to evict all seen layers until we have evicted enough
+
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -367,79 +372,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2: evict layers
+    // phase2: evict victims batched by timeline

    let mut js = tokio::task::JoinSet::new();
-    let limit = 1000;

-    let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
-    let mut consumed_all = false;
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));

-    // After the evictions, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    for (timeline, batch) in batched {
+        let tenant_shard_id = timeline.tenant_shard_id;
+        let timeline_id = timeline.timeline_id;
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");

-    let evict_layers = async move {
-        loop {
-            let next = if js.len() >= limit || consumed_all {
-                js.join_next().await
-            } else if !js.is_empty() {
-                // opportunistically consume ready result, one per each new evicted
-                futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
-            } else {
-                None
-            };
+        // NOTE(review): `available_permits` counts permits that are currently free, not the
+        // semaphore's total, so this can fail once earlier batches hold permits; confirm
+        assert!(batch_size as usize <= limit.available_permits());

-            if let Some(next) = next {
-                match next {
-                    Ok(Ok(file_size)) => {
-                        usage_assumed.add_available_bytes(file_size);
+        debug!(%timeline_id, "evicting batch for timeline");
+
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();
+
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
+                            }
+                        }
                    }
-                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
-                        evictions_failed.file_sizes += file_size;
-                        evictions_failed.count += 1;
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
                    }
-                    Err(je) if je.is_cancelled() => unreachable!("not used"),
-                    Err(je) if je.is_panic() => { /* already logged */ }
-                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
                }
+                (evicted_bytes, evictions_failed)
            }
-
-            if consumed_all && js.is_empty() {
-                break;
-            }
-
-            // calling again when consumed_all is fine as evicted is fused.
-            let Some((_partition, candidate)) = evicted.next() else {
-                consumed_all = true;
-                continue;
-            };
-
-            js.spawn(async move {
-                let rtc = candidate.timeline.remote_client.as_ref().expect(
-                    "holding the witness, all timelines must have a remote timeline client",
-                );
-                let file_size = candidate.layer.layer_desc().file_size;
-                candidate
-                    .layer
-                    .evict_and_wait(rtc)
-                    .await
-                    .map(|()| file_size)
-                    .map_err(|e| (file_size, e))
-            });
-
-            tokio::task::yield_now().await;
        }
+        .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));

+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give the already
+        // spawned tasks a chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
        (usage_assumed, evictions_failed)
    };

    let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = evict_layers => { tuple },
+        tuple = join_all => { tuple },
        _ = cancel.cancelled() => {
-            // dropping joinset will abort all pending evict_and_waits and that is fine, our
-            // requests will still stand
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
    };
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -84,6 +84,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    get:
      description: Get tenant status
      responses:
@@ -180,6 +181,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    get:
      description: Get timelines for tenant
      responses:
@@ -230,6 +232,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -335,6 +338,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -397,6 +401,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -464,6 +469,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -517,6 +523,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    post:
      description: |
        Schedules attach operation to happen in the background for the given tenant.
@@ -624,6 +631,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: flush_ms
        in: query
        required: false
@@ -716,6 +724,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: detach_ignored
        in: query
        required: false
@@ -775,6 +784,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    post:
      description: |
        Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -823,6 +833,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    post:
      description: |
        Schedules an operation that attempts to load a tenant from the local disk and
@@ -879,6 +890,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    get:
      description: |
        Calculate tenant's synthetic size
@@ -921,6 +933,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
      - name: inputs_only
        in: query
        required: false
@@ -990,10 +1003,11 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    post:
      description: |
-        Create a timeline. Returns new timeline id on success.
-        Recreating the same timeline will succeed if the parameters match the existing timeline.
+        Create a timeline. Returns new timeline id on success.\
+        If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
        If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
      requestBody:
        content:
@@ -1123,6 +1137,7 @@ paths:
            application/json:
              schema:
                type: string
+                format: hex
        "400":
          description: Malformed tenant create request
          content:
@@ -1219,6 +1234,7 @@ paths:
        required: true
        schema:
          type: string
+          format: hex
    get:
      description: |
        Returns tenant's config description: specific config overrides a tenant has
@@ -1324,6 +1340,7 @@ components:
          properties:
            new_tenant_id:
              type: string
+              format: hex
            generation:
              type: integer
              description: Attachment generation number.
@@ -1352,6 +1369,7 @@ components:
          properties:
            tenant_id:
              type: string
+              format: hex
    TenantLocationConfigRequest:
      type: object
      required:
@@ -1359,6 +1377,7 @@ components:
      properties:
        tenant_id:
          type: string
+          format: hex
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1405,8 +1424,6 @@ components:
          type: integer
        trace_read_requests:
          type: boolean
-        heatmap_period:
-          type: integer
    TenantConfigResponse:
      type: object
      properties:
@@ -1429,6 +1446,7 @@ components:
          format: hex
        tenant_id:
          type: string
+          format: hex
        last_record_lsn:
          type: string
          format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -38,12 +38,10 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
-use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
-use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
@@ -68,11 +66,6 @@ use utils::{
 // Imports only used for testing APIs
 use super::models::ConfigureFailpointsRequest;

-// For APIs that require an Active tenant, how long should we block waiting for that state?
-// This is not functionally necessary (clients will retry), but avoids generating a lot of
-// failed API calls while tenants are activating.
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
-
 pub struct State {
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
@@ -82,11 +75,9 @@ pub struct State {
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
-    secondary_controller: SecondaryController,
 }

 impl State {
-    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
@@ -95,7 +86,6 @@ impl State {
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
-        secondary_controller: SecondaryController,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
            .iter()
@@ -110,7 +100,6 @@ impl State {
            broker_client,
            disk_usage_eviction_state,
            deletion_queue_client,
-            secondary_controller,
        })
    }

@@ -147,6 +136,11 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::NeedsDownload(_, _) => {
+                // This shouldn't happen, because we use a RequestContext that requests to
+                // download any missing layer files on-demand.
+                ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
+            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
@@ -239,19 +233,6 @@ impl From<GetTenantError> for ApiError {
    }
 }

-impl From<GetActiveTenantError> for ApiError {
-    fn from(e: GetActiveTenantError) -> ApiError {
-        match e {
-            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
-            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
-            GetActiveTenantError::NotFound(gte) => gte.into(),
-            GetActiveTenantError::WaitForActiveTimeout { .. } => {
-                ApiError::ResourceUnavailable(format!("{}", e).into())
-            }
-        }
-    }
-}
-
 impl From<SetNewTenantConfigError> for ApiError {
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
@@ -338,7 +319,6 @@ async fn build_timeline_info_common(
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
-    let initdb_lsn = timeline.initdb_lsn;
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -358,8 +338,7 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size =
-        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
+    let current_logical_size = timeline.get_current_logical_size(ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -372,14 +351,14 @@ async fn build_timeline_info_common(
    let walreceiver_status = timeline.walreceiver_status();

    let info = TimelineInfo {
-        tenant_id: timeline.tenant_shard_id,
+        // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
+        tenant_id: timeline.tenant_shard_id.tenant_id,
        timeline_id: timeline.timeline_id,
        ancestor_timeline_id,
        ancestor_lsn,
        disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
        remote_consistent_lsn: remote_consistent_lsn_projected,
        remote_consistent_lsn_visible,
-        initdb_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -454,10 +433,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
+        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -475,7 +451,7 @@ async fn timeline_create_handler(
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
+            Err(tenant::CreateTimelineError::AlreadyExists) => {
                json_response(StatusCode::CONFLICT, ())
            }
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
@@ -503,15 +479,15 @@ async fn timeline_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -530,9 +506,7 @@ async fn timeline_list_handler(
        }
        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
    }
-    .instrument(info_span!("timeline_list",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug()))
+    .instrument(info_span!("timeline_list", %tenant_id))
    .await?;

    json_response(StatusCode::OK, response_data)
@@ -542,17 +516,17 @@ async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    // Logical size calculation needs downloading.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -569,10 +543,7 @@ async fn timeline_detail_handler(

        Ok::<_, ApiError>(timeline_info)
    }
-    .instrument(info_span!("timeline_detail",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug(),
-                %timeline_id))
+    .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
    .await?;

    json_response(StatusCode::OK, timeline_info)
@@ -582,15 +553,8 @@ async fn get_lsn_by_timestamp_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    if !tenant_shard_id.is_zero() {
-        // Requires SLRU contents, which are only stored on shard zero
-        return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
-        )));
-    }
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;

    let version: Option<u8> = parse_query_param(&request, "version")?;

@@ -602,7 +566,7 @@ async fn get_lsn_by_timestamp_handler(
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -637,15 +601,8 @@ async fn get_timestamp_of_lsn_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    if !tenant_shard_id.is_zero() {
-        // Requires SLRU contents, which are only stored on shard zero
-        return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
-        )));
-    }
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;

    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;

@@ -655,7 +612,7 @@ async fn get_timestamp_of_lsn_handler(
        .map_err(ApiError::BadRequest)?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -716,23 +673,11 @@ async fn timeline_delete_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let state = get_state(&request);

-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id, false)
-        .map_err(|e| {
-            match e {
-                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
-                // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
-                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
-                    "Requested tenant is missing".to_string().into_boxed_str(),
-                ),
-                e => e.into(),
-            }
-        })?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
+    state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
+        .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -763,26 +708,6 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_reset_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-    let state = get_state(&request);
-    state
-        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -859,11 +784,11 @@ async fn tenant_status(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = mgr::get_tenant(tenant_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -873,15 +798,13 @@ async fn tenant_status(

        let state = tenant.current_state();
        Result::<_, ApiError>::Ok(TenantInfo {
-            id: tenant_shard_id,
+            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
            attachment_status: state.attachment_status(),
        })
    }
-    .instrument(info_span!("tenant_status_handler",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug()))
+    .instrument(info_span!("tenant_status_handler", %tenant_id))
    .await?;

    json_response(StatusCode::OK, tenant_info)
@@ -900,7 +823,7 @@ async fn tenant_delete_handler(
    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
-            shard = %tenant_shard_id.shard_slug()
+            shard = tenant_shard_id.shard_slug()
        ))
        .await?;

@@ -924,20 +847,14 @@ async fn tenant_size_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
-
-    if !tenant_shard_id.is_zero() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
-        )));
-    }
+    let tenant = mgr::get_tenant(tenant_id, true)?;

    // this can be long operation
    let inputs = tenant
@@ -989,7 +906,7 @@ async fn tenant_size_handler(
    json_response(
        StatusCode::OK,
        TenantHistorySize {
-            id: tenant_shard_id.tenant_id,
+            id: tenant_id,
            size: sizes.as_ref().map(|x| x.total_size),
            segment_sizes: sizes.map(|x| x.segments),
            inputs,
@@ -1001,14 +918,14 @@ async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
        parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);

-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
@@ -1018,12 +935,13 @@ async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let downloaded = timeline
        .download_layer(layer_file_name)
        .await
@@ -1034,7 +952,7 @@ async fn layer_download_handler(
        Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
        None => json_response(
            StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
        ),
    }
 }
@@ -1043,12 +961,12 @@ async fn evict_timeline_layer_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let evicted = timeline
        .evict_layer(layer_file_name)
        .await
@@ -1059,7 +977,7 @@ async fn evict_timeline_layer_handler(
        Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
        None => json_response(
            StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
        ),
    }
 }
@@ -1170,10 +1088,7 @@ async fn tenant_create_handler(

    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
-    if let res @ Err(_) = new_tenant
-        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await
-    {
+    if let res @ Err(_) = new_tenant.wait_to_become_active().await {
        // This shouldn't happen because we just created the tenant directory
        // in tenant::mgr::create_tenant, and there aren't any remote timelines
        // to load, so, nothing can really fail during load.
@@ -1194,10 +1109,10 @@ async fn get_tenant_config_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+    let tenant = mgr::get_tenant(tenant_id, false)?;

    let response = HashMap::from([
        (
@@ -1257,7 +1172,7 @@ async fn put_tenant_location_config_handler(
            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                .instrument(info_span!("tenant_detach",
                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard = %tenant_shard_id.shard_slug()
+                    shard = tenant_shard_id.shard_slug()
                ))
                .await
        {
@@ -1291,9 +1206,9 @@ async fn handle_tenant_break(
    r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

-    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
+    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1334,15 +1249,14 @@ async fn timeline_gc_handler(
    mut request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done =
-        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -1357,9 +1271,9 @@ async fn timeline_compact_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1367,14 +1281,14 @@ async fn timeline_compact_handler(
    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
    .await
 }

@@ -1383,9 +1297,9 @@ async fn timeline_checkpoint_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1393,7 +1307,7 @@ async fn timeline_checkpoint_handler(
    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
        timeline
            .freeze_and_flush()
            .await
@@ -1405,7 +1319,7 @@ async fn timeline_checkpoint_handler(

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
    .await
 }

@@ -1413,12 +1327,12 @@ async fn timeline_download_remote_layers_handler_post(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    match timeline.spawn_download_all_remote_layers(body).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1429,11 +1343,11 @@ async fn timeline_download_remote_layers_handler_get(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
@@ -1479,9 +1393,9 @@ async fn getpage_at_lsn_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    struct Key(crate::repository::Key);

@@ -1500,7 +1414,7 @@ async fn getpage_at_lsn_handler(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;

        let page = timeline.get(key.0, lsn, &ctx).await?;

@@ -1512,7 +1426,7 @@ async fn getpage_at_lsn_handler(
                .unwrap(),
        )
    }
-    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
    .await
 }

@@ -1520,9 +1434,9 @@ async fn timeline_collect_keyspace(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

    struct Partitioning {
        keys: crate::keyspace::KeySpace,
@@ -1591,7 +1505,7 @@ async fn timeline_collect_keyspace(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -1600,15 +1514,15 @@ async fn timeline_collect_keyspace(

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
-    .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
    .await
 }

 async fn active_timeline_of_active_tenant(
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1630,7 +1544,7 @@ async fn always_panic_handler(

 async fn disk_usage_eviction_run(
    mut r: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;

@@ -1658,48 +1572,57 @@ async fn disk_usage_eviction_run(
        }
    }

-    let config = json_request::<Config>(&mut r).await?;
+    let config = json_request::<Config>(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;

    let usage = Usage {
        config,
        freed_bytes: 0,
    };

+    let (tx, rx) = tokio::sync::oneshot::channel();
+
    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.as_ref() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

-    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state, storage, usage, &cancel,
-    )
-    .await;
+    let cancel = CancellationToken::new();
+    let child_cancel = cancel.clone();
+    let _g = cancel.drop_guard();

-    info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+    crate::task_mgr::spawn(
+        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "ondemand disk usage eviction",
+        false,
+        async move {
+            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+                &state,
+                usage,
+                &child_cancel,
+            )
+            .await;

-    let res = res.map_err(ApiError::InternalServerError)?;
+            info!(?res, "disk_usage_eviction_task_iteration_impl finished");

-    json_response(StatusCode::OK, res)
-}
+            let _ = tx.send(res);
+            Ok(())
+        }
+        .in_current_span(),
+    );

-async fn secondary_upload_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .upload_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+    let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;

-    json_response(StatusCode::OK, ())
+    json_response(StatusCode::OK, response)
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1876,25 +1799,23 @@ pub fn make_router(
        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
-        .get("/v1/tenant/:tenant_shard_id", |r| {
-            api_handler(r, tenant_status)
-        })
+        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
        .delete("/v1/tenant/:tenant_shard_id", |r| {
            api_handler(r, tenant_delete_handler)
        })
-        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
+        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
-        .get("/v1/tenant/:tenant_shard_id/config", |r| {
+        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
-        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
+        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
        .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
@@ -1906,83 +1827,73 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
-        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
-            api_handler(r, tenant_reset_handler)
-        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
-        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
+            api_handler(r, timeline_gc_handler)
+        })
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
+            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
+        })
        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
-            |r| api_handler(r, timeline_gc_handler),
-        )
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
-        )
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
        )
        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_post),
        )
        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
+            api_handler(r, layer_map_info_handler)
+        })
        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
-            |r| api_handler(r, layer_map_info_handler),
-        )
-        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, layer_download_handler),
        )
        .delete(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
-            api_handler(r, secondary_upload_handler)
-        })
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .put("/v1/tenant/:tenant_shard_id/break", |r| {
+        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
        .get("/v1/panic", |r| api_handler(r, always_panic_handler))
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
+            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
+        })
        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
-            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
-        )
-        .get(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
        )
        .any(handler_404))
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,8 +2,9 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
+use std::pin::Pin;
+use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
 use async_compression::tokio::bufread::ZstdDecoder;
@@ -12,8 +13,7 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::fs::{File, OpenOptions};
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -629,16 +629,70 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    Ok(Bytes::from(buf))
 }

-pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
-    let file = OpenOptions::new()
-        .create(true)
-        .truncate(true)
-        .read(true)
-        .write(true)
-        .open(&tmp_path)
-        .await
-        .with_context(|| format!("tempfile creation {tmp_path}"))?;
+/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
+///
+/// The number of yields is bounded from above by the number of times poll_write is called,
+/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
+/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
+/// breathing room between units of CPU intensive preparation of buffers to be written.
+/// Once a write call is issued, the whole buffer has been prepared already, so there is no
+/// gain in splitting up the memcopy further.
+struct YieldingVec {
+    yield_budget: usize,
+    // the buffer written into
+    buf: Vec<u8>,
+}

+impl YieldingVec {
+    fn new() -> Self {
+        Self {
+            yield_budget: 0,
+            buf: Vec::new(),
+        }
+    }
+    // Whether we should yield for a write operation of the given size
+    fn should_yield(&mut self, add_buf_len: usize) -> bool {
+        // Set this limit to a small value so that we are a
+        // good async citizen and yield repeatedly (but not
+        // so often that many small writes cause many yields)
+        const YIELD_DIST: usize = 1024;
+
+        let target_buf_len = self.buf.len() + add_buf_len;
+        let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
+        if self.yield_budget < target_buf_len {
+            self.yield_budget += add_buf_len;
+        }
+        ret
+    }
+}
+
+impl AsyncWrite for YieldingVec {
+    fn poll_write(
+        mut self: Pin<&mut Self>,
+        cx: &mut task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<std::io::Result<usize>> {
+        if self.should_yield(buf.len()) {
+            cx.waker().wake_by_ref();
+            return Poll::Pending;
+        }
+        self.get_mut().buf.extend_from_slice(buf);
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
+        Poll::Ready(Ok(()))
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        _cx: &mut task::Context<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        Poll::Ready(Ok(()))
+    }
+}
+
+pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    let mut paths = Vec::new();
    for entry in WalkDir::new(pgdata_path) {
        let entry = entry?;
@@ -653,7 +707,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Resu
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
-        file,
+        YieldingVec::new(),
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
@@ -671,14 +725,13 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Resu
    }
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
-    let mut compressed = zstd.into_inner();
-    let compressed_len = compressed.metadata().await?.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
+    let compressed = zstd.into_inner();
+    let compressed_len = compressed.buf.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
    }
-    compressed.seek(SeekFrom::Start(0)).await?;
-    Ok((compressed, compressed_len))
+    Ok(compressed.buf)
 }

 pub async fn extract_tar_zst(
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -27,8 +27,6 @@ pub mod walredo;

 pub mod failpoint_support;

-use std::sync::Arc;
-
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
@@ -188,15 +186,17 @@ pub struct InitializationOrder {
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
+
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
    pub background_jobs_can_start: utils::completion::Barrier,
-
-    /// Concurrency limit for attaching tenants during startup.  This limit does not
-    /// apply to tenants that a client tries to access: those proceed to attach as fast
-    /// as they can.
-    pub warmup_limit: Arc<tokio::sync::Semaphore>,
 }

 /// Time the future with a warning when it exceeds a threshold.
@@ -212,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -220,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -229,7 +229,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2,10 +2,9 @@ use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
-    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
-    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
+    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
+    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
@@ -286,63 +285,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-pub(crate) mod page_cache_eviction_metrics {
-    use std::num::NonZeroUsize;
-
-    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
-    use once_cell::sync::Lazy;
-
-    #[derive(Clone, Copy)]
-    pub(crate) enum Outcome {
-        FoundSlotUnused { iters: NonZeroUsize },
-        FoundSlotEvicted { iters: NonZeroUsize },
-        ItersExceeded { iters: NonZeroUsize },
-    }
-
-    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_page_cache_find_victim_iters_total",
-            "Counter for the number of iterations in the find_victim loop",
-            &["outcome"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_page_cache_find_victim_calls",
-            "Incremented at the end of each find_victim() call.\
-             Filter by outcome to get e.g., eviction rate.",
-            &["outcome"]
-        )
-        .unwrap()
-    });
-
-    pub(crate) fn observe(outcome: Outcome) {
-        macro_rules! dry {
-            ($label:literal, $iters:expr) => {{
-                static LABEL: &'static str = $label;
-                static ITERS_TOTAL: Lazy<IntCounter> =
-                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
-                static CALLS: Lazy<IntCounter> =
-                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
-                ITERS_TOTAL.inc_by(($iters.get()) as u64);
-                CALLS.inc();
-            }};
-        }
-        match outcome {
-            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
-            Outcome::FoundSlotEvicted { iters } => {
-                dry!("found_evicted", iters)
-            }
-            Outcome::ItersExceeded { iters } => {
-                dry!("err_iters_exceeded", iters);
-                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
-            }
-        }
-    }
-}
-
 pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -352,6 +294,14 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
    .expect("failed to define a metric")
 });

+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_cache_find_victim_iters_total",
+        "Counter for the number of iterations in the find_victim loop",
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "page_cache_errors_total",
@@ -457,14 +407,16 @@ pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;

+    use crate::task_mgr::TaskKind;
+
    pub(crate) struct StartCalculation(IntCounterVec);
    pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
        StartCalculation(
            register_int_counter_vec!(
                "pageserver_initial_logical_size_start_calculation",
                "Incremented each time we start an initial logical size calculation attempt. \
-                 The `circumstances` label provides some additional details.",
-                &["attempt", "circumstances"]
+                 The `task_kind` label is for the task kind that caused this attempt.",
+                &["attempt", "task_kind"]
            )
            .unwrap(),
        )
@@ -512,24 +464,19 @@ pub(crate) mod initial_logical_size {
        inc_drop_calculation: Option<IntCounter>,
    }

-    #[derive(strum_macros::IntoStaticStr)]
-    pub(crate) enum StartCircumstances {
-        EmptyInitial,
-        SkippedConcurrencyLimiter,
-        AfterBackgroundTasksRateLimit,
-    }
-
    impl StartCalculation {
-        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["first", circumstances_label]);
+        pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["first", task_kind_label]).inc();
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
            }
        }
-        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["retry", circumstances_label]);
+        pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["retry", task_kind_label]).inc();
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
            }
@@ -651,7 +598,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
        "pageserver_evictions_with_low_residence_duration",
        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
         Residence duration is determined using the `residence_duration_data_source`.",
-        &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
    )
    .expect("failed to define a metric")
 });
@@ -684,54 +631,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("Failed to register pageserver_startup_is_loading")
 });

-/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
-/// like how long it took to load.
-///
-/// Note that these are process-global metrics, _not_ per-tenant metrics.  Per-tenant
-/// metrics are rather expensive, and usually fine grained stuff makes more sense
-/// at a timeline level than tenant level.
-pub(crate) struct TenantMetrics {
-    /// How long did tenants take to go from construction to active state?
-    pub(crate) activation: Histogram,
-    pub(crate) preload: Histogram,
-    pub(crate) attach: Histogram,
-
-    /// How many tenants are included in the initial startup of the pagesrever?
-    pub(crate) startup_scheduled: IntCounter,
-    pub(crate) startup_complete: IntCounter,
-}
-
-pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
-    TenantMetrics {
-    activation: register_histogram!(
+/// How long did tenants take to go from construction to active state?
+pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_tenant_activation_seconds",
        "Time taken by tenants to activate, in seconds",
        CRITICAL_OP_BUCKETS.into()
    )
-    .expect("Failed to register metric"),
-    preload: register_histogram!(
-        "pageserver_tenant_preload_seconds",
-        "Time taken by tenants to load remote metadata on startup/attach, in seconds",
-        CRITICAL_OP_BUCKETS.into()
-    )
-    .expect("Failed to register metric"),
-    attach: register_histogram!(
-        "pageserver_tenant_attach_seconds",
-        "Time taken by tenants to intialize, after remote metadata is already loaded",
-        CRITICAL_OP_BUCKETS.into()
-    )
-    .expect("Failed to register metric"),
-    startup_scheduled: register_int_counter!(
-        "pageserver_tenant_startup_scheduled",
-        "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
-    ).expect("Failed to register metric"),
-    startup_complete: register_int_counter!(
-        "pageserver_tenant_startup_complete",
-        "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
-         should eventually reach `pageserver_tenant_startup_scheduled_total`.  Does not include broken \
-         tenants: such cases will lead to this metric never reaching the scheduled count."
-    ).expect("Failed to register metric"),
-}
+    .expect("Failed to register pageserver_tenant_activation_seconds metric")
 });

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -755,16 +662,10 @@ impl EvictionsWithLowResidenceDurationBuilder {
        }
    }

-    fn build(
-        &self,
-        tenant_id: &str,
-        shard_id: &str,
-        timeline_id: &str,
-    ) -> EvictionsWithLowResidenceDuration {
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
            .get_metric_with_label_values(&[
                tenant_id,
-                shard_id,
                timeline_id,
                self.data_source,
                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -795,24 +696,21 @@ impl EvictionsWithLowResidenceDuration {
    pub fn change_threshold(
        &mut self,
        tenant_id: &str,
-        shard_id: &str,
        timeline_id: &str,
        new_threshold: Duration,
    ) {
        if new_threshold == self.threshold {
            return;
        }
-        let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
-            self.data_source,
-            new_threshold,
-        )
-        .build(tenant_id, shard_id, timeline_id);
+        let mut with_new =
+            EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
+                .build(tenant_id, timeline_id);
        std::mem::swap(self, &mut with_new);
-        with_new.remove(tenant_id, shard_id, timeline_id);
+        with_new.remove(tenant_id, timeline_id);
    }

    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
-    fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
        let Some(_counter) = self.counter.take() else {
            return;
        };
@@ -821,7 +719,6 @@ impl EvictionsWithLowResidenceDuration {

        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
            tenant_id,
-            shard_id,
            timeline_id,
            self.data_source,
            &threshold,
@@ -874,7 +771,6 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 )]
 pub(crate) enum StorageIoOperation {
    Open,
-    OpenAfterReplace,
    Close,
    CloseByReplace,
    Read,
@@ -888,7 +784,6 @@ impl StorageIoOperation {
    pub fn as_str(&self) -> &'static str {
        match self {
            StorageIoOperation::Open => "open",
-            StorageIoOperation::OpenAfterReplace => "open-after-replace",
            StorageIoOperation::Close => "close",
            StorageIoOperation::CloseByReplace => "close-by-replace",
            StorageIoOperation::Read => "read",
@@ -943,25 +838,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) mod virtual_file_descriptor_cache {
-    use super::*;
-
-    pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
-        register_uint_gauge!(
-            "pageserver_virtual_file_descriptor_cache_size_max",
-            "Maximum number of open file descriptors in the cache."
-        )
-        .unwrap()
-    });
-
-    // SIZE_CURRENT: derive it like so:
-    // ```
-    // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
-    // -ignoring(operation)
-    // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
-    // ```
-}
-
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
    global: Histogram,
@@ -1288,52 +1164,6 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 }
 });

-pub(crate) struct WalIngestMetrics {
-    pub(crate) records_received: IntCounter,
-    pub(crate) records_committed: IntCounter,
-    pub(crate) records_filtered: IntCounter,
-}
-
-pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
-    records_received: register_int_counter!(
-        "pageserver_wal_ingest_records_received",
-        "Number of WAL records received from safekeepers"
-    )
-    .expect("failed to define a metric"),
-    records_committed: register_int_counter!(
-        "pageserver_wal_ingest_records_committed",
-        "Number of WAL records which resulted in writes to pageserver storage"
-    )
-    .expect("failed to define a metric"),
-    records_filtered: register_int_counter!(
-        "pageserver_wal_ingest_records_filtered",
-        "Number of WAL records filtered out due to sharding"
-    )
-    .expect("failed to define a metric"),
-});
-pub(crate) struct SecondaryModeMetrics {
-    pub(crate) upload_heatmap: IntCounter,
-    pub(crate) upload_heatmap_errors: IntCounter,
-    pub(crate) upload_heatmap_duration: Histogram,
-}
-pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
-    upload_heatmap: register_int_counter!(
-        "pageserver_secondary_upload_heatmap",
-        "Number of heatmaps written to remote storage by attached tenants"
-    )
-    .expect("failed to define a metric"),
-    upload_heatmap_errors: register_int_counter!(
-        "pageserver_secondary_upload_heatmap_errors",
-        "Failures writing heatmap to remote storage"
-    )
-    .expect("failed to define a metric"),
-    upload_heatmap_duration: register_histogram!(
-        "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the S3 client"
-    )
-    .expect("failed to define a metric"),
-});
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1384,16 +1214,25 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_background_loop_semaphore_wait_start_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls started",
-        "pageserver_background_loop_semaphore_wait_finish_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-        &["task"],
-    )
-    .unwrap()
-});
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            &["task"],
+        )
+        .unwrap()
+    });
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap()
+    });

 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
@@ -1546,8 +1385,6 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
-    pub(crate) active_stderr_logger_tasks_started: IntCounter,
-    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }

 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1571,19 +1408,6 @@ impl Default for WalRedoProcessCounters {
            &["cause"],
        )
        .unwrap();
-
-        let active_stderr_logger_tasks_started = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_started_total",
-            "Number of active walredo stderr logger tasks that have started",
-        )
-        .unwrap();
-
-        let active_stderr_logger_tasks_finished = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_finished_total",
-            "Number of active walredo stderr logger tasks that have finished",
-        )
-        .unwrap();
-
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1591,8 +1415,6 @@ impl Default for WalRedoProcessCounters {
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
-            active_stderr_logger_tasks_started,
-            active_stderr_logger_tasks_finished,
        }
    }
 }
@@ -1667,7 +1489,6 @@ impl StorageTimeMetrics {
 #[derive(Debug)]
 pub struct TimelineMetrics {
    tenant_id: String,
-    shard_id: String,
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
@@ -1688,12 +1509,11 @@ pub struct TimelineMetrics {

 impl TimelineMetrics {
    pub fn new(
-        tenant_shard_id: &TenantShardId,
+        tenant_id: &TenantId,
        timeline_id: &TimelineId,
        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
    ) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1730,12 +1550,11 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
-            .build(&tenant_id, &shard_id, &timeline_id);
+        let evictions_with_low_residence_duration =
+            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
-            shard_id,
            timeline_id,
            flush_time_histo,
            compact_time_histo,
@@ -1781,7 +1600,6 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
-        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1795,7 +1613,7 @@ impl Drop for TimelineMetrics {
        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
-            .remove(tenant_id, shard_id, timeline_id);
+            .remove(tenant_id, timeline_id);

        // The following metrics are born outside of the TimelineMetrics lifecycle but still
        // removed at the end of it. The idea is to have the metrics outlive the
@@ -2253,14 +2071,9 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

-    // Tenant stats
-    Lazy::force(&TENANT);
-
    // Tenant manager stats
    Lazy::force(&TENANT_MANAGER);

-    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
-
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -28,7 +28,7 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is  [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! The cache key for **materialized pages** is  [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
 //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
 //!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,15 +83,13 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use pageserver_api::shard::TenantShardId;
-use utils::{id::TimelineId, lsn::Lsn};
-
-use crate::{
-    context::RequestContext,
-    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
-    repository::Key,
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
 };

+use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
+
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;

@@ -152,13 +150,7 @@ enum CacheKey {

 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MaterializedPageHashKey {
-    /// Why is this TenantShardId rather than TenantId?
-    ///
-    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
-    /// this not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
-    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
-    /// special-cased in some other way.
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    timeline_id: TimelineId,
    key: Key,
 }
@@ -382,7 +374,7 @@ impl PageCache {
    /// returned page.
    pub async fn lookup_materialized_page(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: &Key,
        lsn: Lsn,
@@ -399,7 +391,7 @@ impl PageCache {

        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
-                tenant_shard_id,
+                tenant_id,
                timeline_id,
                key: *key,
            },
@@ -440,7 +432,7 @@ impl PageCache {
    ///
    pub async fn memorize_materialized_page(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
        lsn: Lsn,
@@ -448,7 +440,7 @@ impl PageCache {
    ) -> anyhow::Result<()> {
        let cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
-                tenant_shard_id,
+                tenant_id,
                timeline_id,
                key,
            },
@@ -905,10 +897,8 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            page_cache_eviction_metrics::observe(
-                                page_cache_eviction_metrics::Outcome::ItersExceeded {
-                                    iters: iters.try_into().unwrap(),
-                                },
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
@@ -919,18 +909,8 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
-                    page_cache_eviction_metrics::observe(
-                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
-                            iters: iters.try_into().unwrap(),
-                        },
-                    );
-                } else {
-                    page_cache_eviction_metrics::observe(
-                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
-                            iters: iters.try_into().unwrap(),
-                        },
-                    );
                }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,23 +53,21 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);

 /// Read the end of a tar archive.
 ///
@@ -401,19 +399,16 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Note that since one connection may contain getpage requests that target different
-        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
-        // that we look up here may not be the one that serves all the actual requests: we will double
-        // check the mapping of key->shard later before calling into Timeline for getpage requests.
+        // TODO(sharding): enumerate local tenant shards for this tenant, and select the one
+        // that should serve this request.
+
+        // Look up the tenant, then make a request tracer if the tenant asks for one
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::First,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
        .await?;
-
-        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path =
@@ -571,7 +566,6 @@ impl PageServerHandler {
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -634,7 +628,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
@@ -813,49 +807,9 @@ impl PageServerHandler {
        }
        */

-        let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        } else {
-            // The Tenant shard we looked up at connection start does not hold this particular
-            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
-            // has multiple shards for the same tenant.
-            //
-            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
-            let timeline = match self
-                .get_active_tenant_timeline(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
-                    ShardSelector::Page(key),
-                )
-                .await
-            {
-                Ok(t) => t,
-                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                    // We already know this tenant exists in general, because we resolved it at
-                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
-                }
-                Err(e) => return Err(e.into()),
-            };
-
-            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
-            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        };
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -884,7 +838,7 @@ impl PageServerHandler {

        // check that the timeline exists
        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -990,11 +944,9 @@ impl PageServerHandler {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            selector,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -1168,7 +1120,7 @@ where

            self.check_permission(Some(tenant_id))?;
            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .get_active_tenant_timeline(tenant_id, timeline_id)
                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1355,7 +1307,6 @@ where

            let tenant = get_active_tenant_with_timeout(
                tenant_id,
-                ShardSelector::Zero,
                ACTIVE_TENANT_TIMEOUT,
                &task_mgr::shutdown_token(),
            )
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,6 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -283,10 +282,6 @@ impl Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn list_rels(
        &self,
        spcnode: Oid,
@@ -635,10 +630,6 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
@@ -822,7 +813,10 @@ impl<'a> DatadirModification<'a> {
        self.put(DBDIR_KEY, Value::Image(buf.into()));

        // Create AuxFilesDirectory
-        self.init_aux_dir()?;
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));

        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
@@ -930,7 +924,10 @@ impl<'a> DatadirModification<'a> {
            self.put(DBDIR_KEY, Value::Image(buf.into()));

            // Create AuxFilesDirectory as well
-            self.init_aux_dir()?;
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: HashMap::new(),
+            })?;
+            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1255,14 +1252,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
-        Ok(())
-    }
-
    pub async fn put_file(
        &mut self,
        path: &str,
@@ -1325,7 +1314,7 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::new();
        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value, ctx).await?;
@@ -1370,10 +1359,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub(crate) fn is_empty(&self) -> bool {
-        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
-    }
-
    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
@@ -1585,7 +1570,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1769,13 +1754,6 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
@@ -1791,6 +1769,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

+fn is_rel_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
+
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -42,7 +42,6 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};

 use futures::FutureExt;
-use pageserver_api::shard::TenantShardId;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
@@ -52,7 +51,7 @@ use tracing::{debug, error, info, warn};

 use once_cell::sync::Lazy;

-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};

 use crate::shutdown_pageserver;

@@ -258,9 +257,6 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

-    /// See [`crate::tenant::secondary`].
-    SecondaryUploads,
-
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

@@ -321,7 +317,7 @@ struct PageServerTask {

    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_id: Option<TenantId>,
    timeline_id: Option<TimelineId>,

    mutable: Mutex<MutableTaskState>,
@@ -333,7 +329,7 @@ struct PageServerTask {
 pub fn spawn<F>(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_id: Option<TenantId>,
    timeline_id: Option<TimelineId>,
    name: &str,
    shutdown_process_on_error: bool,
@@ -349,7 +345,7 @@ where
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
-        tenant_shard_id,
+        tenant_id,
        timeline_id,
        mutable: Mutex::new(MutableTaskState { join_handle: None }),
    });
@@ -428,28 +424,28 @@ async fn task_finish(
            Ok(Err(err)) => {
                if shutdown_process_on_error {
                    error!(
-                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
-                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
+                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                }
            }
            Err(err) => {
                if shutdown_process_on_error {
                    error!(
-                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
-                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
+                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                }
            }
@@ -471,11 +467,11 @@ async fn task_finish(
 ///
 /// Or to shut down all tasks for given timeline:
 ///
-///   shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
+///   shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
 ///
 pub async fn shutdown_tasks(
    kind: Option<TaskKind>,
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_id: Option<TenantId>,
    timeline_id: Option<TimelineId>,
 ) {
    let mut victim_tasks = Vec::new();
@@ -484,35 +480,35 @@ pub async fn shutdown_tasks(
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
+                && (tenant_id.is_none() || task.tenant_id == tenant_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
-                    task.tenant_shard_id,
+                    task.tenant_id,
                    task.timeline_id,
                ));
            }
        }
    }

-    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
+    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();

-    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
+    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
-                if tenant_shard_id.is_none() {
+                if tenant_id.is_none() {
                    // there are quite few of these
                    info!(name = task.name, kind = ?task_kind, "stopping global task");
                } else {
                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -521,13 +517,12 @@ pub async fn shutdown_tasks(
            {
                // allow some time to elapse before logging to cut down the number of log
                // lines.
-                info!("waiting for task {} to shut down", task.name);
+                info!("waiting for {} to shut down", task.name);
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
                // - task errors are already logged in the wrapper
                let _ = join_handle.await;
-                info!("task {} completed", task.name);
            }
        } else {
            // Possibly one of:
@@ -561,14 +556,9 @@ pub async fn shutdown_watcher() {
 /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
 /// `tokio::task::JoinSet::spawn`.
 pub fn shutdown_token() -> CancellationToken {
-    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
-
-    if cfg!(test) {
-        // in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
-        res.unwrap_or_default()
-    } else {
-        res.expect("shutdown_token() called in an unexpected task or thread")
-    }
+    SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
+        .expect("shutdown_token() called in an unexpected task or thread")
 }

 /// Has the current task been requested to shut down?
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -334,11 +334,6 @@ pub struct TenantConf {
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
    pub gc_feedback: bool,
-
-    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
-    /// may be disabled if a Tenant will not have secondary locations: only secondary
-    /// locations will use the heatmap uploaded by attached locations.
-    pub heatmap_period: Duration,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -419,11 +414,6 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub gc_feedback: Option<bool>,
-
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(with = "humantime_serde")]
-    #[serde(default)]
-    pub heatmap_period: Option<Duration>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -492,7 +482,6 @@ impl TenantConfOpt {
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
-            heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
        }
    }
 }
@@ -530,7 +519,6 @@ impl Default for TenantConf {
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
            gc_feedback: false,
-            heatmap_period: Duration::ZERO,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -15,6 +15,7 @@ use crate::{
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    InitializationOrder,
 };

 use super::{
@@ -77,10 +78,8 @@ async fn create_remote_delete_mark(
    let data: &[u8] = &[];
    backoff::retry(
        || async {
-            let data = bytes::Bytes::from_static(data);
-            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
-                .upload(stream, 0, &remote_mark_path, None)
+                .upload(data, 0, &remote_mark_path, None)
                .await
        },
        |_e| false,
@@ -391,6 +390,7 @@ impl DeleteTenantFlow {
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -400,7 +400,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(init_order, preload, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
@@ -463,7 +466,7 @@ impl DeleteTenantFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
+            Some(tenant_shard_id.tenant_id),
            None,
            "tenant_delete",
            false,
@@ -550,7 +553,7 @@ impl DeleteTenantFlow {
                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
                let barrier = {
                    let mut locked = tenants.write().unwrap();
-                    let removed = locked.remove(tenant.tenant_shard_id);
+                    let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);

                    // FIXME: we should not be modifying this from outside of mgr.rs.
                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,8 +2,7 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::key::Key;
-use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -28,7 +27,7 @@ use crate::control_plane_client::{
    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
-use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
+use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
@@ -44,6 +43,7 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
+use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -97,76 +97,49 @@ pub(crate) enum TenantsMap {
    ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }

+/// Helper for shard-unaware callers: yields the tenant's sole slot in the shard-aware map, `None` if absent; hits `todo!()` on several.
+/// TODO(sharding): all users of this must be made shard-aware.
+fn exactly_one_or_none<'a>(
+    map: &'a BTreeMap<TenantShardId, TenantSlot>,
+    tenant_id: &TenantId,
+) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
+    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
+
+    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
+    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
+    let slot_a = slots.next();
+    let slot_b = slots.next();
+    match (slot_a, slot_b) {
+        (None, None) => None,
+        (Some(slot), None) => {
+            // Exactly one matching slot: hand back its (key, slot) pair.
+            Some(slot)
+        }
+        (Some(_slot_a), Some(_slot_b)) => {
+            // Multiple shards for this tenant: cannot handle this yet.
+            // TODO(sharding): callers of get() should be shard-aware.
+            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
+        }
+        (None, Some(_)) => unreachable!(), // first `next()` gave `None`, so a second item is not expected
+    }
+}
+
 pub(crate) enum TenantsMapRemoveResult {
    Occupied(TenantSlot),
    Vacant,
    InProgress(utils::completion::Barrier),
 }

-/// When resolving a TenantId to a shard, we may be looking for the 0th
-/// shard, or we might be looking for whichever shard holds a particular page.
-pub(crate) enum ShardSelector {
-    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
-    /// ignore it.
-    Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
-    /// Pick the shard that holds this key
-    Page(Key),
-}
-
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
-    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
-            }
-        }
-    }
-
-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
-
-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
+                // TODO(sharding): callers of get() should be shard-aware.
+                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
            }
        }
    }
@@ -175,19 +148,25 @@ impl TenantsMap {
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
    /// slot if the enclosed tenant is shutdown.
-    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
+    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
        use std::collections::btree_map::Entry;
        match self {
            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
-                Entry::Occupied(entry) => match entry.get() {
-                    TenantSlot::InProgress(barrier) => {
-                        TenantsMapRemoveResult::InProgress(barrier.clone())
-                    }
-                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
-                },
-                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
-            },
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
+                match key {
+                    Some(key) => match m.entry(key) {
+                        Entry::Occupied(entry) => match entry.get() {
+                            TenantSlot::InProgress(barrier) => {
+                                TenantsMapRemoveResult::InProgress(barrier.clone())
+                            }
+                            _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                        },
+                        Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+                    },
+                    None => TenantsMapRemoveResult::Vacant,
+                }
+            }
        }
    }

@@ -235,6 +214,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

+/// Create a directory, including parents.  This does no fsyncs and makes
+/// no guarantees about the persistence of the resulting metadata: only
+/// use it when creating directories that serve as a cache.
+async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
+    let mut dirs_to_create = Vec::new();
+    let mut path: &Utf8Path = path.as_ref();
+
+    // Figure out which directories we need to create, walking up from `path` until an existing directory is found.
+    loop {
+        let meta = tokio::fs::metadata(path).await;
+        match meta {
+            Ok(metadata) if metadata.is_dir() => break,
+            Ok(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::AlreadyExists,
+                    format!("non-directory found in path: {path}"),
+                ));
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {} // missing: falls through and is queued for creation
+            Err(e) => return Err(e),
+        }
+
+        dirs_to_create.push(path);
+
+        match path.parent() {
+            Some(parent) => path = parent,
+            None => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    format!("can't find parent of path '{path}'"),
+                ));
+            }
+        }
+    }
+
+    // Create directories from parent to child (the reverse of the order in which they were collected).
+    for &path in dirs_to_create.iter().rev() {
+        tokio::fs::create_dir(path).await?;
+    }
+
+    Ok(())
+}
+
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -429,13 +451,6 @@ pub async fn init_tenant_mgr(
    let tenant_generations =
        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

-    tracing::info!(
-        "Attaching {} tenants at startup, {} at a time",
-        tenant_configs.len(),
-        init_order.warmup_limit.available_permits()
-    );
-    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
-
    // Construct `Tenant` objects and start them running
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -500,14 +515,12 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-        let shard_identity = location_conf.shard;
        match tenant_spawn(
            conf,
            tenant_shard_id,
            &tenant_dir_path,
            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
            Some(init_order.clone()),
            &TENANTS,
            SpawnMode::Normal,
@@ -548,7 +561,6 @@ pub(crate) fn tenant_spawn(
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
-    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
@@ -575,19 +587,12 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!(
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        generation = ?location_conf.location.generation,
-        attach_mode = ?location_conf.location.attach_mode,
-        "Attaching tenant"
-    );
+    info!("Attaching tenant {tenant_shard_id}");
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
        resources,
        location_conf,
-        shard_identity,
        init_order,
        tenants,
        mode,
@@ -757,14 +762,12 @@ pub(crate) async fn create_tenant(
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;

-    let shard_identity = location_conf.shard;
    let created_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Create,
@@ -794,16 +797,14 @@ pub(crate) async fn set_new_tenant_config(
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
 ) -> Result<(), SetNewTenantConfigError> {
-    // Legacy API: does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
    info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_shard_id, true)?;
+    let tenant = get_tenant(tenant_id, true)?;

    // This is a legacy API that only operates on attached tenants: the preferred
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
    let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
        .await
@@ -813,12 +814,6 @@ pub(crate) async fn set_new_tenant_config(
 }

 impl TenantManager {
-    /// Convenience function so that anyone with a TenantManager can get at the global configuration, without
-    /// having to pass it around everywhere as a separate object.
-    pub(crate) fn get_conf(&self) -> &'static PageServerConf {
-        self.conf
-    }
-
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
    /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
    pub(crate) fn get_attached_tenant_shard(
@@ -854,7 +849,17 @@ impl TenantManager {
        }
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
+    pub(crate) async fn delete_timeline(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        _ctx: &RequestContext,
+    ) -> Result<(), DeleteTimelineError> {
+        let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
+        DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
+        Ok(())
+    }
+
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
@@ -967,7 +972,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -983,7 +988,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&timelines_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -991,14 +996,12 @@ impl TenantManager {
                    .await
                    .map_err(SetNewTenantConfigError::Persist)?;

-                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
                    self.conf,
                    tenant_shard_id,
                    &tenant_path,
                    self.resources.clone(),
                    AttachedTenantConf::try_from(new_location_config)?,
-                    shard_identity,
                    None,
                    self.tenants,
                    SpawnMode::Normal,
@@ -1013,95 +1016,6 @@ impl TenantManager {

        Ok(())
    }
-
-    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
-    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
-    /// dropped before re-attaching.
-    ///
-    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
-    /// where an issue is identified that would go away with a restart of the tenant.
-    ///
-    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
-    /// to respect the cancellation tokens used in normal shutdown().
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
-    pub(crate) async fn reset_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        drop_cache: bool,
-        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let Some(old_slot) = slot_guard.get_old_value() else {
-            anyhow::bail!("Tenant not found when trying to reset");
-        };
-
-        let Some(tenant) = old_slot.get_attached() else {
-            slot_guard.revert();
-            anyhow::bail!("Tenant is not in attached state");
-        };
-
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                anyhow::bail!("Cannot reset Tenant, already shutting down");
-            }
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        if drop_cache {
-            tracing::info!("Dropping local file cache");
-
-            match tokio::fs::read_dir(&timelines_path).await {
-                Err(e) => {
-                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
-                }
-                Ok(mut entries) => {
-                    while let Some(entry) = entries.next_entry().await? {
-                        tokio::fs::remove_dir_all(entry.path()).await?;
-                    }
-                }
-            }
-        }
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            self.tenants,
-            SpawnMode::Normal,
-            &ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(())
-    }
-
-    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
-        let locked = self.tenants.read().unwrap();
-        match &*locked {
-            TenantsMap::Initializing => Vec::new(),
-            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
-                .values()
-                .filter_map(|slot| {
-                    slot.get_attached()
-                        .and_then(|t| if t.is_active() { Some(t.clone()) } else { None })
-                })
-                .collect(),
-        }
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1126,11 +1040,14 @@ pub(crate) enum GetTenantError {
 ///
 /// This method is cancel-safe.
 pub(crate) fn get_tenant(
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
    let locked = TENANTS.read().unwrap();

+    // TODO(sharding): make all callers of get_tenant shard-aware
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

    match peek_slot {
@@ -1142,18 +1059,14 @@ pub(crate) fn get_tenant(
            TenantState::Active => Ok(Arc::clone(tenant)),
            _ => {
                if active_only {
-                    Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+                    Err(GetTenantError::NotActive(tenant_id))
                } else {
                    Ok(Arc::clone(tenant))
                }
            }
        },
-        Some(TenantSlot::InProgress(_)) => {
-            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
-        }
-        None | Some(TenantSlot::Secondary) => {
-            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
-        }
+        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
+        None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
    }
 }

@@ -1187,7 +1100,6 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
-    shard_selector: ShardSelector,
    timeout: Duration,
    cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1196,17 +1108,15 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

+    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
+    // to decide which shard services the request)
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

-    let (wait_for, tenant_shard_id) = {
+    let wait_for = {
        let locked = TENANTS.read().unwrap();
-
-        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
-
        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
@@ -1216,10 +1126,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                        // Fast path: we don't need to do any async waiting.
                        return Ok(tenant.clone());
                    }
-                    _ => {
-                        tenant.activate_now.notify_one();
-                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
-                    }
+                    _ => WaitFor::Tenant(tenant.clone()),
                }
            }
            Some(TenantSlot::Secondary) => {
@@ -1227,9 +1134,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    tenant_id,
                )))
            }
-            Some(TenantSlot::InProgress(barrier)) => {
-                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
-            }
+            Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
            None => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                    tenant_id,
@@ -1273,10 +1178,28 @@ pub(crate) async fn get_active_tenant_with_timeout(
    };

    tracing::debug!("Waiting for tenant to enter active state...");
-    tenant
-        .wait_to_become_active(deadline.duration_since(Instant::now()))
-        .await?;
-    Ok(tenant)
+    match timeout_cancellable(
+        deadline.duration_since(Instant::now()),
+        cancel,
+        tenant.wait_to_become_active(),
+    )
+    .await
+    {
+        Ok(Ok(())) => Ok(tenant),
+        Ok(Err(e)) => Err(e),
+        Err(TimeoutCancellableError::Timeout) => {
+            let latest_state = tenant.current_state();
+            if latest_state == TenantState::Active {
+                Ok(tenant)
+            } else {
+                Err(GetActiveTenantError::WaitForActiveTimeout {
+                    latest_state: Some(latest_state),
+                    wait_time: timeout,
+                })
+            }
+        }
+        Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
+    }
 }

 pub(crate) async fn delete_tenant(
@@ -1296,7 +1219,8 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let mut slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1453,14 +1377,12 @@ pub(crate) async fn load_tenant(

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    let shard_identity = location_conf.shard;
    let new_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1511,8 +1433,7 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
-{
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
    let tenants = TENANTS.read().unwrap();
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1520,10 +1441,12 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
    };
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
+            TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
            TenantSlot::Secondary => None,
            TenantSlot::InProgress(_) => None,
        })
+        // TODO(sharding): make callers of this function shard-aware
+        .map(|(k, v)| (k.tenant_id, v))
        .collect())
 }

@@ -1549,14 +1472,12 @@ pub(crate) async fn attach_tenant(
    // TODO: tenant directory remains on disk if we bail out from here on.
    //       See https://github.com/neondatabase/neon/issues/4233

-    let shard_identity = location_conf.shard;
    let attached_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_dir,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1622,10 +1543,9 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
-    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1685,9 +1605,9 @@ impl SlotGuard {
        }
    }

-    /// Get any value that was present in the slot before we acquired ownership
+    /// Take any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&self) -> &Option<TenantSlot> {
+    fn get_old_value(&mut self) -> &Option<TenantSlot> {
        &self.old_value
    }

@@ -1905,7 +1825,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
@@ -2057,20 +1977,22 @@ use {
 };

 pub(crate) async fn immediate_gc(
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
    cancel: CancellationToken,
    ctx: &RequestContext,
 ) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
    let guard = TENANTS.read().unwrap();
-
    let tenant = guard
-        .get(&tenant_shard_id)
+        .get(&tenant_id)
        .map(Arc::clone)
-        .with_context(|| format!("tenant {tenant_shard_id}"))
+        .with_context(|| format!("tenant {tenant_id}"))
        .map_err(|e| ApiError::NotFound(e.into()))?;

+    // TODO(sharding): make callers of this function shard-aware
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();
@@ -2082,9 +2004,9 @@ pub(crate) async fn immediate_gc(
    task_mgr::spawn(
        &tokio::runtime::Handle::current(),
        TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
+        Some(tenant_id),
        Some(timeline_id),
-        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
+        &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
        false,
        async move {
            fail::fail_point!("immediate_gc_task_pre");
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,7 +180,7 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

-pub(crate) mod download;
+mod download;
 pub mod index;
 mod upload;

@@ -254,9 +254,6 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";

-/// Default buffer size when interfacing with [`tokio::fs::File`].
-pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
-
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -1223,7 +1220,7 @@ impl RemoteTimelineClient {
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id),
+                Some(self.tenant_shard_id.tenant_id),
                Some(self.timeline_id),
                "remote upload",
                false,
@@ -1604,23 +1601,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    pub(crate) fn get_layers_metadata(
-        &self,
-        layers: Vec<LayerFileName>,
-    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
-        let q = self.upload_queue.lock().unwrap();
-        let q = match &*q {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", q.as_str())
-            }
-            UploadQueue::Initialized(inner) => inner,
-        };
-
-        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
-
-        Ok(decorated.collect())
-    }
 }

 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1676,13 +1656,6 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

-pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
-pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
-        .expect("Failed to construct path")
-}
-
 /// Given the key of an index, parse out the generation part of the name
 pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -75,11 +75,12 @@ pub async fn download_layer_file<'a>(

    let (mut destination_file, bytes_amount) = download_retry(
        || async {
-            let destination_file = tokio::fs::File::create(&temp_file_path)
+            // TODO: this doesn't use the cached fd for some reason?
+            let mut destination_file = fs::File::create(&temp_file_path)
                .await
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
-            let download = storage
+            let mut download = storage
                .download(&remote_path)
                .await
                .with_context(|| {
@@ -89,14 +90,9 @@ pub async fn download_layer_file<'a>(
                })
                .map_err(DownloadError::Other)?;

-            let mut destination_file =
-                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
-
-            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
-
            let bytes_amount = tokio::time::timeout(
                MAX_DOWNLOAD_DURATION,
-                tokio::io::copy_buf(&mut reader, &mut destination_file),
+                tokio::io::copy(&mut download.download_stream, &mut destination_file),
            )
            .await
            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
@@ -107,8 +103,6 @@ pub async fn download_layer_file<'a>(
            })
            .map_err(DownloadError::Other)?;

-            let destination_file = destination_file.into_inner();
-
            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
@@ -226,22 +220,20 @@ async fn do_download_index_part(
    index_generation: Generation,
    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-    use futures::stream::StreamExt;
-
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
        || async {
-            let index_part_download = storage.download(&remote_path).await?;
+            let mut index_part_download = storage.download(&remote_path).await?;

            let mut index_part_bytes = Vec::new();
-            let mut stream = std::pin::pin!(index_part_download.download_stream);
-            while let Some(chunk) = stream.next().await {
-                let chunk = chunk
-                    .with_context(|| format!("download index part at {remote_path:?}"))
-                    .map_err(DownloadError::Other)?;
-                index_part_bytes.extend_from_slice(&chunk[..]);
-            }
+            tokio::io::copy(
+                &mut index_part_download.download_stream,
+                &mut index_part_bytes,
+            )
+            .await
+            .with_context(|| format!("download index part at {remote_path:?}"))
+            .map_err(DownloadError::Other)?;
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
@@ -371,7 +363,7 @@ pub(super) async fn download_index_part(
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::debug!("No index_part.json* found");
+            tracing::info!("No index_part.json* found");
            do_download_index_part(
                storage,
                tenant_shard_id,
@@ -402,13 +394,11 @@ pub(crate) async fn download_initdb_tar_zst(
            .with_context(|| format!("timeline dir creation {timeline_path}"))
            .map_err(DownloadError::Other)?;
    }
-    let temp_path = timeline_path.join(format!(
-        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
-    ));
+    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));

    let file = download_retry(
        || async {
-            let file = OpenOptions::new()
+            let mut file = OpenOptions::new()
                .create(true)
                .truncate(true)
                .read(true)
@@ -418,17 +408,13 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = storage.download(&remote_path).await?;
-            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
-            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
+            let mut download = storage.download(&remote_path).await?;

-            tokio::io::copy_buf(&mut download, &mut writer)
+            tokio::io::copy(&mut download.download_stream, &mut file)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
                .map_err(DownloadError::Other)?;

-            let mut file = writer.into_inner();
-
            file.seek(std::io::SeekFrom::Start(0))
                .await
                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
@@ -440,10 +426,10 @@ pub(crate) async fn download_initdb_tar_zst(
    )
    .await
    .map_err(|e| {
-        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
-        // We don't have async here nor do we want to pile on any extra errors.
-        if let Err(e) = std::fs::remove_file(&temp_path) {
-            if e.kind() != std::io::ErrorKind::NotFound {
+        if temp_path.exists() {
+            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
+            // We don't have async here nor do we want to pile on any extra errors.
+            if let Err(e) = std::fs::remove_file(&temp_path) {
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,12 +1,12 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
+use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
-use std::io::{ErrorKind, SeekFrom};
-use tokio::fs::{self, File};
-use tokio::io::AsyncSeekExt;
+use std::io::ErrorKind;
+use tokio::fs;

 use super::Generation;
 use crate::{
@@ -41,15 +41,11 @@ pub(super) async fn upload_index_part<'a>(
        .to_s3_bytes()
        .context("serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
-    let index_part_bytes = bytes::Bytes::from(index_part_bytes);
+    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
-        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
-            index_part_size,
-            &remote_path,
-        )
+        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
@@ -105,10 +101,8 @@ pub(super) async fn upload_timeline_layer<'a>(
    let fs_size = usize::try_from(fs_size)
        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;

-    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
-
    storage
-        .upload(reader, fs_size, &storage_path, None)
+        .upload(source_file, fs_size, &storage_path, None)
        .await
        .with_context(|| format!("upload layer from local path '{source_path}'"))?;

@@ -120,19 +114,16 @@ pub(crate) async fn upload_initdb_dir(
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-    mut initdb_tar_zst: File,
-    size: u64,
+    initdb_dir: Bytes,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading initdb dir");

-    // We might have read somewhat into the file already in the prior retry attempt
-    initdb_tar_zst.seek(SeekFrom::Start(0)).await?;
-
-    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
+    let size = initdb_dir.len();
+    let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
    storage
-        .upload_storage_object(file, size as usize, &remote_path)
+        .upload_storage_object(bytes, size, &remote_path)
        .await
        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,104 +0,0 @@
-pub mod heatmap;
-mod heatmap_uploader;
-
-use std::sync::Arc;
-
-use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-
-use self::heatmap_uploader::heatmap_uploader_task;
-
-use super::mgr::TenantManager;
-
-use pageserver_api::shard::TenantShardId;
-use remote_storage::GenericRemoteStorage;
-
-use tokio_util::sync::CancellationToken;
-use utils::completion::Barrier;
-
-enum UploadCommand {
-    Upload(TenantShardId),
-}
-
-struct CommandRequest<T> {
-    payload: T,
-    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-}
-
-struct CommandResponse {
-    result: anyhow::Result<()>,
-}
-
-/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
-/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
-/// where we want to immediately upload/download for a particular tenant.  In normal operation
-/// uploads & downloads are autonomous and not driven by this interface.
-pub struct SecondaryController {
-    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
-}
-
-impl SecondaryController {
-    async fn dispatch<T>(
-        &self,
-        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
-        payload: T,
-    ) -> anyhow::Result<()> {
-        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
-
-        queue
-            .send(CommandRequest {
-                payload,
-                response_tx,
-            })
-            .await
-            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
-
-        let response = response_rx
-            .await
-            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
-
-        response.result
-    }
-
-    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
-            .await
-    }
-}
-
-pub fn spawn_tasks(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> SecondaryController {
-    let (upload_req_tx, upload_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-
-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryUploads,
-        None,
-        None,
-        "heatmap uploads",
-        false,
-        async move {
-            heatmap_uploader_task(
-                tenant_manager,
-                remote_storage,
-                upload_req_rx,
-                background_jobs_can_start,
-                cancel,
-            )
-            .await
-        },
-    );
-
-    SecondaryController { upload_req_tx }
-}
-
-/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
-pub fn null_controller() -> SecondaryController {
-    let (upload_req_tx, _upload_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController { upload_req_tx }
-}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,64 +0,0 @@
-use std::time::SystemTime;
-
-use crate::tenant::{
-    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
-};
-
-use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
-
-use utils::{generation::Generation, id::TimelineId};
-
-#[derive(Serialize, Deserialize)]
-pub(super) struct HeatMapTenant {
-    /// Generation of the attached location that uploaded the heatmap: this is not required
-    /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
-    /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
-    pub(super) generation: Generation,
-
-    pub(super) timelines: Vec<HeatMapTimeline>,
-}
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub(crate) struct HeatMapTimeline {
-    #[serde_as(as = "DisplayFromStr")]
-    pub(super) timeline_id: TimelineId,
-
-    pub(super) layers: Vec<HeatMapLayer>,
-}
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub(crate) struct HeatMapLayer {
-    pub(super) name: LayerFileName,
-    pub(super) metadata: IndexLayerMetadata,
-
-    #[serde_as(as = "TimestampSeconds<i64>")]
-    pub(super) access_time: SystemTime,
-    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
-    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
-}
-
-impl HeatMapLayer {
-    pub(crate) fn new(
-        name: LayerFileName,
-        metadata: IndexLayerMetadata,
-        access_time: SystemTime,
-    ) -> Self {
-        Self {
-            name,
-            metadata,
-            access_time,
-        }
-    }
-}
-
-impl HeatMapTimeline {
-    pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
-        Self {
-            timeline_id,
-            layers,
-        }
-    }
-}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,582 +0,0 @@
-use std::{
-    collections::HashMap,
-    sync::{Arc, Weak},
-    time::{Duration, Instant},
-};
-
-use crate::{
-    metrics::SECONDARY_MODE,
-    tenant::{
-        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
-        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
-    },
-};
-
-use md5;
-use pageserver_api::shard::TenantShardId;
-use remote_storage::GenericRemoteStorage;
-
-use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
-use tracing::instrument;
-use utils::{backoff, completion::Barrier};
-
-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
-
-/// Period between heatmap uploader walking Tenants to look for work to do.
-/// If any tenants have a heatmap upload period lower than this, it will be adjusted
-/// downward to match.
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
-
-struct WriteInProgress {
-    barrier: Barrier,
-}
-
-struct UploadPending {
-    tenant: Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
-}
-
-struct WriteComplete {
-    tenant_shard_id: TenantShardId,
-    completed_at: Instant,
-    digest: Option<md5::Digest>,
-    next_upload: Option<Instant>,
-}
-
-/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
-/// when we last did a write.  We only populate this after doing at least one
-/// write for a tenant -- this avoids holding state for tenants that have
-/// uploads disabled.
-
-struct UploaderTenantState {
-    // This Weak only exists to enable culling idle instances of this type
-    // when the Tenant has been deallocated.
-    tenant: Weak<Tenant>,
-
-    /// Digest of the serialized heatmap that we last successfully uploaded
-    ///
-    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
-    /// which is also an md5sum.
-    last_digest: Option<md5::Digest>,
-
-    /// When the last upload attempt completed (may have been successful or failed)
-    last_upload: Option<Instant>,
-
-    /// When should we next do an upload?  None means never.
-    next_upload: Option<Instant>,
-}
-
-/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
-/// handling loop and mutates it as needed: there are no locks here, because that event loop
-/// can hold &mut references to this type throughout.
-struct HeatmapUploader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    cancel: CancellationToken,
-
-    tenants: HashMap<TenantShardId, UploaderTenantState>,
-
-    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
-    /// limits permit it.
-    tenants_pending: std::collections::VecDeque<UploadPending>,
-
-    /// Tenants for which a task in `tasks` has been spawned.
-    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
-
-    tasks: JoinSet<()>,
-
-    /// Channel for our child tasks to send results to: we use a channel for results rather than
-    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
-    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
-    /// behavior.
-    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
-    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
-
-    concurrent_uploads: usize,
-
-    scheduling_interval: Duration,
-}
-
-/// The uploader task runs a loop that periodically wakes up and schedules tasks for
-/// tenants that require an upload, or handles any commands that have been sent into
-/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
-/// spawn.
-///
-/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
-/// all tenants that require an upload, and in between scheduling iterations we will
-/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
-///
-/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
-/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
-/// we might block waiting on a Tenant.
-pub(super) async fn heatmap_uploader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
-
-    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-    let mut uploader = HeatmapUploader {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-        tasks: JoinSet::new(),
-        tenants: HashMap::new(),
-        tenants_pending: std::collections::VecDeque::new(),
-        tenants_uploading: HashMap::new(),
-        task_result_tx: result_tx,
-        task_result_rx: result_rx,
-        concurrent_uploads,
-        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
-    };
-
-    tracing::info!("Waiting for background_jobs_can start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can is ready, proceeding.");
-
-    while !cancel.is_cancelled() {
-        // Look for new work: this is relatively expensive because we have to go acquire the lock on
-        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-        // require an upload.
-        uploader.schedule_iteration().await?;
-
-        // Between scheduling iterations, we will:
-        //  - Drain any complete tasks and spawn pending tasks
-        //  - Handle incoming administrative commands
-        //  - Check our cancellation token
-        let next_scheduling_iteration = Instant::now()
-            .checked_add(uploader.scheduling_interval)
-            .unwrap_or_else(|| {
-                tracing::warn!(
-                    "Scheduling interval invalid ({}s), running immediately!",
-                    uploader.scheduling_interval.as_secs_f64()
-                );
-                Instant::now()
-            });
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                    tracing::info!("Heatmap uploader joining tasks");
-                    while let Some(_r) = uploader.tasks.join_next().await {};
-                    tracing::info!("Heatmap uploader terminating");
-
-                    break;
-                },
-                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
-                    break;},
-                cmd = command_queue.recv() => {
-                    tracing::debug!("heatmap_uploader_task: woke for command queue");
-                    let cmd = match cmd {
-                        Some(c) =>c,
-                        None => {
-                            // SecondaryController was destroyed, and this has raced with
-                            // our CancellationToken
-                            tracing::info!("Heatmap uploader terminating");
-                            cancel.cancel();
-                            break;
-                        }
-                    };
-
-                    let CommandRequest{
-                        response_tx,
-                        payload
-                    } = cmd;
-                    uploader.handle_command(payload, response_tx);
-                },
-                _ = uploader.process_next_completion() => {
-                    if !cancel.is_cancelled() {
-                        uploader.spawn_pending();
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-impl HeatmapUploader {
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
-        // Cull any entries in self.tenants whose Arc<Tenant> is gone
-        self.tenants
-            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
-
-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.tenants_pending.clear();
-
-        // Used a fixed 'now' through the following loop, for efficiency and fairness.
-        let now = Instant::now();
-
-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        const YIELD_ITERATIONS: usize = 1000;
-
-        // Iterate over tenants looking for work to do.
-        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-        for (i, tenant) in tenants.into_iter().enumerate() {
-            // Process is shutting down, drop out
-            if self.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            // Skip tenants that already have a write in flight
-            if self
-                .tenants_uploading
-                .contains_key(tenant.get_tenant_shard_id())
-            {
-                continue;
-            }
-
-            self.maybe_schedule_upload(&now, tenant);
-
-            if i + 1 % YIELD_ITERATIONS == 0 {
-                tokio::task::yield_now().await;
-            }
-        }
-
-        // Spawn tasks for as many of our pending tenants as we can.
-        self.spawn_pending();
-
-        Ok(())
-    }
-
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) {
-        match self.task_result_rx.recv().await {
-            Some(r) => {
-                self.on_completion(r);
-            }
-            None => {
-                unreachable!("Result sender is stored on Self");
-            }
-        }
-    }
-
-    /// The 'maybe' refers to the tenant's state: whether it is configured
-    /// for heatmap uploads at all, and whether sufficient time has passed
-    /// since the last upload.
-    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
-        match tenant.get_heatmap_period() {
-            None => {
-                // Heatmaps are disabled for this tenant
-                return;
-            }
-            Some(period) => {
-                // If any tenant has asked for uploads more frequent than our scheduling interval,
-                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                // we may set rather short intervals.
-                if period < self.scheduling_interval {
-                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
-                }
-            }
-        }
-
-        // Stale attachments do not upload anything: if we are in this state, there is probably some
-        // other attachment in mode Single or Multi running on another pageserver, and we don't
-        // want to thrash and overwrite their heatmap uploads.
-        if tenant.get_attach_mode() == AttachmentMode::Stale {
-            return;
-        }
-
-        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-        // with the completion time in on_completion.
-        let state = self
-            .tenants
-            .entry(*tenant.get_tenant_shard_id())
-            .or_insert_with(|| UploaderTenantState {
-                tenant: Arc::downgrade(&tenant),
-                last_upload: None,
-                next_upload: Some(Instant::now()),
-                last_digest: None,
-            });
-
-        // Decline to do the upload if insufficient time has passed
-        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
-            return;
-        }
-
-        let last_digest = state.last_digest;
-        self.tenants_pending.push_back(UploadPending {
-            tenant,
-            last_digest,
-        })
-    }
-
-    fn spawn_pending(&mut self) {
-        while !self.tenants_pending.is_empty()
-            && self.tenants_uploading.len() < self.concurrent_uploads
-        {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.tenants_pending.pop_front().unwrap();
-            self.spawn_upload(pending.tenant, pending.last_digest);
-        }
-    }
-
-    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
-        let remote_storage = self.remote_storage.clone();
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
-        let (completion, barrier) = utils::completion::channel();
-        let result_tx = self.task_result_tx.clone();
-        self.tasks.spawn(async move {
-            // Guard for the barrier in [`WriteInProgress`]
-            let _completion = completion;
-
-            let started_at = Instant::now();
-            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
-                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
-                    let duration = Instant::now().duration_since(started_at);
-                    SECONDARY_MODE
-                        .upload_heatmap_duration
-                        .observe(duration.as_secs_f64());
-                    SECONDARY_MODE.upload_heatmap.inc();
-                    Some(digest)
-                }
-                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
-                Err(UploadHeatmapError::Upload(e)) => {
-                    tracing::warn!(
-                        "Failed to upload heatmap for tenant {}: {e:#}",
-                        tenant.get_tenant_shard_id(),
-                    );
-                    let duration = Instant::now().duration_since(started_at);
-                    SECONDARY_MODE
-                        .upload_heatmap_duration
-                        .observe(duration.as_secs_f64());
-                    SECONDARY_MODE.upload_heatmap_errors.inc();
-                    last_digest
-                }
-                Err(UploadHeatmapError::Cancelled) => {
-                    tracing::info!("Cancelled heatmap upload, shutting down");
-                    last_digest
-                }
-            };
-
-            let now = Instant::now();
-            let next_upload = tenant
-                .get_heatmap_period()
-                .and_then(|period| now.checked_add(period));
-
-            result_tx
-                .send(WriteComplete {
-                    tenant_shard_id: *tenant.get_tenant_shard_id(),
-                    completed_at: now,
-                    digest,
-                    next_upload,
-                })
-                .ok();
-        });
-
-        self.tenants_uploading
-            .insert(tenant_shard_id, WriteInProgress { barrier });
-    }
-
-    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
-    fn on_completion(&mut self, completion: WriteComplete) {
-        tracing::debug!("Heatmap upload completed");
-        let WriteComplete {
-            tenant_shard_id,
-            completed_at,
-            digest,
-            next_upload,
-        } = completion;
-        self.tenants_uploading.remove(&tenant_shard_id);
-        use std::collections::hash_map::Entry;
-        match self.tenants.entry(tenant_shard_id) {
-            Entry::Vacant(_) => {
-                // Tenant state was dropped, nothing to update.
-            }
-            Entry::Occupied(mut entry) => {
-                entry.get_mut().last_upload = Some(completed_at);
-                entry.get_mut().last_digest = digest;
-                entry.get_mut().next_upload = next_upload
-            }
-        }
-    }
-
-    fn handle_command(
-        &mut self,
-        command: UploadCommand,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        match command {
-            UploadCommand::Upload(tenant_shard_id) => {
-                // If an upload was ongoing for this tenant, let it finish first.
-                let barrier = if let Some(writing_state) =
-                    self.tenants_uploading.get(&tenant_shard_id)
-                {
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap write to complete");
-                    writing_state.barrier.clone()
-                } else {
-                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
-                    // starting of other background work.
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Starting heatmap write on command");
-                    let tenant = match self
-                        .tenant_manager
-                        .get_attached_tenant_shard(tenant_shard_id, true)
-                    {
-                        Ok(t) => t,
-                        Err(e) => {
-                            // Drop result of send: we don't care if caller dropped their receiver
-                            drop(response_tx.send(CommandResponse {
-                                result: Err(e.into()),
-                            }));
-                            return;
-                        }
-                    };
-                    self.spawn_upload(tenant, None);
-                    let writing_state = self
-                        .tenants_uploading
-                        .get(&tenant_shard_id)
-                        .expect("We just inserted this");
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap upload to complete");
-
-                    writing_state.barrier.clone()
-                };
-
-                // This task does no I/O: it only listens for a barrier's completion and then
-                // sends to the command response channel.  It is therefore safe to spawn this without
-                // any gates/task_mgr hooks.
-                tokio::task::spawn(async move {
-                    barrier.wait().await;
-
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Heatmap upload complete");
-
-                    // Drop result of send: we don't care if caller dropped their receiver
-                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
-                });
-            }
-        }
-    }
-}
-
-enum UploadHeatmapOutcome {
-    /// We successfully wrote to remote storage, with this digest.
-    Uploaded(md5::Digest),
-    /// We did not upload because the heatmap digest was unchanged since the last upload
-    NoChange,
-    /// We skipped the upload for some reason, such as tenant/timeline not ready
-    Skipped,
-}
-
-#[derive(thiserror::Error, Debug)]
-enum UploadHeatmapError {
-    #[error("Cancelled")]
-    Cancelled,
-
-    #[error(transparent)]
-    Upload(#[from] anyhow::Error),
-}
-
-/// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
-/// of the object we would have uploaded.
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
-async fn upload_tenant_heatmap(
-    remote_storage: GenericRemoteStorage,
-    tenant: &Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
-) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
-    debug_assert_current_span_has_tenant_id();
-
-    let generation = tenant.get_generation();
-    if generation.is_none() {
-        // We do not expect this: generations were implemented before heatmap uploads.  However,
-        // handle it so that we don't have to make the generation in the heatmap an Option<>
-        // (Generation::none is not serializable)
-        tracing::warn!("Skipping heatmap upload for tenant with generation==None");
-        return Ok(UploadHeatmapOutcome::Skipped);
-    }
-
-    let mut heatmap = HeatMapTenant {
-        timelines: Vec::new(),
-        generation,
-    };
-    let timelines = tenant.timelines.lock().unwrap().clone();
-
-    let tenant_cancel = tenant.cancel.clone();
-
-    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
-    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
-    // in remote storage.
-    let _guard = match tenant.gate.enter() {
-        Ok(g) => g,
-        Err(_) => {
-            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
-            return Err(UploadHeatmapError::Cancelled);
-        }
-    };
-
-    for (timeline_id, timeline) in timelines {
-        let heatmap_timeline = timeline.generate_heatmap().await;
-        match heatmap_timeline {
-            None => {
-                tracing::debug!(
-                    "Skipping heatmap upload because timeline {timeline_id} is not ready"
-                );
-                return Ok(UploadHeatmapOutcome::Skipped);
-            }
-            Some(heatmap_timeline) => {
-                heatmap.timelines.push(heatmap_timeline);
-            }
-        }
-    }
-
-    // Serialize the heatmap
-    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
-    let size = bytes.len();
-
-    // Drop out early if nothing changed since our last upload
-    let digest = md5::compute(&bytes);
-    if Some(digest) == last_digest {
-        return Ok(UploadHeatmapOutcome::NoChange);
-    }
-
-    let path = remote_heatmap_path(tenant.get_tenant_shard_id());
-
-    // Write the heatmap.
-    tracing::debug!("Uploading {size} byte heatmap to {path}");
-    if let Err(e) = backoff::retry(
-        || async {
-            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
-                bytes.clone(),
-            ))));
-            remote_storage
-                .upload_storage_object(bytes, size, &path)
-                .await
-        },
-        |_| false,
-        3,
-        u32::MAX,
-        "Uploading heatmap",
-        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
-    )
-    .await
-    {
-        if tenant_cancel.is_cancelled() {
-            return Err(UploadHeatmapError::Cancelled);
-        } else {
-            return Err(e.into());
-        }
-    }
-
-    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
-
-    Ok(UploadHeatmapOutcome::Uploaded(digest))
-}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,7 +4,7 @@ pub mod delta_layer;
 mod filename;
 pub mod image_layer;
 mod inmemory_layer;
-pub(crate) mod layer;
+mod layer;
 mod layer_desc;

 use crate::context::{AccessStatsBehavior, RequestContext};
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -222,18 +222,14 @@ impl Layer {
    ///
    /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
    /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
-    pub(crate) fn delete_on_drop(&self) {
-        self.0.delete_on_drop();
+    pub(crate) fn garbage_collect_on_drop(&self) {
+        self.0.garbage_collect_on_drop();
    }

    /// Return data needed to reconstruct given page at LSN.
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// This method is cancellation-safe.
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
@@ -331,10 +327,10 @@ impl Layer {
        Ok(())
    }

-    /// Waits until this layer has been dropped (and if needed, local file deletion and remote
+    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
    /// deletion scheduling has completed).
    ///
-    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
+    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
    /// separatedly.
    #[cfg(feature = "testing")]
    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -423,8 +419,8 @@ struct LayerInner {
    /// Initialization and deinitialization are done while holding a permit.
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,

-    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
-    wanted_deleted: AtomicBool,
+    /// Do we want to garbage collect this when `LayerInner` is dropped?
+    wanted_garbage_collected: AtomicBool,

    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -438,6 +434,10 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
+    ///
+    /// If in the future we need to implement "wait until layer instances are gone and done",
+    /// carrying this over to the gc spawn_blocking from LayerInner::drop will do the trick, along
+    /// with adding a "wait_gc" method which waits for this to be closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -457,8 +457,6 @@ struct LayerInner {
    /// For loaded layers, this may be some other value if the tenant has undergone
    /// a shard split since the layer was originally written.
    shard: ShardIndex,
-
-    last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
 }

 impl std::fmt::Display for LayerInner {
@@ -481,14 +479,14 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        if !*self.wanted_deleted.get_mut() {
+        if !*self.wanted_garbage_collected.get_mut() {
            // should we try to evict if the last wish was for eviction?
            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
            // run drops during shutdown (yet)
            return;
        }

-        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().filename();
@@ -515,8 +513,8 @@ impl Drop for LayerInner {
                    false
                }
                Err(e) => {
-                    tracing::error!("failed to remove wanted deleted layer: {e}");
-                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
+                    tracing::error!("failed to remove garbage collected layer: {e}");
+                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
                    false
                }
            };
@@ -538,15 +536,15 @@ impl Drop for LayerInner {
                        } else {
                            tracing::warn!("scheduling deletion on drop failed: {e:#}");
                        }
-                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
                    } else {
-                        LAYER_IMPL_METRICS.inc_completed_deletes();
+                        LAYER_IMPL_METRICS.inc_completed_gcs();
                    }
                }
            } else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
            }
        });
    }
@@ -581,7 +579,7 @@ impl LayerInner {
            timeline: Arc::downgrade(timeline),
            have_remote_client: timeline.remote_client.is_some(),
            access_stats,
-            wanted_deleted: AtomicBool::new(false),
+            wanted_garbage_collected: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -589,17 +587,19 @@ impl LayerInner {
            consecutive_failures: AtomicUsize::new(0),
            generation,
            shard,
-            last_evicted_at: std::sync::Mutex::default(),
        }
    }

-    fn delete_on_drop(&self) {
-        let res =
-            self.wanted_deleted
-                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
+    fn garbage_collect_on_drop(&self) {
+        let res = self.wanted_garbage_collected.compare_exchange(
+            false,
+            true,
+            Ordering::Release,
+            Ordering::Relaxed,
+        );

        if res.is_ok() {
-            LAYER_IMPL_METRICS.inc_started_deletes();
+            LAYER_IMPL_METRICS.inc_started_gcs();
        }
    }

@@ -667,10 +667,6 @@ impl LayerInner {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

-                // count cancellations, which currently remain largely unexpected
-                let init_cancelled =
-                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
                // no need to make the evict_and_wait wait for the actual download to complete
                drop(self.status.send(Status::Downloaded));

@@ -679,8 +675,6 @@ impl LayerInner {
                    .upgrade()
                    .ok_or_else(|| DownloadError::TimelineShutdown)?;

-                // FIXME: grab a gate
-
                let can_ever_evict = timeline.remote_client.as_ref().is_some();

                // check if we really need to be downloaded; could have been already downloaded by a
@@ -725,14 +719,6 @@ impl LayerInner {
                    permit
                };

-                let since_last_eviction =
-                    self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
-                if let Some(since_last_eviction) = since_last_eviction {
-                    // FIXME: this will not always be recorded correctly until #6028 (the no
-                    // download needed branch above)
-                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                }
-
                let res = Arc::new(DownloadedLayer {
                    owner: Arc::downgrade(self),
                    kind: tokio::sync::OnceCell::default(),
@@ -749,8 +735,6 @@ impl LayerInner {
                    tracing::info!(waiters, "completing the on-demand download for other tasks");
                }

-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
                Ok((ResidentOrWantedEvicted::Resident(res), permit))
            };

@@ -848,7 +832,7 @@ impl LayerInner {
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(self.desc.tenant_shard_id),
+            Some(self.desc.tenant_shard_id.tenant_id),
            Some(self.desc.timeline_id),
            &task_name,
            false,
@@ -879,13 +863,14 @@ impl LayerInner {
                    match res {
                        (Ok(()), _) => {
                            // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded.
+                            // else requests the layer, they'll find it already downloaded
+                            // or redownload.
                            //
-                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
-                            //
-                            // FIXME(#6028): however, could be that we should consider marking the
-                            // layer for eviction? alas, cannot: because only DownloadedLayer will
-                            // handle that.
+                            // however, could be that we should consider marking the layer
+                            // for eviction? alas, cannot: because only DownloadedLayer
+                            // will handle that.
+                            tracing::info!("layer file download completed after requester had cancelled");
+                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
                        (Err(e), _) => {
                            // our caller is cancellation safe, but we might be racing with
@@ -1005,15 +990,12 @@ impl LayerInner {

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let delete = self.wanted_deleted.load(Ordering::Acquire);
+        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;

-        if delete {
-            // do nothing now, only in LayerInner::drop -- this was originally implemented because
-            // we could had already scheduled the deletion at the time.
-            //
-            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
+        if gc {
+            // do nothing now, only in LayerInner::drop
        } else if can_evict && evict {
            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

@@ -1028,7 +1010,7 @@ impl LayerInner {
            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                let _g = span.entered();

-                // if LayerInner is already dropped here, do nothing because the delete on drop
+                // if LayerInner is already dropped here, do nothing because the garbage collection
                // has already ran while we were in queue
                let Some(this) = this.upgrade() else {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1128,8 +1110,6 @@ impl LayerInner {
        // we are still holding the permit, so no new spawn_download_and_wait can happen
        drop(self.status.send(Status::Evicted));

-        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
-
        res
    }

@@ -1421,39 +1401,36 @@ impl From<ResidentLayer> for Layer {
    }
 }

-use metrics::IntCounter;
+use metrics::{IntCounter, IntCounterVec};

-pub(crate) struct LayerImplMetrics {
+struct LayerImplMetrics {
    started_evictions: IntCounter,
    completed_evictions: IntCounter,
-    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
+    cancelled_evictions: IntCounterVec,

-    started_deletes: IntCounter,
-    completed_deletes: IntCounter,
-    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
+    started_gcs: IntCounter,
+    completed_gcs: IntCounter,
+    failed_gcs: IntCounterVec,

-    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
-    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
-    redownload_after: metrics::Histogram,
+    rare_counters: IntCounterVec,
 }

 impl Default for LayerImplMetrics {
    fn default() -> Self {
-        use enum_map::Enum;
-
-        // reminder: these will be pageserver_layer_* with "_total" suffix
-
-        let started_evictions = metrics::register_int_counter!(
-            "pageserver_layer_started_evictions",
-            "Evictions started in the Layer implementation"
-        )
-        .unwrap();
-        let completed_evictions = metrics::register_int_counter!(
-            "pageserver_layer_completed_evictions",
-            "Evictions completed in the Layer implementation"
+        let evictions = metrics::register_int_counter_vec!(
+            "pageserver_layer_evictions_count",
+            "Evictions started and completed in the Layer implementation",
+            &["state"]
        )
        .unwrap();

+        let started_evictions = evictions
+            .get_metric_with_label_values(&["started"])
+            .unwrap();
+        let completed_evictions = evictions
+            .get_metric_with_label_values(&["completed"])
+            .unwrap();
+
        let cancelled_evictions = metrics::register_int_counter_vec!(
            "pageserver_layer_cancelled_evictions_count",
            "Different reasons for evictions to have been cancelled or failed",
@@ -1461,36 +1438,24 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

-        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = EvictionCancelled::from_usize(i);
-            let s = reason.as_str();
-            cancelled_evictions.with_label_values(&[s])
-        }));
-
-        let started_deletes = metrics::register_int_counter!(
-            "pageserver_layer_started_deletes",
-            "Deletions on drop pending in the Layer implementation"
-        )
-        .unwrap();
-        let completed_deletes = metrics::register_int_counter!(
-            "pageserver_layer_completed_deletes",
-            "Deletions on drop completed in the Layer implementation"
+        // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
+        let gcs = metrics::register_int_counter_vec!(
+            "pageserver_layer_gcs_count",
+            "Garbage collections started and completed in the Layer implementation",
+            &["state"]
        )
        .unwrap();

-        let failed_deletes = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_deletes_count",
-            "Different reasons for deletions on drop to have failed",
+        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
+        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
+
+        let failed_gcs = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_gcs_count",
+            "Different reasons for garbage collections to have failed",
            &["reason"]
        )
        .unwrap();

-        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = DeleteFailed::from_usize(i);
-            let s = reason.as_str();
-            failed_deletes.with_label_values(&[s])
-        }));
-
        let rare_counters = metrics::register_int_counter_vec!(
            "pageserver_layer_assumed_rare_count",
            "Times unexpected or assumed rare event happened",
@@ -1498,50 +1463,16 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

-        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let event = RareEvent::from_usize(i);
-            let s = event.as_str();
-            rare_counters.with_label_values(&[s])
-        }));
-
-        let inits_cancelled = metrics::register_int_counter!(
-            "pageserver_layer_inits_cancelled_count",
-            "Times Layer initialization was cancelled",
-        )
-        .unwrap();
-
-        let redownload_after = {
-            let minute = 60.0;
-            let hour = 60.0 * minute;
-            metrics::register_histogram!(
-                "pageserver_layer_redownloaded_after",
-                "Time between evicting and re-downloading.",
-                vec![
-                    10.0,
-                    30.0,
-                    minute,
-                    5.0 * minute,
-                    15.0 * minute,
-                    30.0 * minute,
-                    hour,
-                    12.0 * hour,
-                ]
-            )
-            .unwrap()
-        };
-
        Self {
            started_evictions,
            completed_evictions,
            cancelled_evictions,

-            started_deletes,
-            completed_deletes,
-            failed_deletes,
+            started_gcs,
+            completed_gcs,
+            failed_gcs,

            rare_counters,
-            inits_cancelled,
-            redownload_after,
        }
    }
 }
@@ -1554,33 +1485,57 @@ impl LayerImplMetrics {
        self.completed_evictions.inc();
    }
    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions[reason].inc()
+        self.cancelled_evictions
+            .get_metric_with_label_values(&[reason.as_str()])
+            .unwrap()
+            .inc()
    }

-    fn inc_started_deletes(&self) {
-        self.started_deletes.inc();
+    fn inc_started_gcs(&self) {
+        self.started_gcs.inc();
    }
-    fn inc_completed_deletes(&self) {
-        self.completed_deletes.inc();
+    fn inc_completed_gcs(&self) {
+        self.completed_gcs.inc();
    }
-    fn inc_deletes_failed(&self, reason: DeleteFailed) {
-        self.failed_deletes[reason].inc();
+    fn inc_gcs_failed(&self, reason: GcFailed) {
+        self.failed_gcs
+            .get_metric_with_label_values(&[reason.as_str()])
+            .unwrap()
+            .inc();
    }

-    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
-    /// attempt regardless of failure to delete local file.
-    fn inc_delete_removes_failed(&self) {
-        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
+    /// Counted separately from failed gcs because we will complete the gc attempt regardless of
+    /// failure to delete local file.
+    fn inc_gc_removes_failed(&self) {
+        self.rare_counters
+            .get_metric_with_label_values(&["gc_remove_failed"])
+            .unwrap()
+            .inc();
    }

-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and
+    /// `get_or_maybe_download`.
    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["retried_gomd"])
+            .unwrap()
+            .inc();
    }

-    /// Expected rare because cancellations are unexpected, and failures are unexpected
+    /// Expected rare because cancellations are unexpected
+    fn inc_download_completed_without_requester(&self) {
+        self.rare_counters
+            .get_metric_with_label_values(&["download_completed_without"])
+            .unwrap()
+            .inc();
+    }
+
+    /// Expected rare because cancellations are unexpected
    fn inc_download_failed_without_requester(&self) {
-        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["download_failed_without"])
+            .unwrap()
+            .inc();
    }

    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1588,34 +1543,37 @@ impl LayerImplMetrics {
    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    /// Option.
    fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["raced_wanted_evicted"])
+            .unwrap()
+            .inc();
    }

-    /// These are only expected for [`Self::inc_init_cancelled`] amount when
+    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
    /// running with remote storage.
    fn inc_init_needed_no_download(&self) {
-        self.rare_counters[RareEvent::InitWithoutDownload].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["init_needed_no_download"])
+            .unwrap()
+            .inc();
    }

    /// Expected rare because all layer files should be readable and good
    fn inc_permanent_loading_failures(&self) {
-        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["permanent_loading_failure"])
+            .unwrap()
+            .inc();
    }

    fn inc_broadcast_lagged(&self) {
-        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
-    }
-
-    fn inc_init_cancelled(&self) {
-        self.inits_cancelled.inc()
-    }
-
-    fn record_redownloaded_after(&self, duration: std::time::Duration) {
-        self.redownload_after.observe(duration.as_secs_f64())
+        self.rare_counters
+            .get_metric_with_label_values(&["broadcast_lagged"])
+            .unwrap()
+            .inc();
    }
 }

-#[derive(enum_map::Enum)]
 enum EvictionCancelled {
    LayerGone,
    TimelineGone,
@@ -1644,47 +1602,19 @@ impl EvictionCancelled {
    }
 }

-#[derive(enum_map::Enum)]
-enum DeleteFailed {
+enum GcFailed {
    TimelineGone,
    DeleteSchedulingFailed,
 }

-impl DeleteFailed {
+impl GcFailed {
    fn as_str(&self) -> &'static str {
        match self {
-            DeleteFailed::TimelineGone => "timeline_gone",
-            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            GcFailed::TimelineGone => "timeline_gone",
+            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
 }

-#[derive(enum_map::Enum)]
-enum RareEvent {
-    RemoveOnDropFailed,
-    RetriedGetOrMaybeDownload,
-    DownloadFailedWithoutRequester,
-    UpgradedWantedEvicted,
-    InitWithoutDownload,
-    PermanentLoadingFailure,
-    EvictAndWaitLagged,
-}
-
-impl RareEvent {
-    fn as_str(&self) -> &'static str {
-        use RareEvent::*;
-
-        match self {
-            RemoveOnDropFailed => "remove_on_drop_failed",
-            RetriedGetOrMaybeDownload => "retried_gomd",
-            DownloadFailedWithoutRequester => "download_failed_without",
-            UpgradedWantedEvicted => "raced_wanted_evicted",
-            InitWithoutDownload => "init_needed_no_download",
-            PermanentLoadingFailure => "permanent_loading_failure",
-            EvictAndWaitLagged => "broadcast_lagged",
-        }
-    }
-}
-
-pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -44,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
    Eviction,
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
-    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
@@ -54,18 +53,31 @@ impl BackgroundLoopKind {
    }
 }

-/// Cancellation safe.
-pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
+pub(crate) enum RateLimitError {
+    Cancelled,
+}
+
+pub(crate) async fn concurrent_background_tasks_rate_limit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> impl Drop {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
+    cancel: &CancellationToken,
+) -> Result<impl Drop, RateLimitError> {
+    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
        .with_label_values(&[loop_kind.as_static_str()])
-        .guard();
-
-    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
-        Ok(permit) => permit,
-        Err(_closed) => unreachable!("we never close the semaphore"),
+        .inc();
+    scopeguard::defer!(
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
+    );
+    tokio::select! {
+        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
+            match permit {
+                Ok(permit) => Ok(permit),
+                Err(_closed) => unreachable!("we never close the semaphore"),
+            }
+        },
+        _ = cancel.cancelled() => {
+            Err(RateLimitError::Cancelled)
+        }
    }
 }

@@ -74,13 +86,13 @@ pub fn start_background_loops(
    tenant: &Arc<Tenant>,
    background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_shard_id = tenant.tenant_shard_id;
+    let tenant_id = tenant.tenant_shard_id.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
-        Some(tenant_shard_id),
+        Some(tenant_id),
        None,
-        &format!("compactor for tenant {tenant_shard_id}"),
+        &format!("compactor for tenant {tenant_id}"),
        false,
        {
            let tenant = Arc::clone(tenant);
@@ -92,7 +104,7 @@ pub fn start_background_loops(
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                compaction_loop(tenant, cancel)
-                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
            }
@@ -101,9 +113,9 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
+        Some(tenant_id),
        None,
-        &format!("garbage collector for tenant {tenant_shard_id}"),
+        &format!("garbage collector for tenant {tenant_id}"),
        false,
        {
            let tenant = Arc::clone(tenant);
@@ -115,7 +127,7 @@ pub fn start_background_loops(
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                gc_loop(tenant, cancel)
-                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -21,6 +21,7 @@ use crate::{
        },
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
+    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -43,7 +44,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    // Shut down the layer flush task before the remote client, as one depends on the other
    task_mgr::shutdown_tasks(
        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_shard_id),
+        Some(timeline.tenant_shard_id.tenant_id),
        Some(timeline.timeline_id),
    )
    .await;
@@ -71,7 +72,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    info!("waiting for timeline tasks to shutdown");
    task_mgr::shutdown_tasks(
        None,
-        Some(timeline.tenant_shard_id),
+        Some(timeline.tenant_shard_id.tenant_id),
        Some(timeline.timeline_id),
    )
    .await;
@@ -406,6 +407,7 @@ impl DeleteTimelineFlow {
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
+        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -418,6 +420,7 @@ impl DeleteTimelineFlow {
                    remote_client,
                    deletion_queue_client,
                },
+                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
@@ -528,7 +531,7 @@ impl DeleteTimelineFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
+            Some(tenant_shard_id.tenant_id),
            Some(timeline_id),
            "timeline_delete",
            false,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,7 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::BackgroundLoopKind,
+        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -60,7 +60,7 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            Some(self.tenant_shard_id),
+            Some(self.tenant_shard_id.tenant_id),
            Some(self.timeline_id),
            &format!(
                "layer eviction for {}/{}",
@@ -158,15 +158,15 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
+        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
            BackgroundLoopKind::Eviction,
            ctx,
-        );
-
-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
+            cancel,
+        )
+        .await
+        {
+            Ok(permit) => permit,
+            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
        };

        // If we evict layers but keep cached values derived from those layers, then
@@ -212,21 +212,11 @@ impl Timeline {
        // Gather layers for eviction.
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
-
        // So, we just need to deal with this.
-
-        let remote_client = match self.remote_client.as_ref() {
-            Some(c) => c,
-            None => {
-                error!("no remote storage configured, cannot evict layers");
-                return ControlFlow::Continue(());
-            }
-        };
-
-        let mut js = tokio::task::JoinSet::new();
-        {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
+            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);

@@ -272,49 +262,54 @@ impl Timeline {
                        continue;
                    }
                };
-                let layer = guard.drop_eviction_guard();
                if no_activity_for > p.threshold {
-                    let remote_client = remote_client.clone();
-                    // this could cause a lot of allocations in some cases
-                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
-                    stats.candidates += 1;
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
+            candidates
+        };
+        stats.candidates = candidates.len();
+
+        let remote_client = match self.remote_client.as_ref() {
+            None => {
+                error!(
+                    num_candidates = candidates.len(),
+                    "no remote storage configured, cannot evict layers"
+                );
+                return ControlFlow::Continue(());
+            }
+            Some(c) => c,
        };

-        let join_all = async move {
-            while let Some(next) = js.join_next().await {
-                match next {
-                    Ok(Ok(())) => stats.evicted += 1,
-                    Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                        stats.not_evictable += 1;
-                    }
-                    Err(je) if je.is_cancelled() => unreachable!("not used"),
-                    Err(je) if je.is_panic() => {
-                        /* already logged */
-                        stats.errors += 1;
-                    }
-                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
-                }
+        let results = match self.evict_layer_batch(remote_client, &candidates).await {
+            Err(pre_err) => {
+                stats.errors += candidates.len();
+                error!("could not do any evictions: {pre_err:#}");
+                return ControlFlow::Continue(());
            }
-            stats
+            Ok(results) => results,
        };
-
-        tokio::select! {
-            stats = join_all => {
-                if stats.candidates == stats.not_evictable {
-                    debug!(stats=?stats, "eviction iteration complete");
-                } else if stats.errors > 0 || stats.not_evictable > 0 {
-                    warn!(stats=?stats, "eviction iteration complete");
-                } else {
-                    info!(stats=?stats, "eviction iteration complete");
+        assert_eq!(results.len(), candidates.len());
+        for result in results {
+            match result {
+                None => {
+                    stats.skipped_for_shutdown += 1;
+                }
+                Some(Ok(())) => {
+                    stats.evicted += 1;
+                }
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                    stats.not_evictable += 1;
                }
-            }
-            _ = cancel.cancelled() => {
-                // just drop the joinset to "abort"
            }
        }
-
+        if stats.candidates == stats.not_evictable {
+            debug!(stats=?stats, "eviction iteration complete");
+        } else if stats.errors > 0 || stats.not_evictable > 0 {
+            warn!(stats=?stats, "eviction iteration complete");
+        } else {
+            info!(stats=?stats, "eviction iteration complete");
+        }
        ControlFlow::Continue(())
    }

@@ -348,7 +343,7 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -243,7 +243,7 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-        layer.delete_on_drop();
+        layer.garbage_collect_on_drop();
    }

    pub(crate) fn contains(&self, layer: &Layer) -> bool {
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,10 +1,11 @@
 use anyhow::Context;
-
 use once_cell::sync::OnceCell;
-use tokio_util::sync::CancellationToken;
+
+use tokio::sync::Semaphore;
 use utils::lsn::Lsn;

 use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
+use std::sync::Arc;

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -27,15 +28,8 @@ pub(super) struct LogicalSize {
        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

-    /// Cancellation for the best-effort logical size calculation.
-    ///
-    /// The token is kept in a once-cell so that we can error out if a higher priority
-    /// request comes in *before* we have started the normal logical size calculation.
-    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
-        OnceCell<CancellationToken>,
-
-    /// Once the initial logical size is initialized, this is notified.
-    pub(crate) initialized: tokio::sync::Notify,
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    pub initial_size_computation: Arc<tokio::sync::Semaphore>,

    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,
@@ -78,7 +72,7 @@ pub(crate) enum CurrentLogicalSize {
    Exact(Exact),
 }

-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone)]
 pub(crate) enum Accuracy {
    Approximate,
    Exact,
@@ -121,25 +115,24 @@ impl LogicalSize {
        Self {
            initial_logical_size: OnceCell::with_value((0, {
                crate::metrics::initial_logical_size::START_CALCULATION
-                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .first(None)
                    .calculation_result_saved()
            })),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            // initial_logical_size is already computed, so don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
-            initialized: tokio::sync::Notify::new(),
        }
    }

    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
        Self {
            initial_logical_size: OnceCell::new(),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
            initial_part_end: Some(compute_to),
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
-            initialized: tokio::sync::Notify::new(),
        }
    }

--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -19,14 +19,14 @@ use super::Timeline;
 pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
 }

 impl<'t> UninitializedTimeline<'t> {
    pub(crate) fn new(
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
    ) -> Self {
        Self {
            owning_tenant,
@@ -169,55 +169,18 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
 ///
 /// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
 #[must_use]
-pub(crate) struct TimelineUninitMark<'t> {
-    owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
+pub(crate) struct TimelineUninitMark {
    uninit_mark_deleted: bool,
    uninit_mark_path: Utf8PathBuf,
    pub(crate) timeline_path: Utf8PathBuf,
 }

-/// Errors when acquiring exclusive access to a timeline ID for creation
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum TimelineExclusionError {
-    #[error("Already exists")]
-    AlreadyExists(Arc<Timeline>),
-    #[error("Already creating")]
-    AlreadyCreating,
-
-    // e.g. I/O errors, or some failure deep in postgres initdb
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl<'t> TimelineUninitMark<'t> {
-    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
-        timeline_id: TimelineId,
-        uninit_mark_path: Utf8PathBuf,
-        timeline_path: Utf8PathBuf,
-    ) -> Result<Self, TimelineExclusionError> {
-        // Lock order: this is the only place we take both locks.  During drop() we only
-        // lock creating_timelines
-        let timelines = owning_tenant.timelines.lock().unwrap();
-        let mut creating_timelines: std::sync::MutexGuard<
-            '_,
-            std::collections::HashSet<TimelineId>,
-        > = owning_tenant.timelines_creating.lock().unwrap();
-
-        if let Some(existing) = timelines.get(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
-        } else if creating_timelines.contains(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyCreating)
-        } else {
-            creating_timelines.insert(timeline_id);
-            Ok(Self {
-                owning_tenant,
-                timeline_id,
-                uninit_mark_deleted: false,
-                uninit_mark_path,
-                timeline_path,
-            })
+impl TimelineUninitMark {
+    pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
+        Self {
+            uninit_mark_deleted: false,
+            uninit_mark_path,
+            timeline_path,
        }
    }

@@ -244,7 +207,7 @@ impl<'t> TimelineUninitMark<'t> {
    }
 }

-impl Drop for TimelineUninitMark<'_> {
+impl Drop for TimelineUninitMark {
    fn drop(&mut self) {
        if !self.uninit_mark_deleted {
            if self.timeline_path.exists() {
@@ -263,11 +226,5 @@ impl Drop for TimelineUninitMark<'_> {
                }
            }
        }
-
-        self.owning_tenant
-            .timelines_creating
-            .lock()
-            .unwrap()
-            .remove(&self.timeline_id);
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -30,7 +30,6 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

-use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
@@ -42,7 +41,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::id::TimelineId;
+use utils::id::TenantTimelineId;

 use self::connection_manager::ConnectionManagerStatus;

@@ -61,8 +60,7 @@ pub struct WalReceiverConf {
 }

 pub struct WalReceiver {
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
+    timeline: TenantTimelineId,
    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

@@ -73,7 +71,7 @@ impl WalReceiver {
        mut broker_client: BrokerClientChannel,
        ctx: &RequestContext,
    ) -> Self {
-        let tenant_shard_id = timeline.tenant_shard_id;
+        let tenant_id = timeline.tenant_shard_id.tenant_id;
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -83,9 +81,9 @@ impl WalReceiver {
        task_mgr::spawn(
            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
-            Some(timeline.tenant_shard_id),
+            Some(tenant_id),
            Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
+            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
            false,
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
@@ -119,12 +117,11 @@ impl WalReceiver {
                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
        );

        Self {
-            tenant_shard_id,
-            timeline_id,
+            timeline: TenantTimelineId::new(tenant_id, timeline_id),
            manager_status,
        }
    }
@@ -132,8 +129,8 @@ impl WalReceiver {
    pub async fn stop(self) {
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
+            Some(self.timeline.tenant_id),
+            Some(self.timeline.timeline_id),
        )
        .await;
    }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
    task_mgr::spawn(
        WALRECEIVER_RUNTIME.handle(),
        TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_shard_id),
+        Some(timeline.tenant_shard_id.tenant_id),
        Some(timeline.timeline_id),
        "walreceiver connection",
        false,
@@ -397,10 +397,7 @@ pub(super) async fn handle_walreceiver_connection(
            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
            let current_timeline_size = timeline
-                .get_current_logical_size(
-                    crate::tenant::timeline::GetLogicalSizePriority::User,
-                    &ctx,
-                )
+                .get_current_logical_size(&ctx)
                // FIXME: https://github.com/neondatabase/neon/issues/5963
                .size_dont_care_about_accuracy();
            let status_update = PageserverFeedback {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -288,9 +288,6 @@ impl VirtualFile {
        }
        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

-        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
-        // where our caller doesn't get to use the returned VirtualFile before its
-        // slot gets re-used by someone else.
        let file = STORAGE_IO_TIME_METRIC
            .get(StorageIoOperation::Open)
            .observe_closure_duration(|| open_options.open(path))?;
@@ -314,9 +311,6 @@ impl VirtualFile {
            timeline_id,
        };

-        // TODO: Under pressure, it's likely the slot will get re-used and
-        // the underlying file closed before they get around to using it.
-        // => https://github.com/neondatabase/neon/issues/6065
        slot_guard.file.replace(file);

        Ok(vfile)
@@ -427,12 +421,9 @@ impl VirtualFile {
        // now locked in write-mode. Find a free slot to put it in.
        let (handle, mut slot_guard) = open_files.find_victim_slot();

-        // Re-open the physical file.
-        // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
-        // case from StorageIoOperation::Open. This helps with identifying thrashing
-        // of the virtual file descriptor cache.
+        // Open the physical file
        let file = STORAGE_IO_TIME_METRIC
-            .get(StorageIoOperation::OpenAfterReplace)
+            .get(StorageIoOperation::Open)
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
@@ -619,11 +610,9 @@ impl Drop for VirtualFile {
            slot.recently_used.store(false, Ordering::Relaxed);
            // there is also operation "close-by-replace" for closes done on eviction for
            // comparison.
-            if let Some(fd) = slot_guard.file.take() {
-                STORAGE_IO_TIME_METRIC
-                    .get(StorageIoOperation::Close)
-                    .observe_closure_duration(|| drop(fd));
-            }
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
        }
    }
 }
@@ -654,7 +643,6 @@ pub fn init(num_slots: usize) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
-    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

 const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,6 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

-use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -31,7 +30,6 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
@@ -48,7 +46,6 @@ use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

 pub struct WalIngest<'a> {
-    shard: ShardIdentity,
    timeline: &'a Timeline,

    checkpoint: CheckPoint,
@@ -68,7 +65,6 @@ impl<'a> WalIngest<'a> {
        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);

        Ok(WalIngest {
-            shard: *timeline.get_shard_identity(),
            timeline,
            checkpoint,
            checkpoint_modified: false,
@@ -91,8 +87,6 @@ impl<'a> WalIngest<'a> {
        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        WAL_INGEST.records_received.inc();
-
        modification.lsn = lsn;
        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;

@@ -361,33 +355,6 @@ impl<'a> WalIngest<'a> {
        // Iterate through all the blocks that the record modifies, and
        // "put" a separate copy of the record for each block.
        for blk in decoded.blocks.iter() {
-            let rel = RelTag {
-                spcnode: blk.rnode_spcnode,
-                dbnode: blk.rnode_dbnode,
-                relnode: blk.rnode_relnode,
-                forknum: blk.forknum,
-            };
-
-            let key = rel_block_to_key(rel, blk.blkno);
-            let key_is_local = self.shard.is_key_local(&key);
-
-            tracing::debug!(
-                lsn=%lsn,
-                key=%key,
-                "ingest: shard decision {} (checkpoint={})",
-                if !key_is_local { "drop" } else { "keep" },
-                self.checkpoint_modified
-            );
-
-            if !key_is_local {
-                if self.shard.is_zero() {
-                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
-                    // its blkno in case it implicitly extends a relation.
-                    self.observe_decoded_block(modification, blk, ctx).await?;
-                }
-
-                continue;
-            }
            self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
                .await?;
        }
@@ -400,38 +367,13 @@ impl<'a> WalIngest<'a> {
            self.checkpoint_modified = false;
        }

-        if modification.is_empty() {
-            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-            WAL_INGEST.records_filtered.inc();
-            modification.tline.finish_write(lsn);
-        } else {
-            WAL_INGEST.records_committed.inc();
-            modification.commit(ctx).await?;
-        }
-
        // Now that this record has been fully handled, including updating the
-        // checkpoint data, let the repository know that it is up-to-date to this LSN.
+        // checkpoint data, let the repository know that it is up-to-date to this LSN
+        modification.commit(ctx).await?;

        Ok(())
    }

-    /// Do not store this block, but observe it for the purposes of updating our relation size state.
-    async fn observe_decoded_block(
-        &mut self,
-        modification: &mut DatadirModification<'_>,
-        blk: &DecodedBkpBlock,
-        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
-        let rel = RelTag {
-            spcnode: blk.rnode_spcnode,
-            dbnode: blk.rnode_dbnode,
-            relnode: blk.rnode_relnode,
-            forknum: blk.forknum,
-        };
-        self.handle_rel_extend(modification, rel, blk.blkno, ctx)
-            .await
-    }
-
    async fn ingest_decoded_block(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -458,10 +400,8 @@ impl<'a> WalIngest<'a> {
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
            && (decoded.xl_info == pg_constants::XLOG_FPI
                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-            // compression of WAL is not yet supported: fall back to storing the original WAL record
+        // compression of WAL is not yet supported: fall back to storing the original WAL record
            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
-            // do not materialize null pages because them most likely be soon replaced with real data
-            && blk.bimg_len != 0
        {
            // Extract page image from FPI record
            let img_len = blk.bimg_len as usize;
@@ -1525,15 +1465,8 @@ impl<'a> WalIngest<'a> {
            //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
            modification.put_rel_extend(rel, new_nblocks, ctx).await?;

-            let mut key = rel_block_to_key(rel, blknum);
            // fill the gap with zeros
            for gap_blknum in old_nblocks..blknum {
-                key.field6 = gap_blknum;
-
-                if self.shard.get_shard_number(&key) != self.shard.number {
-                    continue;
-                }
-
                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
            }
        }
@@ -2191,7 +2124,7 @@ mod tests {
            .load()
            .await;
        let tline = tenant
-            .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
+            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
            .await
            .unwrap();

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -123,9 +124,7 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
+    /// CANCEL SAFETY: NOT CANCEL SAFE.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -158,6 +157,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
+                    .await
                };
                img = Some(result?);

@@ -178,6 +178,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
+            .await
        }
    }
 }
@@ -215,7 +216,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -331,7 +332,12 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
+                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
+                wait_done
+                    .wait_for(|v| *v)
+                    .await
+                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -643,6 +649,8 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
+    stderr_logger_cancel: CancellationToken,
+    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -691,8 +699,6 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -704,45 +710,69 @@ impl WalRedoProcess {
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
+
+        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

-        tokio::spawn(
+        let stderr_logger_cancel = CancellationToken::new();
+        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
+            tokio::sync::watch::channel(false);
+        tokio::spawn({
+            let stderr_logger_cancel = stderr_logger_cancel.clone();
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                    let _ = stderr_logger_task_done_tx.send(true);
                }
                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
+                loop {
+                    // NB: we purposefully don't do a select! for the cancellation here.
+                    // The cancellation would likely cause us to miss stderr messages.
+                    // We can rely on this to return from .await because when we SIGKILL
+                    // the child, the writing end of the stderr pipe gets closed.
+                    match stderr.readable_mut().await {
+                        Ok(mut guard) => {
+                            let mut errbuf = [0; 16384];
+                            let res = guard.try_io(|fd| {
+                                use std::io::Read;
+                                fd.get_mut().read(&mut errbuf)
+                            });
+                            match res {
+                                Ok(Ok(0)) => {
+                                    // it closed the stderr pipe
+                                    break;
+                                }
+                                Ok(Ok(n)) => {
+                                    // The message might not be split correctly into lines here. But this is
+                                    // good enough, the important thing is to get the message to the log.
+                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
+                                    error!(output, "received output");
+                                },
+                                Ok(Err(e)) => {
+                                    error!(error = ?e, "read() error, waiting for cancellation");
+                                    stderr_logger_cancel.cancelled().await;
+                                    error!(error = ?e, "read() error, cancellation complete");
+                                    break;
+                                }
+                                Err(e) => {
+                                    let _e: tokio::io::unix::TryIoError = e;
+                                    // the read() returned WouldBlock, that's expected
+                                }
+                            }
                        }
                        Err(e) => {
-                            break Err(e);
+                            error!(error = ?e, "read() error, waiting for cancellation");
+                            stderr_logger_cancel.cancelled().await;
+                            error!(error = ?e, "read() error, cancellation complete");
+                            break;
                        }
                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
-        );
+        });

        Ok(Self {
            conf,
@@ -757,6 +787,8 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
+            stderr_logger_cancel,
+            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -997,6 +1029,7 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -41,17 +41,6 @@ libwalproposer.a: $(WALPROP_OBJS)
 	rm -f $@
 	$(AR) $(AROPT) $@ $^

-# needs vars:
-# FIND_TYPEDEF pointing to find_typedef
-# INDENT pointing to pg_bsd_indent
-# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
-#   pgindent will pick it up as pg_bsd_indent path).
-.PHONY: pgindent
-pgindent:
-	+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
-	$(FIND_TYPEDEF) . > neon.typedefs
-	INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
-
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;

 /* Curl structures for sending the HTTP requests */
-static CURL *CurlHandle;
+static CURL * CurlHandle;
 static struct curl_slist *ContentHeader = NULL;

 /*
@@ -54,7 +54,7 @@ typedef enum
 {
 	Op_Set,						/* An upsert: Either a creation or an alter */
 	Op_Delete,
-} OpType;
+}			OpType;

 typedef struct
 {
@@ -62,7 +62,7 @@ typedef struct
 	Oid			owner;
 	char		old_name[NAMEDATALEN];
 	OpType		type;
-} DbEntry;
+}			DbEntry;

 typedef struct
 {
@@ -70,7 +70,7 @@ typedef struct
 	char		old_name[NAMEDATALEN];
 	const char *password;
 	OpType		type;
-} RoleEntry;
+}			RoleEntry;

 /*
 * We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
 	struct DdlHashTable *prev_table;
 	HTAB	   *db_table;
 	HTAB	   *role_table;
-} DdlHashTable;
+}			DdlHashTable;

 static DdlHashTable RootTable;
-static DdlHashTable *CurrentDdlTable = &RootTable;
+static DdlHashTable * CurrentDdlTable = &RootTable;

 static void
 PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
 {
 	char		str[ERROR_SIZE];
 	size_t		size;
-} ErrorString;
+}			ErrorString;

 static size_t
 ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
@@ -478,7 +478,7 @@ NeonXactCallback(XactEvent event, void *arg)
 static bool
 RoleIsNeonSuperuser(const char *role_name)
 {
-	return strcmp(role_name, "neon_superuser") == 0;
+    return strcmp(role_name, "neon_superuser") == 0;
 }

 static void
@@ -509,7 +509,6 @@ HandleCreateDb(CreatedbStmt *stmt)
 	if (downer && downer->arg)
 	{
 		const char *owner_name = defGetString(downer);
-
 		if (RoleIsNeonSuperuser(owner_name))
 			elog(ERROR, "can't create a database with owner neon_superuser");
 		entry->owner = get_role_oid(owner_name, false);
@@ -537,7 +536,6 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 	const char *new_owner = get_rolespec_name(stmt->newowner);
-
 	if (RoleIsNeonSuperuser(new_owner))
 		elog(ERROR, "can't alter owner to neon_superuser");
 	entry->owner = get_role_oid(new_owner, false);
@@ -635,7 +633,6 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	DefElem    *dpass = NULL;
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;
-
 	if (RoleIsNeonSuperuser(role_name))
 		elog(ERROR, "can't ALTER neon_superuser");

--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -25,81 +25,79 @@

 #include <curl/curl.h>

-static int	extension_server_port = 0;
+static int extension_server_port = 0;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

-/*
-  * to download all SQL (and data) files for an extension:
-  * curl -X POST http://localhost:8080/extension_server/postgis
-  * it covers two possible extension files layouts:
-  * 1. extension_name--version--platform.sql
-  * 2. extension_name/extension_name--version.sql
-  *    extension_name/extra_files.csv
-  * to download specific library file:
-  * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
-  */
+// to download all SQL (and data) files for an extension:
+// curl -X POST http://localhost:8080/extension_server/postgis
+// it covers two possible extension files layouts:
+// 1. extension_name--version--platform.sql
+// 2. extension_name/extension_name--version.sql
+//    extension_name/extra_files.csv
+//
+// to download specific library file:
+// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-	CURL	   *curl;
-	CURLcode	res;
-	char	   *compute_ctl_url;
-	char	   *postdata;
-	bool		ret = false;
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    char *postdata;
+    bool ret = false;

-	if ((curl = curl_easy_init()) == NULL)
-	{
-		elog(ERROR, "Failed to initialize curl handle");
-	}
+    if ((curl = curl_easy_init()) == NULL)
+    {
+        elog(ERROR, "Failed to initialize curl handle");
+    }

-	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-							   extension_server_port, filename, is_library ? "?is_library=true" : "");
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");

-	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

-	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);

-	if (curl)
-	{
-		/* Perform the request, res will get the return code */
-		res = curl_easy_perform(curl);
-		/* Check for errors */
-		if (res == CURLE_OK)
-		{
-			ret = true;
-		}
-		else
-		{
-			/* Don't error here because postgres will try to find the file */
-			/* and will fail with some proper error message if it's not found. */
-			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-		}
+    if (curl)
+    {
+        /* Perform the request, res will get the return code */
+        res = curl_easy_perform(curl);
+        /* Check for errors */
+        if (res == CURLE_OK)
+        {
+            ret = true;
+        }
+        else
+        {
+            // Don't error here because postgres will try to find the file
+            // and will fail with some proper error message if it's not found.
+            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
+        }

-		/* always cleanup */
-		curl_easy_cleanup(curl);
-	}
+        /* always cleanup */
+        curl_easy_cleanup(curl);
+    }

-	return ret;
+    return ret;
 }

-void
-pg_init_extension_server()
+void pg_init_extension_server()
 {
-	/* Port to connect to compute_ctl on localhost */
-	/* to request extension files. */
-	DefineCustomIntVariable("neon.extension_server_port",
-							"connection string to the compute_ctl",
-							NULL,
-							&extension_server_port,
-							0, 0, INT_MAX,
-							PGC_POSTMASTER,
-							0,	/* no flags required */
-							NULL, NULL, NULL);
+    // Port to connect to compute_ctl on localhost
+    // to request extension files.
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "connection string to the compute_ctl",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);

-	/* set download_extension_file_hook */
-	prev_download_extension_file_hook = download_extension_file_hook;
-	download_extension_file_hook = neon_download_extension_file_http;
+    // set download_extension_file_hook
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
 }
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -67,34 +67,32 @@
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
-	uint32		hash;
+	uint32      hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
-	dlist_node	lru_node;		/* LRU list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK/32];
+	dlist_node	lru_node; /* LRU list node */
 } FileCacheEntry;

 typedef struct FileCacheControl
 {
-	uint64		generation;		/* generation is needed to handle correct hash
-								 * reenabling */
-	uint32		size;			/* size of cache file in chunks */
-	uint32		used;			/* number of used chunks */
-	uint32		limit;			/* shared copy of lfc_size_limit */
-	uint64		hits;
-	uint64		misses;
-	uint64		writes;
-	dlist_head	lru;			/* double linked list for LRU replacement
-								 * algorithm */
+	uint64 generation; /* generation is needed to handle correct hash reenabling */
+	uint32 size; /* size of cache file in chunks */
+	uint32 used; /* number of used chunks */
+	uint32 limit; /* shared copy of lfc_size_limit */
+	uint64 hits;
+	uint64 misses;
+	uint64 writes;
+	dlist_head lru; /* double linked list for LRU replacement algorithm */
 } FileCacheControl;

-static HTAB *lfc_hash;
-static int	lfc_desc = 0;
+static HTAB* lfc_hash;
+static int   lfc_desc = 0;
 static LWLockId lfc_lock;
-static int	lfc_max_size;
-static int	lfc_size_limit;
-static char *lfc_path;
-static FileCacheControl *lfc_ctl;
+static int   lfc_max_size;
+static int   lfc_size_limit;
+static char* lfc_path;
+static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
@@ -102,7 +100,7 @@ static shmem_request_hook_type prev_shmem_request_hook;

 #define LFC_ENABLED() (lfc_ctl->limit != 0)

-void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
+void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);

 /*
 * Local file cache is optional and Neon can work without it.
@@ -111,10 +109,9 @@ void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
 * All cache content should be invalidated to avoid reading of stale or corrupted data
 */
 static void
-lfc_disable(char const *op)
+lfc_disable(char const* op)
 {
-	int			fd;
-
+	int fd;
 	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

 	/* Invalidate hash */
@@ -123,7 +120,7 @@ lfc_disable(char const *op)
 	if (LFC_ENABLED())
 	{
 		HASH_SEQ_STATUS status;
-		FileCacheEntry *entry;
+		FileCacheEntry* entry;

 		hash_seq_init(&status, lfc_hash);
 		while ((entry = hash_seq_search(&status)) != NULL)
@@ -138,24 +135,16 @@ lfc_disable(char const *op)

 		if (lfc_desc > 0)
 		{
-			/*
-			 * If the reason of error is ENOSPC, then truncation of file may
-			 * help to reclaim some space
-			 */
-			int			rc = ftruncate(lfc_desc, 0);
-
+			/* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
+			int rc = ftruncate(lfc_desc, 0);
 			if (rc < 0)
 				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
 		}
 	}
-
-	/*
-	 * We need to use unlink to to avoid races in LFC write, because it is not
-	 * protectedby
-	 */
+	/* We need to use unlink to to avoid races in LFC write, because it is not protectedby */
 	unlink(lfc_path);

-	fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
+	fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
 	if (fd < 0)
 		elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
 	else
@@ -181,15 +170,13 @@ lfc_maybe_disabled(void)
 static bool
 lfc_ensure_opened(void)
 {
-	bool		enabled = !lfc_maybe_disabled();
-
+	bool enabled = !lfc_maybe_disabled();
 	/* Open cache file if not done yet */
 	if (lfc_desc <= 0 && enabled)
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR);

-		if (lfc_desc < 0)
-		{
+		if (lfc_desc < 0) {
 			lfc_disable("open");
 			return false;
 		}
@@ -200,7 +187,7 @@ lfc_ensure_opened(void)
 static void
 lfc_shmem_startup(void)
 {
-	bool		found;
+	bool found;
 	static HASHCTL info;

 	if (prev_shmem_startup_hook)
@@ -210,22 +197,17 @@ lfc_shmem_startup(void)

 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

-	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
 	{
-		int			fd;
-		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
-
-		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
+		int fd;
+		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
-
-		/*
-		 * lfc_size+1 because we add new element to hash table before eviction
-		 * of victim
-		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 lfc_size + 1, lfc_size + 1,
+								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
+								 lfc_size+1, lfc_size+1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -237,7 +219,7 @@ lfc_shmem_startup(void)
 		dlist_init(&lfc_ctl->lru);

 		/* Recreate file cache on restart */
-		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
+		fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
 		if (fd < 0)
 		{
 			elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
@@ -260,7 +242,7 @@ lfc_shmem_request(void)
 		prev_shmem_request_hook();
 #endif

-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
 	RequestNamedLWLockTranche("lfc_lock", 1);
 }

@@ -268,11 +250,9 @@ static bool
 is_normal_backend(void)
 {
 	/*
-	 * Stats collector detach shared memory, so we should not try to access
-	 * shared memory here. Parallel workers first assign default value (0), so
-	 * not perform truncation in parallel workers. The Postmaster can handle
-	 * SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
-	 * but has no PGPROC.
+	 * Stats collector detach shared memory, so we should not try to access shared memory here.
+	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
+	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
 	return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
 }
@@ -291,7 +271,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 static void
 lfc_change_limit_hook(int newval, void *extra)
 {
-	uint32		new_size = SIZE_MB_TO_CHUNKS(newval);
+	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);

 	if (!is_normal_backend())
 		return;
@@ -303,15 +283,11 @@ lfc_change_limit_hook(int newval, void *extra)

 	while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
 	{
-		/*
-		 * Shrink cache by throwing away least recently accessed chunks and
-		 * returning their space to file system
-		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
-
+		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
+		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
 		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
-		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
 			elog(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -338,7 +314,7 @@ lfc_init(void)
 							"Maximal size of Neon local file cache",
 							NULL,
 							&lfc_max_size,
-							0,	/* disabled by default */
+							0, /* disabled by default */
 							0,
 							INT_MAX,
 							PGC_POSTMASTER,
@@ -351,7 +327,7 @@ lfc_init(void)
 							"Current limit for size of Neon local file cache",
 							NULL,
 							&lfc_size_limit,
-							0,	/* disabled by default */
+							0, /* disabled by default */
 							0,
 							INT_MAX,
 							PGC_SIGHUP,
@@ -391,18 +367,18 @@ lfc_init(void)
 bool
 lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	bool		found = false;
-	uint32		hash;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool found = false;
+	uint32 hash;

-	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
 		return false;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -421,13 +397,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 void
 lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	uint32		hash;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;

-	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
 		return;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -462,10 +438,9 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	 */
 	if (entry->bitmap[chunk_offs >> 5] == 0)
 	{
-		bool		has_remaining_pages;
+		bool has_remaining_pages;

-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
-		{
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
 			if (entry->bitmap[i] != 0)
 			{
 				has_remaining_pages = true;
@@ -474,8 +449,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		}

 		/*
-		 * Put the entry at the position that is first to be reclaimed when we
-		 * have no cached pages remaining in the chunk
+		 * Put the entry at the position that is first to be reclaimed when
+		 * we have no cached pages remaining in the chunk
 		 */
 		if (!has_remaining_pages)
 		{
@@ -501,16 +476,16 @@ bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 char *buffer)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	ssize_t		rc;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	bool		result = true;
-	uint32		hash;
-	uint64		generation;
-	uint32		entry_offset;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool result = true;
+	uint32 hash;
+	uint64 generation;
+	uint32 entry_offset;

-	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
 		return false;

 	if (!lfc_ensure_opened())
@@ -518,7 +493,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -545,7 +520,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	LWLockRelease(lfc_lock);

-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("read");
@@ -576,29 +551,30 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
 void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
+		  char *buffer)
 #else
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
+		  const void *buffer)
 #endif
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	ssize_t		rc;
-	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	uint32		hash;
-	uint64		generation;
-	uint32		entry_offset;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+	uint64 generation;
+	uint32 entry_offset;

-	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
 		return;

 	if (!lfc_ensure_opened())
 		return;

 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	hash = get_hash_value(lfc_hash, &tag);

@@ -614,36 +590,24 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void

 	if (found)
 	{
-		/*
-		 * Unlink entry from LRU list to pin it for the duration of IO
-		 * operation
-		 */
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
 		if (entry->access_count++ == 0)
 			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
 		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO
-		 * operations):
-		 *
-		 * 1) Wait until some of this operation is completed and pages is
-		 * unpinned.
-		 *
-		 * 2) Allocate one more chunk, so that specified cache size is more
-		 * recommendation than hard limit.
-		 *
-		 * As far as probability of such event (that all pages are pinned) is
-		 * considered to be very very small: there are should be very large
-		 * number of concurrent IO operations and them are limited by
-		 * max_connections, we prefer not to complicate code and use second
-		 * approach.
+		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
+		 * 1. Wait until some of this operation is completed and pages is unpinned
+		 * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
+		 * As far as probability of such event (that all pages are pinned) is considered to be very very small:
+		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
+		 * we prefer not to complicate code and use second approach.
 		 */
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
-
+			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -652,8 +616,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		else
 		{
 			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-												 * of file */
+			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
 		}
 		entry->access_count = 1;
 		entry->hash = hash;
@@ -665,7 +628,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	lfc_ctl->writes += 1;
 	LWLockRelease(lfc_lock);

-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("write");
@@ -702,13 +665,13 @@ Datum
 neon_get_lfc_stats(PG_FUNCTION_ARGS)
 {
 	FuncCallContext *funcctx;
-	NeonGetStatsCtx *fctx;
+	NeonGetStatsCtx* fctx;
 	MemoryContext oldcontext;
 	TupleDesc	tupledesc;
 	Datum		result;
 	HeapTuple	tuple;
-	char const *key;
-	uint64		value;
+	char const* key;
+	uint64      value;
 	Datum		values[NUM_NEON_GET_STATS_COLS];
 	bool		nulls[NUM_NEON_GET_STATS_COLS];

@@ -720,7 +683,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

 		/* Create a user function context for cross-call persistence */
-		fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));
+		fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));

 		/* Construct a tuple descriptor for the result rows. */
 		tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
@@ -741,7 +704,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 	funcctx = SRF_PERCALL_SETUP();

 	/* Get the saved state */
-	fctx = (NeonGetStatsCtx *) funcctx->user_fctx;
+	fctx = (NeonGetStatsCtx*) funcctx->user_fctx;

 	switch (funcctx->call_cntr)
 	{
@@ -829,9 +792,9 @@ local_cache_pages(PG_FUNCTION_ARGS)

 	if (SRF_IS_FIRSTCALL())
 	{
-		HASH_SEQ_STATUS status;
-		FileCacheEntry *entry;
-		uint32		n_pages = 0;
+        HASH_SEQ_STATUS status;
+		FileCacheEntry* entry;
+		uint32 n_pages = 0;

 		funcctx = SRF_FIRSTCALL_INIT();

@@ -888,7 +851,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
+					for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
@@ -907,11 +870,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		if (n_pages != 0)
 		{
 			/*
-			 * Scan through all the cache entries, saving the relevant fields
-			 * in the fctx->record structure.
+			 * Scan through all the cache entries, saving the relevant fields in the
+			 * fctx->record structure.
 			 */
-			uint32		n = 0;
-
+			uint32 n = 0;
 			hash_seq_init(&status, lfc_hash);
 			while ((entry = hash_seq_search(&status)) != NULL)
 			{
@@ -919,7 +881,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				{
 					if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 					{
-						fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
+						fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
 						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -69,9 +69,9 @@ int			max_reconnect_attempts = 60;

 typedef struct
 {
-	LWLockId	lock;
-	pg_atomic_uint64 update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+    LWLockId lock;
+    pg_atomic_uint64 update_counter;
+    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;

 #if PG_VERSION_NUM >= 150000
@@ -83,7 +83,7 @@ static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];

-bool		(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);
@@ -91,43 +91,43 @@ static void pageserver_disconnect(void);
 static bool
 PagestoreShmemIsValid()
 {
-	return pagestore_shared && UsedShmemSegAddr;
+    return pagestore_shared && UsedShmemSegAddr;
 }

 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
 }

 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-	LWLockRelease(pagestore_shared->lock);
+    if(!PagestoreShmemIsValid())
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
+    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
 CheckConnstringUpdated()
 {
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+    if(!PagestoreShmemIsValid())
+        return false;
+    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
 }

 static void
 ReloadConnstring()
 {
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-	LWLockRelease(pagestore_shared->lock);
+    if(!PagestoreShmemIsValid())
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
+    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
+    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -141,20 +141,21 @@ pageserver_connect(int elevel)

 	Assert(!connected);

-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+        if(CheckConnstringUpdated())
+        {
+            ReloadConnstring();
+        }

 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
 	 * variable was set, use that as the password.
 	 *
-	 * The connection options are parsed in the order they're given, so when
-	 * we set the password before the connection string, the connection string
-	 * can override the password from the env variable. Seems useful, although
-	 * we don't currently use that capability anywhere.
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the password from the env variable.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
 	 */
 	n = 0;
 	if (neon_auth_token)
@@ -197,9 +198,9 @@ pageserver_connect(int elevel)

 	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
 	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
-					  MyLatch, NULL);
+			  MyLatch, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-					  NULL, NULL);
+			  NULL, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);

 	while (PQisBusy(pageserver_conn))
@@ -264,7 +265,6 @@ retry:
 			if (!PQconsumeInput(pageserver_conn))
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 				neon_log(LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
@@ -305,15 +305,15 @@ pageserver_disconnect(void)
 }

 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+        if(CheckConnstringUpdated())
+        {
+            pageserver_disconnect();
+            ReloadConnstring();
+        }

 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
@@ -326,12 +326,10 @@ pageserver_send(NeonRequest *request)

 	/*
 	 * If pageserver is stopped, the connections from compute node are broken.
-	 * The compute node doesn't notice that immediately, but it will cause the
-	 * next request to fail, usually on the next query. That causes
-	 * user-visible errors if pageserver is restarted, or the tenant is moved
-	 * from one pageserver to another. See
-	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
-	 * connection in case of failure.
+	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
+	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
+	 * See https://github.com/neondatabase/neon/issues/1138
+	 * So try to reestablish connection in case of failure.
 	 */
 	if (!connected)
 	{
@@ -355,7 +353,6 @@ pageserver_send(NeonRequest *request)
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 		pageserver_disconnect();
 		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
@@ -413,8 +410,7 @@ pageserver_receive(void)
 		}
 		else if (rc == -2)
 		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
+			char* msg = pchomp(PQerrorMessage(pageserver_conn));
 			pageserver_disconnect();
 			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
@@ -448,7 +444,6 @@ pageserver_flush(void)
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 			pageserver_disconnect();
 			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
@@ -476,47 +471,46 @@ check_neon_id(char **newval, void **extra, GucSource source)
 static Size
 PagestoreShmemSize(void)
 {
-	return sizeof(PagestoreShmemState);
+    return sizeof(PagestoreShmemState);
 }

 static bool
 PagestoreShmemInit(void)
 {
-	bool		found;
-
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-	pagestore_shared = ShmemInitStruct("libpagestore shared state",
-									   PagestoreShmemSize(),
-									   &found);
-	if (!found)
-	{
-		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-		AssignPageserverConnstring(page_server_connstring, NULL);
-	}
-	LWLockRelease(AddinShmemInitLock);
-	return found;
+    bool found;
+    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+    pagestore_shared = ShmemInitStruct("libpagestore shared state",
+                                       PagestoreShmemSize(),
+                                       &found);
+    if(!found)
+    {
+        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
+        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+        AssignPageserverConnstring(page_server_connstring, NULL);
+    }
+    LWLockRelease(AddinShmemInitLock);
+    return found;
 }

 static void
 pagestore_shmem_startup_hook(void)
 {
-	if (prev_shmem_startup_hook)
-		prev_shmem_startup_hook();
+    if(prev_shmem_startup_hook)
+        prev_shmem_startup_hook();

-	PagestoreShmemInit();
+    PagestoreShmemInit();
 }

 static void
 pagestore_shmem_request(void)
 {
 #if PG_VERSION_NUM >= 150000
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
+    if(prev_shmem_request_hook)
+        prev_shmem_request_hook();
 #endif

-	RequestAddinShmemSpace(PagestoreShmemSize());
-	RequestNamedLWLockTranche("neon_libpagestore", 1);
+    RequestAddinShmemSpace(PagestoreShmemSize());
+    RequestNamedLWLockTranche("neon_libpagestore", 1);
 }

 static void
@@ -526,7 +520,7 @@ pagestore_prepare_shmem(void)
 	prev_shmem_request_hook = shmem_request_hook;
 	shmem_request_hook = pagestore_shmem_request;
 #else
-	pagestore_shmem_request();
+        pagestore_shmem_request();
 #endif
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = pagestore_shmem_startup_hook;
@@ -538,7 +532,7 @@ pagestore_prepare_shmem(void)
 void
 pg_init_libpagestore(void)
 {
-	pagestore_prepare_shmem();
+        pagestore_prepare_shmem();

 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
@@ -613,10 +607,7 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;

-	/*
-	 * Retrieve the auth token to use when connecting to pageserver and
-	 * safekeepers
-	 */
+	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
 	neon_auth_token = getenv("NEON_AUTH_TOKEN");
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -48,11 +48,9 @@ _PG_init(void)

 	pg_init_extension_server();

-	/*
-	 * Important: This must happen after other parts of the extension are
-	 * loaded, otherwise any settings to GUCs that were set before the
-	 * extension was loaded will be removed.
-	 */
+	// Important: This must happen after other parts of the extension
+	// are loaded, otherwise any settings to GUCs that were set before
+	// the extension was loaded will be removed.
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
 * block_id; false otherwise.
 */
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -59,7 +59,7 @@

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers

-#else							/* major version >= 16 */
+#else /* major version >= 16 */

 #define USE_RELFILELOCATOR

@@ -109,4 +109,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

-#endif							/* NEON_PGVERSIONCOMPAT_H */
+#endif //NEON_PGVERSIONCOMPAT_H
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -40,13 +40,13 @@ typedef enum
 	T_NeonGetPageResponse,
 	T_NeonErrorResponse,
 	T_NeonDbSizeResponse,
-} NeonMessageTag;
+}			NeonMessageTag;

 /* base struct for c-style inheritance */
 typedef struct
 {
 	NeonMessageTag tag;
-} NeonMessage;
+}			NeonMessage;

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

@@ -67,27 +67,27 @@ typedef struct
 	NeonMessageTag tag;
 	bool		latest;			/* if true, request latest page version */
 	XLogRecPtr	lsn;			/* request page version @ this LSN */
-} NeonRequest;
+}			NeonRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-} NeonExistsRequest;
+}			NeonExistsRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-} NeonNblocksRequest;
+}			NeonNblocksRequest;

 typedef struct
 {
 	NeonRequest req;
 	Oid			dbNode;
-} NeonDbSizeRequest;
+}			NeonDbSizeRequest;

 typedef struct
 {
@@ -95,31 +95,31 @@ typedef struct
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber blkno;
-} NeonGetPageRequest;
+}			NeonGetPageRequest;

 /* supertype of all the Neon*Response structs below */
 typedef struct
 {
 	NeonMessageTag tag;
-} NeonResponse;
+}			NeonResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	bool		exists;
-} NeonExistsResponse;
+}			NeonExistsResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	uint32		n_blocks;
-} NeonNblocksResponse;
+}			NeonNblocksResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		page[FLEXIBLE_ARRAY_MEMBER];
-} NeonGetPageResponse;
+}			NeonGetPageResponse;

 #define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))

@@ -127,18 +127,18 @@ typedef struct
 {
 	NeonMessageTag tag;
 	int64		db_size;
-} NeonDbSizeResponse;
+}			NeonDbSizeResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
 												 * message */
-} NeonErrorResponse;
+}			NeonErrorResponse;

-extern StringInfoData nm_pack_request(NeonRequest *msg);
-extern NeonResponse *nm_unpack_response(StringInfo s);
-extern char *nm_to_string(NeonMessage *msg);
+extern StringInfoData nm_pack_request(NeonRequest * msg);
+extern NeonResponse * nm_unpack_response(StringInfo s);
+extern char *nm_to_string(NeonMessage * msg);

 /*
 * API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage *msg);

 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
+	bool		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
 	bool		(*flush) (void);
-} page_server_api;
+}			page_server_api;

 extern void prefetch_on_ps_disconnect(void);

-extern page_server_api *page_server;
+extern page_server_api * page_server;

 extern char *page_server_connstring;
-extern int	flush_every_n_requests;
-extern int	readahead_buffer_size;
+extern int flush_every_n_requests;
+extern int readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
-extern int	seqscan_prefetch_distance;
+extern int seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, bool request_latest, char *buffer);
+							 XLogRecPtr request_lsn, bool request_latest, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, bool request_latest, void *buffer);
+							 XLogRecPtr request_lsn, bool request_latest, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,7 +59,6 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
-#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -101,21 +100,21 @@ typedef enum
 	UNLOGGED_BUILD_PHASE_1,
 	UNLOGGED_BUILD_PHASE_2,
 	UNLOGGED_BUILD_NOT_PERMANENT
-} UnloggedBuildPhase;
+}			UnloggedBuildPhase;

 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 /*
 * Prefetch implementation:
- *
+ * 
 * Prefetch is performed locally by each backend.
 *
 * There can be up to readahead_buffer_size active IO requests registered at
 * any time. Requests using smgr_prefetch are sent to the pageserver, but we
 * don't wait on the response. Requests using smgr_read are either read from
 * the buffer, or (if that's not possible) we wait on the response to arrive -
- * this also will allow us to receive other prefetched pages.
+ * this also will allow us to receive other prefetched pages. 
 * Each request is immediately written to the output buffer of the pageserver
 * connection, but may not be flushed if smgr_prefetch is used: pageserver
 * flushes sent requests on manual flush, or every neon.flush_output_after
@@ -139,7 +138,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 /*
 * State machine:
- *
+ *        
 * not in hash : in hash
 *             :
 * UNUSED ------> REQUESTED --> RECEIVED
@@ -150,34 +149,30 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 *   +----------------+------------+
 *             :
 */
-typedef enum PrefetchStatus
-{
-	PRFS_UNUSED = 0,			/* unused slot */
-	PRFS_REQUESTED,				/* request was written to the sendbuffer to
-								 * PS, but not necessarily flushed. all fields
-								 * except response valid */
-	PRFS_RECEIVED,				/* all fields valid */
-	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
-								 * valid */
+typedef enum PrefetchStatus {
+	PRFS_UNUSED = 0,	/* unused slot */
+	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
+						 * necessarily flushed.
+						 * all fields except response valid */
+	PRFS_RECEIVED,		/* all fields valid */
+	PRFS_TAG_REMAINS,	/* only buftag and my_ring_index are still valid */
 } PrefetchStatus;

-typedef struct PrefetchRequest
-{
-	BufferTag	buftag;			/* must be first entry in the struct */
+typedef struct PrefetchRequest {
+	BufferTag	buftag; /* must be first entry in the struct */
 	XLogRecPtr	effective_request_lsn;
 	XLogRecPtr	actual_request_lsn;
-	NeonResponse *response;		/* may be null */
+	NeonResponse *response; /* may be null */
 	PrefetchStatus status;
 	uint64		my_ring_index;
 } PrefetchRequest;

 /* prefetch buffer lookup hash table */

-typedef struct PrfHashEntry
-{
+typedef struct PrfHashEntry {
 	PrefetchRequest *slot;
-	uint32		status;
-	uint32		hash;
+	uint32 status;
+	uint32 hash;
 } PrfHashEntry;

 #define SH_PREFIX			prfh
@@ -201,42 +196,36 @@ typedef struct PrfHashEntry
 /*
 * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
 * It maintains a (ring) buffer of in-flight requests and responses.
- *
+ * 
 * We maintain several indexes into the ring buffer:
 * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
- *
+ * 
 * ring_unused points to the first unused slot of the buffer
 * ring_receive is the next request that is to be received
 * ring_last is the oldest received entry in the buffer
- *
+ * 
 * Apart from being an entry in the ring buffer of prefetch requests, each
 * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
 */
-typedef struct PrefetchState
-{
-	MemoryContext bufctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext errctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext hashctx;		/* context for prf_buffer */
+typedef struct PrefetchState {
+	MemoryContext bufctx; /* context for prf_buffer[].response allocations */
+	MemoryContext errctx; /* context for prf_buffer[].response allocations */
+	MemoryContext hashctx; /* context for prf_buffer */

 	/* buffer indexes */
-	uint64		ring_unused;	/* first unused slot */
-	uint64		ring_flush;		/* next request to flush */
-	uint64		ring_receive;	/* next slot that is to receive a response */
-	uint64		ring_last;		/* min slot with a response value */
+	uint64	ring_unused;		/* first unused slot */
+	uint64	ring_flush;			/* next request to flush */
+	uint64	ring_receive;		/* next slot that is to receive a response */
+	uint64	ring_last;			/* min slot with a response value */

 	/* metrics / statistics  */
-	int			n_responses_buffered;	/* count of PS responses not yet in
-										 * buffers */
-	int			n_requests_inflight;	/* count of PS requests considered in
-										 * flight */
-	int			n_unused;		/* count of buffers < unused, > last, that are
-								 * also unused */
+	int		n_responses_buffered;	/* count of PS responses not yet in buffers */
+	int		n_requests_inflight;	/* count of PS requests considered in flight */
+	int		n_unused;				/* count of buffers < unused, > last, that are also unused */

 	/* the buffers */
-	prfh_hash  *prf_hash;
-	PrefetchRequest prf_buffer[];	/* prefetch buffers */
+	prfh_hash *prf_hash;
+	PrefetchRequest prf_buffer[]; /* prefetch buffers */
 } PrefetchState;

 PrefetchState *MyPState;
@@ -274,10 +263,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 static bool
 compact_prefetch_buffers(void)
 {
-	uint64		empty_ring_index = MyPState->ring_last;
-	uint64		search_ring_index = MyPState->ring_receive;
-	int			n_moved = 0;
-
+	uint64	empty_ring_index = MyPState->ring_last;
+	uint64	search_ring_index = MyPState->ring_receive;
+	int n_moved = 0;
+	
 	if (MyPState->ring_receive == MyPState->ring_last)
 		return false;

@@ -292,14 +281,15 @@ compact_prefetch_buffers(void)
 	}

 	/*
-	 * Here we have established: slots < search_ring_index have an unknown
-	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
-	 * are unused slots > empty_ring_index are in use, or outside our buffer's
-	 * range. ... unless search_ring_index <= ring_last
-	 *
+	 * Here we have established:
+	 *   slots < search_ring_index have an unknown state (not scanned)
+	 *   slots >= search_ring_index and <= empty_ring_index are unused
+	 *   slots > empty_ring_index are in use, or outside our buffer's range.
+	 * ... unless search_ring_index <= ring_last
+	 * 
 	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as
-	 * we hit more unused items while moving backwards through the array.
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as we hit
+	 * more unused items while moving backwards through the array.
 	 */

 	while (search_ring_index > MyPState->ring_last)
@@ -339,10 +329,7 @@ compact_prefetch_buffers(void)

 		/* empty the moved slot */
 		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag)
-		{
-			0
-		};
+		source_slot->buftag = (BufferTag) {0};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
 		source_slot->effective_request_lsn = 0;
@@ -352,8 +339,8 @@ compact_prefetch_buffers(void)
 	}

 	/*
-	 * Only when we've moved slots we can expect trailing unused slots, so
-	 * only then we clean up trailing unused slots.
+	 * Only when we've moved slots we can expect trailing unused slots,
+	 * so only then we clean up trailing unused slots.
 	 */
 	if (n_moved > 0)
 	{
@@ -370,9 +357,10 @@ readahead_buffer_resize(int newsize, void *extra)
 	uint64		end,
 				nfree = newsize;
 	PrefetchState *newPState;
-	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
-		(sizeof(PrefetchRequest) * newsize);
-
+	Size 		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * newsize
+	);
+	
 	/* don't try to re-initialize if we haven't initialized yet */
 	if (MyPState == NULL)
 		return;
@@ -399,12 +387,12 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_receive = newsize;
 	newPState->ring_flush = newsize;

-	/*
+	/* 
 	 * Copy over the prefetches.
-	 *
+	 * 
 	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one
-	 * iteration on the dataset, and trivial compaction.
+	 * prefetches, but this has the benefit of only needing to do one iteration
+	 * on the dataset, and trivial compaction.
 	 */
 	for (end = MyPState->ring_unused - 1;
 		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
@@ -412,7 +400,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
 		PrefetchRequest *newslot;
-		bool		found;
+		bool	found;

 		if (slot->status == PRFS_UNUSED)
 			continue;
@@ -475,11 +463,10 @@ consume_prefetch_responses(void)
 static void
 prefetch_cleanup_trailing_unused(void)
 {
-	uint64		ring_index;
+	uint64	ring_index;
 	PrefetchRequest *slot;

-	while (MyPState->ring_last < MyPState->ring_receive)
-	{
+	while (MyPState->ring_last < MyPState->ring_receive) {
 		ring_index = MyPState->ring_last;
 		slot = GetPrfSlot(ring_index);

@@ -493,7 +480,7 @@ prefetch_cleanup_trailing_unused(void)
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
- *
+ * 
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */
@@ -525,7 +512,7 @@ prefetch_wait_for(uint64 ring_index)

 /*
 * Read the response of a prefetch request into its slot.
- *
+ * 
 * The caller is responsible for making sure that the request for this buffer
 * was flushed to the PageServer.
 *
@@ -565,7 +552,7 @@ prefetch_read(PrefetchRequest *slot)

 /*
 * Disconnect hook - drop prefetches when the connection drops
- *
+ * 
 * If we don't remove the failed prefetches, we'd be serving incorrect
 * data to the smgr.
 */
@@ -576,7 +563,7 @@ prefetch_on_ps_disconnect(void)
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
-		uint64		ring_index = MyPState->ring_receive;
+		uint64 ring_index = MyPState->ring_receive;

 		slot = GetPrfSlot(ring_index);

@@ -606,7 +593,7 @@ prefetch_set_unused(uint64 ring_index)
 	PrefetchRequest *slot = GetPrfSlot(ring_index);

 	if (ring_index < MyPState->ring_last)
-		return;					/* Should already be unused */
+		return; /* Should already be unused */

 	Assert(MyPState->ring_unused > ring_index);

@@ -637,11 +624,7 @@ prefetch_set_unused(uint64 ring_index)
 	/* run cleanup if we're holding back ring_last */
 	if (MyPState->ring_last == ring_index)
 		prefetch_cleanup_trailing_unused();
-
-	/*
-	 * ... and try to store the buffered responses more compactly if > 12.5%
-	 * of the buffer is gaps
-	 */
+	/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
 	else if (ReceiveBufferNeedsCompaction())
 		compact_prefetch_buffers();
 }
@@ -649,7 +632,7 @@ prefetch_set_unused(uint64 ring_index)
 static void
 prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	bool		found;
+	bool found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
@@ -667,22 +650,21 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	}
 	else
 	{
-		XLogRecPtr	lsn = neon_get_request_lsn(
-											   &request.req.latest,
-											   BufTagGetNRelFileInfo(slot->buftag),
-											   slot->buftag.forkNum,
-											   slot->buftag.blockNum
-			);
-
+		XLogRecPtr lsn = neon_get_request_lsn(
+			&request.req.latest,
+			BufTagGetNRelFileInfo(slot->buftag),
+			slot->buftag.forkNum,
+			slot->buftag.blockNum
+		);
 		/*
-		 * Note: effective_request_lsn is potentially higher than the
-		 * requested LSN, but still correct:
-		 *
+		 * Note: effective_request_lsn is potentially higher than the requested
+		 * LSN, but still correct:
+		 * 
 		 * We know there are no changes between the actual requested LSN and
 		 * the value of effective_request_lsn: If there were, the page would
-		 * have been in cache and evicted between those LSN values, which then
-		 * would have had to result in a larger request LSN for this page.
-		 *
+		 * have been in cache and evicted between those LSN values, which
+		 * then would have had to result in a larger request LSN for this page.
+		 * 
 		 * It is possible that a concurrent backend loads the page, modifies
 		 * it and then evicts it again, but the LSN of that eviction cannot be
 		 * smaller than the current WAL insert/redo pointer, which is already
@@ -719,7 +701,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 * prefetch_register_buffer() - register and prefetch buffer
 *
 * Register that we may want the contents of BufferTag in the near future.
- *
+ * 
 * If force_latest and force_lsn are not NULL, those values are sent to the
 * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
 * to fill in these values manually.
@@ -731,14 +713,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 static uint64
 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	uint64		ring_index;
+	uint64	ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;

 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-Retry:
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 	if (entry != NULL)
@@ -758,10 +740,7 @@ Retry:
 		 */
 		if (force_latest && force_lsn)
 		{
-			/*
-			 * if we want the latest version, any effective_request_lsn <
-			 * request lsn is OK
-			 */
+			/* if we want the latest version, any effective_request_lsn < request lsn is OK */
 			if (*force_latest)
 			{
 				if (*force_lsn > slot->effective_request_lsn)
@@ -772,11 +751,7 @@ Retry:
 				}

 			}
-
-			/*
-			 * if we don't want the latest version, only accept requests with
-			 * the exact same LSN
-			 */
+			/* if we don't want the latest version, only accept requests with the exact same LSN */
 			else
 			{
 				if (*force_lsn != slot->effective_request_lsn)
@@ -823,8 +798,7 @@ Retry:
 	 */
 	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
 	{
-		uint64		cleanup_index = MyPState->ring_last;
-
+		uint64 cleanup_index = MyPState->ring_last;
 		slot = GetPrfSlot(cleanup_index);

 		Assert(slot->status != PRFS_UNUSED);
@@ -839,10 +813,7 @@ Retry:
 		}
 		else
 		{
-			/*
-			 * We have the slot for ring_last, so that must still be in
-			 * progress
-			 */
+			/* We have the slot for ring_last, so that must still be in progress */
 			switch (slot->status)
 			{
 				case PRFS_REQUESTED:
@@ -861,8 +832,8 @@ Retry:
 	}

 	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-	 * we can insert the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty,
+	 * so we can insert the new request to it.
 	 */
 	ring_index = MyPState->ring_unused;
 	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
@@ -888,10 +859,7 @@ Retry:
 	{
 		if (!page_server->flush())
 		{
-			/*
-			 * Prefetch set is reset in case of error, so we should try to
-			 * register our request once again
-			 */
+			/* Prefetch set is reset in case of error, so we should try to register our request once again */
 			goto Retry;
 		}
 		MyPState->ring_flush = MyPState->ring_unused;
@@ -903,10 +871,8 @@ Retry:
 static NeonResponse *
 page_server_request(void const *req)
 {
-	NeonResponse *resp;
-
-	do
-	{
+	NeonResponse* resp;
+	do {
 		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
@@ -918,7 +884,7 @@ page_server_request(void const *req)


 StringInfoData
-nm_pack_request(NeonRequest *msg)
+nm_pack_request(NeonRequest * msg)
 {
 	StringInfoData s;

@@ -1034,7 +1000,7 @@ nm_unpack_response(StringInfo s)
 				/* XXX:	should be varlena */
 				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
 				pq_getmsgend(s);
-
+				
 				Assert(msg_resp->tag == T_NeonGetPageResponse);

 				resp = (NeonResponse *) msg_resp;
@@ -1090,7 +1056,7 @@ nm_unpack_response(StringInfo s)

 /* dump to json for debugging / error reporting purposes */
 char *
-nm_to_string(NeonMessage *msg)
+nm_to_string(NeonMessage * msg)
 {
 	StringInfoData s;

@@ -1219,7 +1185,7 @@ nm_to_string(NeonMessage *msg)
 * directly because it skips the logging if the LSN is new enough.
 */
 static XLogRecPtr
-log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
+log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 Page page, bool page_std)
 {
 	PGAlignedBlock copied_buffer;
@@ -1242,10 +1208,11 @@ PageIsEmptyHeapPage(char *buffer)
 }

 static void
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
+				 char *buffer, bool force)
 #else
-neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
+				 const char *buffer, bool force) 
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
@@ -1345,23 +1312,24 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 void
 neon_init(void)
 {
-	Size		prfs_size;
+	Size prfs_size;

 	if (MyPState != NULL)
 		return;

-	prfs_size = offsetof(PrefetchState, prf_buffer) +
-		sizeof(PrefetchRequest) * readahead_buffer_size;
+	prfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * readahead_buffer_size
+	);

 	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
-
+	
 	MyPState->n_unused = readahead_buffer_size;

 	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
 										 "NeonSMGR/prefetch",
 										 SLAB_DEFAULT_BLOCK_SIZE * 17,
 										 PS_GETPAGERESPONSE_SIZE);
-	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext, 
 											 "NeonSMGR/errors",
 											 ALLOCSET_DEFAULT_SIZES);
 	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1601,14 +1569,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
-	 * Note that in REDO, this is called to make sure the relation fork
-	 * exists, but it does not truncate the relation. So, we can only update
-	 * the relsize if it didn't exist before.
-	 *
+	 * Note that in REDO, this is called to make sure the relation fork exists,
+	 * but it does not truncate the relation. So, we can only update the
+	 * relsize if it didn't exist before.
+	 * 
 	 * Also, in redo, we must make sure to update the cached size of the
-	 * relation, as that is the primary source of truth for REDO's file length
-	 * considerations, and as file extension isn't (perfectly) logged, we need
-	 * to take care of that before we hit file size checks.
+	 * relation, as that is the primary source of truth for REDO's
+	 * file length considerations, and as file extension isn't (perfectly)
+	 * logged, we need to take care of that before we hit file size checks.
 	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1684,7 +1652,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #endif
 {
 	XLogRecPtr	lsn;
-	BlockNumber n_blocks = 0;
+	BlockNumber	n_blocks = 0;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1725,10 +1693,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/*
-	 * Usually Postgres doesn't extend relation on more than one page (leaving
-	 * holes). But this rule is violated in PG-15 where
-	 * CreateAndCopyRelationData call smgrextend for destination relation n
-	 * using size of source relation
+	 * Usually Postgres doesn't extend relation on more than one page
+	 * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
+	 * call smgrextend for destination relation n using size of source relation
 	 */
 	n_blocks = neon_nblocks(reln, forkNum);
 	while (n_blocks < blkno)
@@ -1749,13 +1716,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif
-
 	/*
-	 * smgr_extend is often called with an all-zeroes page, so
-	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
-	 * later, after it has been initialized with the real page contents, and
-	 * it is eventually evicted from the buffer cache. But we need a valid LSN
-	 * to the relation metadata update now.
+	 * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
+	 * An smgr_write() call will come for the buffer later, after it has been initialized
+	 * with the real page contents, and it is eventually evicted from the buffer cache.
+	 * But we need a valid LSN to the relation metadata update now.
 	 */
 	if (lsn == InvalidXLogRecPtr)
 	{
@@ -1814,9 +1779,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forkNum),
-						InvalidBlockNumber)));
+					errmsg("cannot extend file \"%s\" beyond %u blocks",
+						   relpath(reln->smgr_rlocator, forkNum),
+						   InvalidBlockNumber)));

 	/* Don't log any pages if we're not allowed to do so. */
 	if (!XLogInsertAllowed())
@@ -1898,12 +1863,12 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;

 	switch (reln->smgr_relpersistence)
 	{
-		case 0:					/* probably shouldn't happen, but ignore it */
+		case 0: /* probably shouldn't happen, but ignore it */
 		case RELPERSISTENCE_PERMANENT:
 			break;

@@ -1918,9 +1883,10 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
 		return false;

-	tag.forkNum = forknum;
-	tag.blockNum = blocknum;
-
+	tag = (BufferTag) {
+		.forkNum = forknum,
+		.blockNum = blocknum
+	};
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

 	ring_index = prefetch_register_buffer(tag, NULL, NULL);
@@ -1973,21 +1939,23 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 * While function is defined in the neon extension it's used within neon_test_utils directly.
 * To avoid breaking tests in the runtime please keep function signature in sync.
 */
-void
 #if PG_MAJORVERSION_NUM < 16
+void PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 #else
+void PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
 #endif
 {
 	NeonResponse *resp;
+	BufferTag	buftag;
 	uint64		ring_index;
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
-	BufferTag	buftag =
-	{
+
+	buftag = (BufferTag) {
 		.forkNum = forkNum,
 		.blockNum = blkno,
 	};
@@ -1996,11 +1964,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	/*
 	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the page
-	 * after redo has decided it won't redo that page and updated the LwLSN
-	 * for that page. If we're in hot standby we need to take care that we
-	 * don't return until after REDO has finished replaying up to that LwLSN,
-	 * as the page should have been locked up to that point.
+	 * not in the shared buffers, so a concurrent process may request the
+	 * page after redo has decided it won't redo that page and updated the
+	 * LwLSN for that page.
+	 * If we're in hot standby we need to take care that we don't return
+	 * until after REDO has finished replaying up to that LwLSN, as the page
+	 * should have been locked up to that point.
 	 *
 	 * See also the description on neon_redo_read_buffer_filter below.
 	 *
@@ -2008,7 +1977,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * concurrent failed read IOs. Those IOs should never have a request_lsn
 	 * that is as large as the WAL record we're currently replaying, if it
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
-	 * value of the LwLsn cache when the entry is not found.
+	 * value of the LwLsn cache when the entry is not found. 
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
 		XLogWaitForReplayOf(request_lsn);
@@ -2026,14 +1995,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
 		}
-		else					/* the current prefetch LSN is not large
-								 * enough, so drop the prefetch */
+		else /* the current prefetch LSN is not large enough, so drop the prefetch */
 		{
 			/*
 			 * We can't drop cache for not-yet-received requested items. It is
-			 * unlikely this happens, but it can happen if prefetch distance
-			 * is large enough and a backend didn't consume all prefetch
-			 * requests.
+			 * unlikely this happens, but it can happen if prefetch distance is
+			 * large enough and a backend didn't consume all prefetch requests.
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
@@ -2060,11 +2027,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		else
 		{
 			/*
-			 * Empty our reference to the prefetch buffer's hash entry. When
-			 * we wait for prefetches, the entry reference is invalidated by
-			 * potential updates to the hash, and when we reconnect to the
-			 * pageserver the prefetch we're waiting for may be dropped, in
-			 * which case we need to retry and take the branch above.
+			 * Empty our reference to the prefetch buffer's hash entry.
+			 * When we wait for prefetches, the entry reference is invalidated by 
+			 * potential updates to the hash, and when we reconnect to the 
+			 * pageserver the prefetch we're waiting for may be dropped,
+			 * in which case we need to retry and take the branch above.
 			 */
 			entry = NULL;
 		}
@@ -2112,10 +2079,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 *	neon_read() -- Read the specified block from a relation.
 */
 void
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
+		  char *buffer)
 #else
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
+		  void *buffer)
 #endif
 {
 	bool		latest;
@@ -2250,10 +2218,11 @@ hexdump_page(char *page)
 *		use mdextend().
 */
 void
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
+		   char *buffer, bool skipFsync)
 #else
-neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
+		   const void *buffer, bool skipFsync)
 #endif
 {
 	XLogRecPtr	lsn;
@@ -2753,90 +2722,9 @@ smgr_init_neon(void)
 }


-static void
-neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
-{
-	BlockNumber relsize;
-
-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
-	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
-	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of
-		 * the relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the
-		 * block, which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		relsize = Max(nbresponse->n_blocks, blkno + 1);
-
-		set_cached_relsize(rinfo, forknum, relsize);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", relsize);
-	}
-}
-
-#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
-
-/*
- * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
- */
-static BlockNumber
-get_fsm_physical_block(BlockNumber heapblk)
-{
-	BlockNumber pages;
-	int			leafno;
-	int			l;
-
-	/*
-	 * Calculate the logical page number of the first leaf page below the
-	 * given page.
-	 */
-	leafno = heapblk / SlotsPerFSMPage;
-
-	/* Count upper level nodes required to address the leaf page */
-	pages = 0;
-	for (l = 0; l < FSM_TREE_DEPTH; l++)
-	{
-		pages += leafno + 1;
-		leafno /= SlotsPerFSMPage;
-	}
-
-	/* Turn the page count into 0-based block number */
-	return pages - 1;
-}
-
-
 /*
 * Return whether we can skip the redo for this block.
- *
+ * 
 * The conditions for skipping the IO are:
 *
 * - The block is not in the shared buffers, and
@@ -2875,12 +2763,13 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	XLogRecPtr	end_recptr = record->EndRecPtr;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-	BlockNumber blkno;
+	BlockNumber	blkno;
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
+	BlockNumber relsize;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2894,8 +2783,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	/*
 	 * Out of an abundance of caution, we always run redo on shared catalogs,
-	 * regardless of whether the block is stored in shared buffers. See also
-	 * this function's top comment.
+	 * regardless of whether the block is stored in shared buffers.
+	 * See also this function's top comment.
 	 */
 	if (!OidIsValid(NInfoGetDbOid(rinfo)))
 		return false;
@@ -2921,9 +2810,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	/* In both cases st lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

-	/*
-	 * we don't have the buffer in memory, update lwLsn past this record, also
-	 * evict page fro file cache
+	/* we don't have the buffer in memory, update lwLsn past this record,
+	 * also evict page from file cache
 	 */
 	if (no_redo_needed)
 		lfc_evict(rinfo, forknum, blkno);
@@ -2931,10 +2819,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	LWLockRelease(partitionLock);

-	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
-	if (forknum == MAIN_FORKNUM)
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
-		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
 	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
 	return no_redo_needed;
 }
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp)
 	if (wp->propTermHistory.entries != NULL)
 		pfree(wp->propTermHistory.entries);
 	wp->propTermHistory.entries = NULL;
-
+	
 	pfree(wp);
 }

@@ -275,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
 											   wp->config->safekeeper_connection_timeout))
 				{
 					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-								sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
+						 sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -395,7 +395,7 @@ ResetConnection(Safekeeper *sk)
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
 		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+			 sk->host, sk->port, wp->api.conn_error_message(sk));

 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -489,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_OFFLINE:
 			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
-						sk->host, sk->port);
+				 sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */

@@ -525,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_VOTING:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state));
+				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -554,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_IDLE:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state));
+				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -580,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	{
 		case WP_CONN_POLLING_OK:
 			walprop_log(LOG, "connected with node %s:%s", sk->host,
-						sk->port);
+				 sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 			/*
@@ -604,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)

 		case WP_CONN_POLLING_FAILED:
 			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+				 sk->host, sk->port, wp->api.conn_error_message(sk));

 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -641,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
 		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+			 sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -678,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)

 		case WP_EXEC_FAILED:
 			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+				 sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;

@@ -689,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
 			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
-						sk->host, sk->port);
+				 sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -758,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	{
 		/* Another compute with higher term is running. */
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->greetResponse.term, wp->propTerm);
+			 sk->host, sk->port,
+			 sk->greetResponse.term, wp->propTerm);
 	}

 	/*
@@ -817,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	walprop_log(LOG,
-				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+		 LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+		 LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -832,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->voteResponse.term, wp->propTerm);
+			 sk->host, sk->port,
+			 sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);

@@ -877,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
 	if (wp->truncateLsn < wp->propEpochStartLsn)
 	{
 		walprop_log(LOG,
-					"start recovery because truncateLsn=%X/%X is not "
-					"equal to epochStartLsn=%X/%X",
-					LSN_FORMAT_ARGS(wp->truncateLsn),
-					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+			 "start recovery because truncateLsn=%X/%X is not "
+			 "equal to epochStartLsn=%X/%X",
+			 LSN_FORMAT_ARGS(wp->truncateLsn),
+			 LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 		/* Perform recovery */
 		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
 			walprop_log(FATAL, "Failed to recover state");
@@ -990,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
 					walprop_log(WARNING,
-								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-								LSN_FORMAT_ARGS(wp->timelineStartLsn),
-								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+						 LSN_FORMAT_ARGS(wp->timelineStartLsn),
+						 LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -1038,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

 	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-				wp->quorum,
-				wp->propTerm,
-				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-				LSN_FORMAT_ARGS(wp->truncateLsn));
+		 wp->quorum,
+		 wp->propTerm,
+		 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+		 wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		 LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1070,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
 											walprop_shared->mineLastElectedTerm)))
 			{
 				walprop_log(PANIC,
-							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+					 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}

 	/*
-	 * WalProposer has just elected itself and initialized history, so we can
-	 * call election callback. Usually it updates truncateLsn to fetch WAL for
-	 * logical replication.
+	 * WalProposer has just elected itself and initialized history, so
+	 * we can call election callback. Usually it updates truncateLsn to
+	 * fetch WAL for logical replication.
 	 */
 	wp->api.after_election(wp);
 }
@@ -1155,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
 			sk->startStreamingAt = wp->truncateLsn;

 			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-						LSN_FORMAT_ARGS(sk->startStreamingAt));
+				 sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+				 LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
 	}
 	else
@@ -1190,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)

 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
 	walprop_log(LOG,
-				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1355,11 +1355,11 @@ SendAppendRequests(Safekeeper *sk)
 		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

 		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-					req->endLsn - req->beginLsn,
-					LSN_FORMAT_ARGS(req->beginLsn),
-					LSN_FORMAT_ARGS(req->endLsn),
-					LSN_FORMAT_ARGS(req->commitLsn),
-					LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+						req->endLsn - req->beginLsn,
+						LSN_FORMAT_ARGS(req->beginLsn),
+						LSN_FORMAT_ARGS(req->endLsn),
+						LSN_FORMAT_ARGS(req->commitLsn),
+						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

 		resetStringInfo(&sk->outbuf);

@@ -1398,8 +1398,8 @@ SendAppendRequests(Safekeeper *sk)

 			case PG_ASYNC_WRITE_FAIL:
 				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-							sk->host, sk->port, FormatSafekeeperState(sk->state),
-							wp->api.conn_error_message(sk));
+					 sk->host, sk->port, FormatSafekeeperState(sk->state),
+					 wp->api.conn_error_message(sk));
 				ShutdownConnection(sk);
 				return false;
 			default:
@@ -1438,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
 			break;

 		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-					sk->appendResponse.term,
-					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-					sk->host, sk->port);
+						sk->appendResponse.term,
+						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+						sk->host, sk->port);

 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/* Another compute with higher term is running. */
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-						sk->host, sk->port,
-						sk->appendResponse.term, wp->propTerm);
+				 sk->host, sk->port,
+				 sk->appendResponse.term, wp->propTerm);
 		}

 		readAnything = true;
@@ -1493,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-						rf->currentClusterSize);
+				 rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
@@ -1501,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->last_received_lsn));
+				 LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
@@ -1509,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
@@ -1517,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1530,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
 				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-							rf->replytime, replyTimeStr);
+					 rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
 			}
@@ -1700,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)

 		case PG_ASYNC_READ_FAIL:
 			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-						sk->port, FormatSafekeeperState(sk->state),
-						wp->api.conn_error_message(sk));
+				 sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1740,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	if (tag != anymsg->tag)
 	{
 		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-					sk->port, FormatSafekeeperState(sk->state));
+			 sk->port, FormatSafekeeperState(sk->state));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1816,8 +1816,8 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
 		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					sk->host, sk->port, FormatSafekeeperState(sk->state),
-					wp->api.conn_error_message(sk));
+			 sk->host, sk->port, FormatSafekeeperState(sk->state),
+			 wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1863,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
 			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state),
-						wp->api.conn_error_message(sk));
+				 sk->host, sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1902,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
 			return false;
 		case -1:
 			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state),
-						wp->api.conn_error_message(sk));
+				 sk->host, sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -2008,7 +2008,7 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * and then an assertion that's guaranteed to fail.
 		 */
 		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+			 FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2111,7 +2111,7 @@ FormatEvents(WalProposer *wp, uint32 events)
 	if (events & (~all_flags))
 	{
 		walprop_log(WARNING, "Event formatting found unexpected component %d",
-					events & (~all_flags));
+			 events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -356,8 +356,7 @@ typedef struct Safekeeper


 	/* postgres-specific fields */
-#ifndef WALPROPOSER_LIB
-
+	#ifndef WALPROPOSER_LIB
 	/*
 	 * postgres protocol connection to the WAL acceptor
 	 *
@@ -375,18 +374,17 @@ typedef struct Safekeeper
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
-#endif
+	#endif


 	/* WalProposer library specifics */
-#ifdef WALPROPOSER_LIB
-
+	#ifdef WALPROPOSER_LIB
 	/*
 	 * Buffer for incoming messages. Usually Rust vector is stored here.
 	 * Caller is responsible for freeing the buffer.
 	 */
 	StringInfoData inbuf;
-#endif
+	#endif
 } Safekeeper;

 /* Re-exported PostgresPollingStatusType */
@@ -474,7 +472,7 @@ typedef struct walproposer_api
 	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);

 	/* Start the connection, aka PQconnectStart. */
-	void		(*conn_connect_start) (Safekeeper *sk);
+	void (*conn_connect_start) (Safekeeper *sk);

 	/* Poll an asynchronous connection, aka PQconnectPoll. */
 	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
@@ -492,7 +490,7 @@ typedef struct walproposer_api
 	void		(*conn_finish) (Safekeeper *sk);

 	/*
-	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
+	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData. 
 	 *
 	 * On success, the data is placed in *buf. It is valid until the next call
 	 * to this function.
@@ -512,7 +510,7 @@ typedef struct walproposer_api
 	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

 	/* Allocate WAL reader. */
-	void		(*wal_reader_allocate) (Safekeeper *sk);
+	void (*wal_reader_allocate) (Safekeeper *sk);

 	/* Deallocate event set. */
 	void		(*free_event_set) (WalProposer *wp);
@@ -574,7 +572,7 @@ typedef struct walproposer_api
 	/*
 	 * Called right after the proposer was elected, but before it started
 	 * recovery and sent ProposerElected message to the safekeepers.
-	 *
+	 * 
 	 * Used by logical replication to update truncateLsn.
 	 */
 	void		(*after_election) (WalProposer *wp);
@@ -628,10 +626,10 @@ typedef struct WalProposerConfig
 	uint64		systemId;

 	/* Will be passed to safekeepers in greet request. */
-	TimeLineID	pgTimeline;
+	TimeLineID  pgTimeline;

 #ifdef WALPROPOSER_LIB
-	void	   *callback_data;
+	void *callback_data;
 #endif
 } WalProposerConfig;

@@ -712,11 +710,10 @@ extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);


-#define WPEVENT		1337		/* special log level for walproposer internal
-								 * events */
+#define WPEVENT		1337	/* special log level for walproposer internal events */

 #ifdef WALPROPOSER_LIB
-extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
+void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
 #define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
 #define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -9,9 +9,8 @@
 #include "utils/datetime.h"
 #include "miscadmin.h"

-void
-ExceptionalCondition(const char *conditionName,
-					 const char *fileName, int lineNumber)
+void ExceptionalCondition(const char *conditionName,
+						  const char *fileName, int lineNumber)
 {
 	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
 			fileName, lineNumber, conditionName);
@@ -170,18 +169,17 @@ timestamptz_to_str(TimestampTz t)

 bool
 TimestampDifferenceExceeds(TimestampTz start_time,
-						   TimestampTz stop_time,
-						   int msec)
+								TimestampTz stop_time,
+								int msec)
 {
 	TimestampTz diff = stop_time - start_time;
-
 	return (diff >= msec * INT64CONST(1000));
 }

 void
-WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
 {
-	char		buf[1024];
+	char buf[1024];
 	va_list		args;

 	fmt = _(fmt);
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -637,8 +637,8 @@ walprop_connect_start(Safekeeper *sk)
 	 */
 	sk->conn = palloc(sizeof(WalProposerConn));
 	sk->conn->pg_conn = pg_conn;
-	sk->conn->is_nonblocking = false;	/* connections always start in
-										 * blocking mode */
+	sk->conn->is_nonblocking = false;	/* connections always start in blocking
+									 * mode */
 	sk->conn->recvbuf = NULL;
 }

@@ -1291,11 +1291,10 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
 	/*
 	 * Apart from walproposer, basebackup LSN page is also written out by
 	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL
-	 * buffers here to avoid dummy page overwriting correct one we download
-	 * here. Ugly, but alternatives are about the same ugly. We won't need
-	 * that if we switch to on-demand WAL download from safekeepers, without
-	 * writing to disk.
+	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
+	 * here to avoid dummy page overwriting correct one we download here. Ugly,
+	 * but alternatives are about the same ugly. We won't need that if we switch
+	 * to on-demand WAL download from safekeepers, without writing to disk.
 	 *
 	 * https://github.com/neondatabase/neon/issues/5749
 	 */
@@ -1682,17 +1681,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 static void
 walprop_pg_after_election(WalProposer *wp)
 {
-	FILE	   *f;
-	XLogRecPtr	lrRestartLsn;
+	FILE* f;
+	XLogRecPtr lrRestartLsn;

-	/* We don't need to do anything in syncSafekeepers mode. */
+	/* We don't need to do anything in syncSafekeepers mode.*/
 	if (wp->config->syncSafekeepers)
 		return;

 	/*
-	 * If there are active logical replication subscription we need to provide
-	 * enough WAL for their WAL senders based on th position of their
-	 * replication slots.
+	 * If there are active logical replication subscriptions we need
+	 * to provide enough WAL for their WAL senders based on the position
+	 * of their replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
 	if (f != NULL && !wp->config->syncSafekeepers)
@@ -1701,12 +1700,8 @@ walprop_pg_after_election(WalProposer *wp)
 		fclose(f);
 		if (lrRestartLsn != InvalidXLogRecPtr)
 		{
-			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
-
-			/*
-			 * start from the beginning of the segment to fetch page headers
-			 * verifed by XLogReader
-			 */
+			elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
+			/* start from the beginning of the segment to fetch page headers verified by XLogReader */
 			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
 			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
 		}
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]

 [[package]]
 name = "aiopg"
-version = "1.4.0"
+version = "1.3.4"
 description = "Postgres integration with asyncio."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.6"
 files = [
-    {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
-    {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
+    {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
+    {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
 ]

 [package.dependencies]
 async-timeout = ">=3.0,<5.0"
-psycopg2-binary = ">=2.9.5"
+psycopg2-binary = ">=2.8.4"

 [package.extras]
 sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
@@ -160,71 +160,64 @@ pluggy = ">=0.4.0"

 [[package]]
 name = "async-timeout"
-version = "4.0.3"
+version = "4.0.2"
 description = "Timeout context manager for asyncio programs"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.6"
 files = [
-    {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
-    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
+    {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
+    {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
 ]

 [[package]]
 name = "asyncpg"
-version = "0.29.0"
+version = "0.27.0"
 description = "An asyncio PostgreSQL driver"
 optional = false
-python-versions = ">=3.8.0"
+python-versions = ">=3.7.0"
 files = [
-    {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
-    {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
-    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
-    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
-    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
-    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
-    {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
-    {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
-    {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
-    {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
-    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
-    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
-    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
-    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
-    {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
-    {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
-    {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
-    {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
-    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
-    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
-    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
-    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
-    {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
-    {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
-    {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
-    {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
-    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
-    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
-    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
-    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
-    {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
-    {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
-    {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
-    {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
-    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
-    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
-    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
-    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
-    {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
-    {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
-    {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
+    {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
+    {file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
+    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
+    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
+    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
+    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
+    {file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
+    {file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
+    {file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
+    {file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
+    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
+    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
+    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
+    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
+    {file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
+    {file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
+    {file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
+    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
+    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
+    {file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
+    {file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
+    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
+    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
+    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
+    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
+    {file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
+    {file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
+    {file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
+    {file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
+    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
+    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
+    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
+    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
+    {file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
+    {file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
+    {file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
 ]

-[package.dependencies]
-async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}
-
 [package.extras]
-docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
-test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]
+dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
+docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
+test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]

 [[package]]
 name = "attrs"
@@ -2483,16 +2476,6 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2714,4 +2697,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
+content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -4,10 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-testing = []
-
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
@@ -61,7 +57,6 @@ thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
-tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -74,12 +69,13 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
-postgres-protocol.workspace = true
 smol_str.workspace = true

 workspace_hack.workspace = true
+tokio-util.workspace = true

 [dev-dependencies]
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
+postgres-protocol.workspace = true
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -62,9 +62,6 @@ pub enum AuthErrorImpl {
        Please add it to the allowed list in the Neon console."
    )]
    IpAddressNotAllowed,
-
-    #[error("Too many connections to this endpoint. Please try again later.")]
-    TooManyConnections,
 }

 #[derive(Debug, Error)]
@@ -83,10 +80,6 @@ impl AuthError {
    pub fn ip_address_not_allowed() -> Self {
        AuthErrorImpl::IpAddressNotAllowed.into()
    }
-
-    pub fn too_many_connections() -> Self {
-        AuthErrorImpl::TooManyConnections.into()
-    }
 }

 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -109,7 +102,6 @@ impl UserFacingError for AuthError {
            MissingEndpointName => self.to_string(),
            Io(_) => "Internal error".to_string(),
            IpAddressNotAllowed => self.to_string(),
-            TooManyConnections => self.to_string(),
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,11 +3,9 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
-use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;

 use crate::auth::credentials::check_peer_addr_is_in_list;
-use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
@@ -26,12 +24,31 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
-use std::net::IpAddr;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};

+/// A product of successful authentication.
+pub struct AuthSuccess<T> {
+    /// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client?
+    pub reported_auth_ok: bool,
+    /// Something to be considered a positive result.
+    pub value: T,
+}
+
+impl<T> AuthSuccess<T> {
+    /// Very similar to [`std::option::Option::map`].
+    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
+    /// a function to a contained value.
+    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
+        AuthSuccess {
+            reported_auth_ok: self.reported_auth_ok,
+            value: f(self.value),
+        }
+    }
+}
+
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -44,11 +61,9 @@ pub enum BackendType<'a, T> {
    /// Current Cloud API (V2).
    Console(Cow<'a, console::provider::neon::Api>, T),
    /// Local mock of Cloud API (V2).
-    #[cfg(feature = "testing")]
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
-    #[cfg(test)]
    /// Test backend.
    Test(&'a dyn TestBackend),
 }
@@ -63,10 +78,8 @@ impl std::fmt::Display for BackendType<'_, ()> {
        use BackendType::*;
        match self {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
-            #[cfg(feature = "testing")]
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
-            #[cfg(test)]
            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
@@ -79,10 +92,8 @@ impl<T> BackendType<'_, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(Cow::Borrowed(c), x),
-            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
-            #[cfg(test)]
            Test(x) => Test(*x),
        }
    }
@@ -96,10 +107,8 @@ impl<'a, T> BackendType<'a, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(c, f(x)),
-            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
-            #[cfg(test)]
            Test(x) => Test(x),
        }
    }
@@ -112,87 +121,51 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
        use BackendType::*;
        match self {
            Console(c, x) => x.map(|x| Console(c, x)),
-            #[cfg(feature = "testing")]
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
-            #[cfg(test)]
            Test(x) => Ok(Test(x)),
        }
    }
 }

-pub struct ComputeCredentials<T> {
-    pub info: ComputeUserInfo,
-    pub keys: T,
-}
-
-pub struct ComputeUserInfoNoEndpoint {
-    pub user: SmolStr,
-    pub peer_addr: IpAddr,
-    pub cache_key: SmolStr,
-}
-
-pub struct ComputeUserInfo {
-    pub endpoint: SmolStr,
-    pub inner: ComputeUserInfoNoEndpoint,
-}
-
-pub enum ComputeCredentialKeys {
-    #[cfg(feature = "testing")]
+pub enum ComputeCredentials {
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
 }

-impl TryFrom<ClientCredentials> for ComputeUserInfo {
-    // user name
-    type Error = ComputeUserInfoNoEndpoint;
-
-    fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
-        let inner = ComputeUserInfoNoEndpoint {
-            user: creds.user,
-            peer_addr: creds.peer_addr,
-            cache_key: creds.cache_key,
-        };
-        match creds.project {
-            None => Err(inner),
-            Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
-        }
-    }
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-///
-/// All authentication flows will emit an AuthenticationOk message if successful.
-async fn auth_quirks(
+async fn auth_quirks_creds(
    api: &impl console::Api,
-    extra: &ConsoleReqExtra,
-    creds: ClientCredentials,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
-    let (info, unauthenticated_password) = match creds.try_into() {
-        Err(info) => {
-            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
-            (res.info, Some(res.keys))
-        }
-        Ok(info) => (info, None),
+    let maybe_success = if creds.project.is_none() {
+        // Password will be checked by the compute node later.
+        Some(hacks::password_hack(creds, client, latency_timer).await?)
+    } else {
+        None
    };

+    // Password hack should set the project name.
+    // TODO: make `creds.project` more type-safe.
+    assert!(creds.project.is_some());
    info!("fetching user's authentication info");
    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
    let AuthInfo {
        secret,
        allowed_ips,
-    } = api.get_auth_info(extra, &info).await?;
+    } = api.get_auth_info(extra, creds).await?;

    // check allowed list
-    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
+    if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
    let secret = secret.unwrap_or_else(|| {
@@ -200,49 +173,36 @@ async fn auth_quirks(
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
-        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
+        AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
    });

-    if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret)?;
-        let keys = match auth_outcome {
-            crate::sasl::Outcome::Success(key) => key,
-            crate::sasl::Outcome::Failure(reason) => {
-                info!("auth backend failed with an error: {reason}");
-                return Err(auth::AuthError::auth_failed(&*info.inner.user));
-            }
-        };
-
-        // we have authenticated the password
-        client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;
-
-        return Ok(ComputeCredentials { info, keys });
+    if let Some(success) = maybe_success {
+        return Ok(success);
    }

-    // -- the remaining flows are self-authenticating --
-
    // Perform cleartext auth if we're allowed to do that.
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
-        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
+        // Password will be checked by the compute node later.
+        return hacks::cleartext_hack(client, latency_timer).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(info, client, config, latency_timer, secret).await
+    classic::authenticate(creds, client, config, latency_timer, secret).await
 }

-/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
-/// only if authentication was successfuly.
-async fn auth_and_wake_compute(
+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
    api: &impl console::Api,
-    extra: &ConsoleReqExtra,
-    creds: ClientCredentials,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials = auth_quirks(
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    let auth_stuff = auth_quirks_creds(
        api,
        extra,
        creds,
@@ -255,7 +215,7 @@ async fn auth_and_wake_compute(

    let mut num_retries = 0;
    let mut node = loop {
-        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
+        let wake_res = api.wake_compute(extra, creds).await;
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -272,27 +232,27 @@ async fn auth_and_wake_compute(
        tokio::time::sleep(wait_duration).await;
    };

-    match compute_credentials.keys {
-        #[cfg(feature = "testing")]
-        ComputeCredentialKeys::Password(password) => node.config.password(password),
-        ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    match auth_stuff.value {
+        ComputeCredentials::Password(password) => node.config.password(password),
+        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
    };

-    Ok((node, compute_credentials.info))
+    Ok(AuthSuccess {
+        reported_auth_ok: auth_stuff.reported_auth_ok,
+        value: node,
+    })
 }

-impl<'a> BackendType<'a, ClientCredentials> {
+impl BackendType<'_, ClientCredentials<'_>> {
    /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<SmolStr> {
+    pub fn get_endpoint(&self) -> Option<String> {
        use BackendType::*;

        match self {
            Console(_, creds) => creds.project.clone(),
-            #[cfg(feature = "testing")]
            Postgres(_, creds) => creds.project.clone(),
-            Link(_) => Some("link".into()),
-            #[cfg(test)]
-            Test(_) => Some("test".into()),
+            Link(_) => Some("link".to_owned()),
+            Test(_) => Some("test".to_owned()),
        }
    }

@@ -301,11 +261,9 @@ impl<'a> BackendType<'a, ClientCredentials> {
        use BackendType::*;

        match self {
-            Console(_, creds) => &creds.user,
-            #[cfg(feature = "testing")]
-            Postgres(_, creds) => &creds.user,
+            Console(_, creds) => creds.user,
+            Postgres(_, creds) => creds.user,
            Link(_) => "link",
-            #[cfg(test)]
            Test(_) => "test",
        }
    }
@@ -313,25 +271,26 @@ impl<'a> BackendType<'a, ClientCredentials> {
    /// Authenticate the client via the requested backend, possibly using credentials.
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
-        self,
-        extra: &ConsoleReqExtra,
+        &mut self,
+        extra: &ConsoleReqExtra<'_>,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        latency_timer: &mut LatencyTimer,
-    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
+    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

        let res = match self {
            Console(api, creds) => {
                info!(
-                    user = &*creds.user,
+                    user = creds.user,
                    project = creds.project(),
                    "performing authentication using the console"
                );

-                let (cache_info, user_info) = auth_and_wake_compute(
-                    &*api,
+                let api = api.as_ref();
+                auth_quirks(
+                    api,
                    extra,
                    creds,
                    client,
@@ -339,19 +298,18 @@ impl<'a> BackendType<'a, ClientCredentials> {
                    config,
                    latency_timer,
                )
-                .await?;
-                (cache_info, BackendType::Console(api, user_info))
+                .await?
            }
-            #[cfg(feature = "testing")]
            Postgres(api, creds) => {
                info!(
-                    user = &*creds.user,
+                    user = creds.user,
                    project = creds.project(),
                    "performing authentication using a local postgres instance"
                );

-                let (cache_info, user_info) = auth_and_wake_compute(
-                    &*api,
+                let api = api.as_ref();
+                auth_quirks(
+                    api,
                    extra,
                    creds,
                    client,
@@ -359,21 +317,16 @@ impl<'a> BackendType<'a, ClientCredentials> {
                    config,
                    latency_timer,
                )
-                .await?;
-                (cache_info, BackendType::Postgres(api, user_info))
+                .await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

-                let node_info = link::authenticate(&url, client).await?;
-
-                (
-                    CachedNodeInfo::new_uncached(node_info),
-                    BackendType::Link(url),
-                )
+                link::authenticate(url, client)
+                    .await?
+                    .map(CachedNodeInfo::new_uncached)
            }
-            #[cfg(test)]
            Test(_) => {
                unreachable!("this function should never be called in the test backend")
            }
@@ -382,20 +335,16 @@ impl<'a> BackendType<'a, ClientCredentials> {
        info!("user successfully authenticated");
        Ok(res)
    }
-}

-impl BackendType<'_, ComputeUserInfo> {
    pub async fn get_allowed_ips(
        &self,
-        extra: &ConsoleReqExtra,
+        extra: &ConsoleReqExtra<'_>,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        use BackendType::*;
        match self {
            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
-            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
            Link(_) => Ok(Arc::new(vec![])),
-            #[cfg(test)]
            Test(x) => x.get_allowed_ips(),
        }
    }
@@ -404,16 +353,14 @@ impl BackendType<'_, ComputeUserInfo> {
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
    pub async fn wake_compute(
        &self,
-        extra: &ConsoleReqExtra,
+        extra: &ConsoleReqExtra<'_>,
    ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
        use BackendType::*;

        match self {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
-            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
-            #[cfg(test)]
            Test(x) => x.wake_compute().map(Some),
        }
    }
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,6 +1,6 @@
-use super::{ComputeCredentials, ComputeUserInfo};
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
-    auth::{self, backend::ComputeCredentialKeys, AuthFlow},
+    auth::{self, AuthFlow, ClientCredentials},
    compute,
    config::AuthenticationConfig,
    console::AuthSecret,
@@ -12,15 +12,14 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

 pub(super) async fn authenticate(
-    creds: ComputeUserInfo,
+    creds: &ClientCredentials<'_>,
    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
    secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    let flow = AuthFlow::new(client);
    let scram_keys = match secret {
-        #[cfg(feature = "testing")]
        AuthSecret::Md5(_) => {
            info!("auth endpoint chooses MD5");
            return Err(auth::AuthError::bad_auth_method("MD5"));
@@ -54,7 +53,7 @@ pub(super) async fn authenticate(
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
-                    return Err(auth::AuthError::auth_failed(&*creds.inner.user));
+                    return Err(auth::AuthError::auth_failed(creds.user));
                }
            };

@@ -65,9 +64,9 @@ pub(super) async fn authenticate(
        }
    };

-    Ok(ComputeCredentials {
-        info: creds,
-        keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
            scram_keys,
        )),
    })
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,11 +1,7 @@
-use super::{
-    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
-};
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
-    auth::{self, AuthFlow},
-    console::AuthSecret,
+    auth::{self, AuthFlow, ClientCredentials},
    proxy::LatencyTimer,
-    sasl,
    stream::{self, Stream},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,42 +11,35 @@ use tracing::{info, warn};
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are beneficial for serverless JS workers, so we
 /// use this mechanism for websocket connections.
-pub async fn authenticate_cleartext(
-    info: ComputeUserInfo,
+pub async fn cleartext_hack(
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
-    secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("cleartext auth flow override is enabled, proceeding");

    // pause the timer while we communicate with the client
    let _paused = latency_timer.pause();

-    let auth_outcome = AuthFlow::new(client)
-        .begin(auth::CleartextPassword(secret))
+    let password = AuthFlow::new(client)
+        .begin(auth::CleartextPassword)
        .await?
        .authenticate()
        .await?;

-    let keys = match auth_outcome {
-        sasl::Outcome::Success(key) => key,
-        sasl::Outcome::Failure(reason) => {
-            info!("auth backend failed with an error: {reason}");
-            return Err(auth::AuthError::auth_failed(&*info.inner.user));
-        }
-    };
-
-    Ok(ComputeCredentials { info, keys })
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: ComputeCredentials::Password(password),
+    })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
-/// Similar to [`authenticate_cleartext`], but there's a specific password format,
-/// and passwords are not yet validated (we don't know how to validate them!)
-pub async fn password_hack_no_authentication(
-    info: ComputeUserInfoNoEndpoint,
+/// Very similar to [`cleartext_hack`], but there's a specific password format.
+pub async fn password_hack(
+    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<ComputeCredentials<Vec<u8>>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("project not specified, resorting to the password hack auth flow");

    // pause the timer while we communicate with the client
@@ -59,17 +48,15 @@ pub async fn password_hack_no_authentication(
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
        .await?
-        .get_password()
+        .authenticate()
        .await?;

-    info!(project = &*payload.endpoint, "received missing parameter");
+    info!(project = &payload.endpoint, "received missing parameter");
+    creds.project = Some(payload.endpoint);

    // Report tentative success; compute node will check the password anyway.
-    Ok(ComputeCredentials {
-        info: ComputeUserInfo {
-            inner: info,
-            endpoint: payload.endpoint,
-        },
-        keys: payload.password,
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: ComputeCredentials::Password(payload.password),
    })
 }
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,3 +1,4 @@
+use super::AuthSuccess;
 use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
@@ -56,7 +57,7 @@ pub fn new_psql_session_id() -> String {
 pub(super) async fn authenticate(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<NodeInfo> {
+) -> auth::Result<AuthSuccess<NodeInfo>> {
    let psql_session_id = new_psql_session_id();
    let span = info_span!("link", psql_session_id = &psql_session_id);
    let greeting = hello_message(link_uri, &psql_session_id);
@@ -101,9 +102,12 @@ pub(super) async fn authenticate(
        config.password(password.as_ref());
    }

-    Ok(NodeInfo {
-        config,
-        aux: db_info.aux,
-        allow_self_signed_compute: false, // caller may override
+    Ok(AuthSuccess {
+        reported_auth_ok: true,
+        value: NodeInfo {
+            config,
+            aux: db_info.aux,
+            allow_self_signed_compute: false, // caller may override
+        },
    })
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -3,12 +3,14 @@
 use crate::{
    auth::password_hack::parse_endpoint_param,
    error::UserFacingError,
-    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use smol_str::SmolStr;
-use std::{collections::HashSet, net::IpAddr};
+use std::{
+    collections::HashSet,
+    net::{IpAddr, SocketAddr},
+};
 use thiserror::Error;
 use tracing::{info, warn};

@@ -22,7 +24,7 @@ pub enum ClientCredsParseError {
         SNI ('{}') and project option ('{}').",
        .domain, .option,
    )]
-    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
+    InconsistentProjectNames { domain: String, option: String },

    #[error(
        "Common name inferred from SNI ('{}') is not known",
@@ -31,7 +33,7 @@ pub enum ClientCredsParseError {
    UnknownCommonName { cn: String },

    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(SmolStr),
+    MalformedProjectName(String),
 }

 impl UserFacingError for ClientCredsParseError {}
@@ -39,34 +41,34 @@ impl UserFacingError for ClientCredsParseError {}
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ClientCredentials {
-    pub user: SmolStr,
+pub struct ClientCredentials<'a> {
+    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<SmolStr>,
+    pub project: Option<String>,

-    pub cache_key: SmolStr,
-    pub peer_addr: IpAddr,
+    pub cache_key: String,
+    pub peer_addr: SocketAddr,
 }

-impl ClientCredentials {
+impl ClientCredentials<'_> {
    #[inline]
    pub fn project(&self) -> Option<&str> {
        self.project.as_deref()
    }
 }

-impl ClientCredentials {
+impl<'a> ClientCredentials<'a> {
    pub fn parse(
-        params: &StartupMessageParams,
+        params: &'a StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<HashSet<String>>,
-        peer_addr: IpAddr,
+        peer_addr: SocketAddr,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user = get_param("user")?.into();
+        let user = get_param("user")?;

        // Project name might be passed via PG's command-line options.
        let project_option = params
@@ -80,7 +82,7 @@ impl ClientCredentials {
                    .at_most_one()
                    .ok()?
            })
-            .map(|name| name.into());
+            .map(|name| name.to_string());

        let project_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
@@ -119,7 +121,7 @@ impl ClientCredentials {
        }
        .transpose()?;

-        info!(%user, project = project.as_deref(), "credentials");
+        info!(user, project = project.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
@@ -140,9 +142,8 @@ impl ClientCredentials {
        let cache_key = format!(
            "{}{}",
            project.as_deref().unwrap_or(""),
-            neon_options_str(params)
-        )
-        .into();
+            neon_options(params).unwrap_or("".to_string())
+        );

        Ok(Self {
            user,
@@ -205,10 +206,10 @@ fn project_name_valid(name: &str) -> bool {
    name.chars().all(|c| c.is_alphanumeric() || c == '-')
 }

-fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
+fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
    sni.strip_suffix(common_name)?
        .strip_suffix('.')
-        .map(SmolStr::from)
+        .map(str::to_owned)
 }

 #[cfg(test)]
@@ -220,7 +221,7 @@ mod tests {
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);
@@ -235,7 +236,7 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);
@@ -250,7 +251,7 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -266,7 +267,7 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -281,7 +282,7 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -299,7 +300,7 @@ mod tests {
            ),
        ]);

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());
@@ -314,7 +315,7 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());
@@ -329,7 +330,7 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -343,13 +344,13 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

@@ -364,7 +365,7 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
            .expect_err("should fail");
        match err {
@@ -383,7 +384,7 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
            .expect_err("should fail");
        match err {
@@ -403,10 +404,13 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("project"));
-        assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
+        assert_eq!(
+            creds.cache_key,
+            "projectneon_endpoint_type:read_write neon_lsn:0/2"
+        );

        Ok(())
    }
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -1,9 +1,8 @@
 //! Main authentication flow.

-use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
+use super::{AuthErrorImpl, PasswordHackPayload};
 use crate::{
    config::TlsServerEndPoint,
-    console::AuthSecret,
    sasl, scram,
    stream::{PqStream, Stream},
 };
@@ -51,7 +50,7 @@ impl AuthMethod for PasswordHack {

 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword(pub AuthSecret);
+pub struct CleartextPassword;

 impl AuthMethod for CleartextPassword {
    #[inline(always)]
@@ -99,7 +98,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {

 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
    /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
+    pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
@@ -118,19 +117,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {

 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
    /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
+    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        let outcome = validate_password_and_exchange(password, self.state.0)?;
-
-        if let sasl::Outcome::Success(_) = &outcome {
-            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
-        }
-
-        Ok(outcome)
+        Ok(password.to_vec())
    }
 }

@@ -159,49 +152,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
            ))
            .await?;

-        if let sasl::Outcome::Success(_) = &outcome {
-            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
-        }
-
        Ok(outcome)
    }
 }
-
-pub(super) fn validate_password_and_exchange(
-    password: &[u8],
-    secret: AuthSecret,
-) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
-    match secret {
-        #[cfg(feature = "testing")]
-        AuthSecret::Md5(_) => {
-            // test only
-            Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
-                password.to_owned(),
-            )))
-        }
-        // perform scram authentication as both client and server to validate the keys
-        AuthSecret::Scram(scram_secret) => {
-            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
-            let outcome = crate::scram::exchange(
-                &scram_secret,
-                sasl_client,
-                crate::config::TlsServerEndPoint::Undefined,
-            )?;
-
-            let client_key = match outcome {
-                sasl::Outcome::Success(client_key) => client_key,
-                sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)),
-            };
-
-            let keys = crate::compute::ScramKeys {
-                client_key: client_key.as_bytes(),
-                server_key: scram_secret.server_key.as_bytes(),
-            };
-
-            Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
-                tokio_postgres::config::AuthKeys::ScramSha256(keys),
-            )))
-        }
-    }
-}
--- a/Show More
+++ b/Show More