Bump postgres version

On demand downloading of SLRU segments
Fix problem with stats collector at pg14
2026-05-17 05:00:38 +00:00 · 2023-12-15 16:40:44 +02:00 · 2023-12-15 16:16:50 +02:00 · 2023-12-13 19:24:14 +02:00 · 2023-12-12 15:55:17 +02:00 · 2023-12-12 15:55:17 +02:00
146 changed files with 4845 additions and 3095 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -199,6 +199,10 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

      - name: Checkout
        uses: actions/checkout@v3
@@ -1097,6 +1101,10 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -142,6 +142,10 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

      - name: Checkout
        uses: actions/checkout@v4
@@ -238,6 +242,20 @@ jobs:
      options: --init

    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
      - name: Checkout
        uses: actions/checkout@v4
        with:
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -44,6 +44,12 @@ dependencies = [
 "memchr",
 ]

+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -178,7 +184,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
 dependencies = [
 "concurrent-queue",
- "event-listener",
+ "event-listener 2.5.3",
 "futures-core",
 ]

@@ -199,11 +205,13 @@ dependencies = [

 [[package]]
 name = "async-lock"
-version = "2.8.0"
+version = "3.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
+checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
 dependencies = [
- "event-listener",
+ "event-listener 4.0.0",
+ "event-listener-strategy",
+ "pin-project-lite",
 ]

 [[package]]
@@ -686,9 +694,9 @@ dependencies = [

 [[package]]
 name = "azure_core"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467"
+checksum = "4ccd63c07d1fbfb3d4543d7ea800941bf5a30db1911b9b9e4db3b2c4210a434f"
 dependencies = [
 "async-trait",
 "base64 0.21.1",
@@ -713,9 +721,9 @@ dependencies = [

 [[package]]
 name = "azure_identity"
-version = "0.16.2"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
+checksum = "8bd7ea32ca7eb66ff4757f83baac702ff11d469e5de365b6bc6f79f9c25d3436"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -734,9 +742,9 @@ dependencies = [

 [[package]]
 name = "azure_storage"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97"
+checksum = "83ca0a07f89fd72a006da4713e93af3d6c44a693e61a1c3c2e7985de39c182e8"
 dependencies = [
 "RustyXML",
 "async-trait",
@@ -756,9 +764,9 @@ dependencies = [

 [[package]]
 name = "azure_storage_blobs"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4"
+checksum = "8096c04d370118323c42b2752aa1883e4880a56ef65239f317b359f263b6e194"
 dependencies = [
 "RustyXML",
 "azure_core",
@@ -890,7 +898,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
 "memchr",
 "once_cell",
- "regex-automata",
+ "regex-automata 0.1.10",
 "serde",
 ]

@@ -1680,6 +1688,27 @@ version = "2.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"

+[[package]]
+name = "event-listener"
+version = "4.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
+dependencies = [
+ "concurrent-queue",
+ "parking",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "event-listener-strategy"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
+dependencies = [
+ "event-listener 4.0.0",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "fail"
 version = "0.5.1"
@@ -2042,6 +2071,10 @@ name = "hashbrown"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+]

 [[package]]
 name = "hashlink"
@@ -2533,7 +2566,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
 dependencies = [
- "regex-automata",
+ "regex-automata 0.1.10",
 ]

 [[package]]
@@ -2559,9 +2592,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"

 [[package]]
 name = "memoffset"
@@ -2634,14 +2667,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.6"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
+checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
 dependencies = [
 "libc",
 "log",
 "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -3644,7 +3677,7 @@ dependencies = [
 "serde_json",
 "sha2",
 "smol_str",
- "socket2 0.5.3",
+ "socket2 0.5.5",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -3668,9 +3701,9 @@ dependencies = [

 [[package]]
 name = "quick-xml"
-version = "0.30.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
+checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
 dependencies = [
 "memchr",
 "serde",
@@ -3810,13 +3843,14 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.8.2"
+version = "1.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
 dependencies = [
 "aho-corasick",
 "memchr",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
 ]

 [[package]]
@@ -3828,6 +3862,17 @@ dependencies = [
 "regex-syntax 0.6.29",
 ]

+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax 0.8.2",
+]
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.29"
@@ -3836,9 +3881,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"

 [[package]]
 name = "regex-syntax"
-version = "0.7.2"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"

 [[package]]
 name = "relative-path"
@@ -3864,6 +3909,7 @@ dependencies = [
 "bytes",
 "camino",
 "camino-tempfile",
+ "futures",
 "futures-util",
 "http-types",
 "hyper",
@@ -4291,6 +4337,7 @@ dependencies = [
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
+ "tokio-util",
 "toml_edit",
 "tracing",
 "url",
@@ -4731,9 +4778,9 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.3"
+version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
+checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
 dependencies = [
 "libc",
 "windows-sys 0.48.0",
@@ -5080,18 +5127,18 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.28.1"
+version = "1.34.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
+checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
 dependencies = [
- "autocfg",
+ "backtrace",
 "bytes",
 "libc",
 "mio",
 "num_cpus",
 "pin-project-lite",
 "signal-hook-registry",
- "socket2 0.4.9",
+ "socket2 0.5.5",
 "tokio-macros",
 "windows-sys 0.48.0",
 ]
@@ -5108,9 +5155,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5145,7 +5192,7 @@ dependencies = [
 "pin-project-lite",
 "postgres-protocol",
 "postgres-types",
- "socket2 0.5.3",
+ "socket2 0.5.5",
 "tokio",
 "tokio-util",
 ]
@@ -5214,13 +5261,16 @@ dependencies = [

 [[package]]
 name = "tokio-util"
-version = "0.7.8"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
 "bytes",
 "futures-core",
+ "futures-io",
 "futures-sink",
+ "futures-util",
+ "hashbrown 0.14.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -6216,7 +6266,8 @@ dependencies = [
 "prost",
 "rand 0.8.5",
 "regex",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
 "reqwest",
 "ring 0.16.20",
 "rustls",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,10 +38,10 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
+azure_core = "0.17"
+azure_identity = "0.17"
+azure_storage = "0.17"
+azure_storage_blobs = "0.17"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,10 +387,20 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        export TIMESCALEDB_VERSION=2.13.0 \
+        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+        ;; \
+    esac && \
+    apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
-    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
--- a/Makefile
+++ b/Makefile
@@ -260,6 +260,44 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

+postgres-%-pg-bsd-indent: postgres-%
+	+@echo "Compiling pg_bsd_indent"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create a typedef list for core Postgres. Note that it should generally be combined
+# with the buildfarm list to cover platform-specific typedefs.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+	+@echo merge with buildfarm typedef to cover all platforms
+	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+		REL_16_STABLE list misses PGSemaphoreData
+	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	+@echo note: you might want to run it on selected files/dirs instead.
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+	rm -f pg*.BAK
+
+# Indent pgxn/neon: runs the `pgindent` target of pgxn/neon/Makefile with the v16 pg_bsd_indent, find_typedef and pgindent tools.
+.PHONY: neon-pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,7 +274,13 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. When configuring an empty
+            // compute, the API handler is likely still waiting for a compute state
+            // change. This notifies it that the compute is in the Failed state, so the
+            // control plane learns about it earlier and records a proper error instead
+            // of a timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
            delay_exit = true;
            None
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query(
-            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-            &[],
-        )?
+        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
-            replication: Some(row.get("rolreplication")),
-            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -252,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-                || !r.bypassrls.unwrap_or(false)
-                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -285,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String =
-                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+                // This can be run on /every/ role! Not just ones created through the console.
+                // This means that if you add some funny ALTER here that adds a permission,
+                // this will get run even on user-created roles! This will result in different
+                // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                // now only grants LOGIN and changes the password. Please do not allow this branch
+                // to do anything silly.
+                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
+                // This branch only runs when roles are created through the console, so it is
+                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
        // TODO(sharding): make this shard-aware
        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
+            tracing::info!(
+                "handle_validate: {}(gen {}): valid={valid} (latest {})",
+                req_tenant.id,
+                req_tenant.gen,
+                tenant_state.generation
+            );
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
                valid,
@@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

+    tracing::info!(
+        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+        attach_req.tenant_id,
+        tenant_state.generation,
+        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+    );
+
    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,8 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
-use pageserver_api::models::{LocationConfig, LocationConfigMode, TimelineInfo};
-use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
+use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -27,7 +26,6 @@ use safekeeper_api::{
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
 use std::collections::{BTreeSet, HashMap};
-use std::num::ParseIntError;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -99,22 +97,6 @@ struct TimelineTreeEl {
    pub children: BTreeSet<TimelineId>,
 }

-/// Helper for CLI args that contain a comma-separate list of NodeId
-fn parse_ids_arg(
-    matches: &ArgMatches,
-    arg: &str,
-) -> Result<Option<Vec<NodeId>>, std::num::ParseIntError> {
-    if let Some(id_str) = matches.get_one::<String>(arg) {
-        let r: Result<Vec<_>, ParseIntError> = id_str
-            .split(',')
-            .map(|ps_id| u64::from_str(str::trim(ps_id)).map(NodeId))
-            .collect();
-        r.map(Some)
-    } else {
-        Ok(Some(vec![DEFAULT_PAGESERVER_ID]))
-    }
-}
-
 // Main entry point for the 'neon_local' CLI utility
 //
 // This utility helps to manage neon installation. That includes following:
@@ -186,7 +168,7 @@ fn print_timelines_tree(
                    info: t.clone(),
                    children: BTreeSet::new(),
                    name: timeline_name_mappings
-                        .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
+                        .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
                },
            )
        })
@@ -392,10 +374,9 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
+    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
-            // TODO: make command aware of multiple pageservers
-            let pageserver = get_default_pageserver(env);
            for t in pageserver.tenant_list()? {
                println!("{} {:?}", t.id, t.state);
            }
@@ -406,94 +387,38 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();

-            let shard_count: u8 = create_match
-                .get_one::<u8>("shard-count")
-                .cloned()
-                .unwrap_or(1);
-
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

-            // We will create an initial timeline for the new tenant
-            let new_timeline_id =
-                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
+            let generation = if env.control_plane_api.is_some() {
+                // We must register the tenant with the attachment service, so
+                // that when the pageserver restarts, it will be re-attached.
+                let attachment_service = AttachmentService::from_env(env);
+                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+            } else {
+                None
+            };
+
+            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
+            println!("tenant {tenant_id} successfully created on the pageserver");
+
+            // Create an initial timeline for the new tenant
+            let new_timeline_id = parse_timeline_id(create_match)?;
            let pg_version = create_match
                .get_one::<u32>("pg-version")
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            // TODO: implement ability for one pageserver to hold multiple
-            // shards for the same tenant.  Until then, we must place each
-            // shard on a different pageserver.
-            assert!(env.pageservers.len() >= shard_count as usize);
-
-            let cfg_shard_count = if shard_count > 1 {
-                shard_count
-            } else {
-                // For single-sharded mode, use the legacy unsharded configuration.  This avoids
-                // breaking any existing tests that assume legacy unsharded storage paths
-                0
-            };
-
-            for shard_number in 0..shard_count {
-                let ps_conf = env.pageservers.get(shard_number as usize).unwrap();
-                let pageserver = PageServerNode::from_env(env, ps_conf);
-
-                // TODO: per-shard generations
-                let generation = if env.control_plane_api.is_some() {
-                    // We must register the tenant with the attachment service, so
-                    // that when the pageserver restarts, it will be re-attached.
-                    let attachment_service = AttachmentService::from_env(env);
-                    attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
-                } else {
-                    None
-                };
-
-                // TODO: shard-aware POST /v1/tenant.  Currently tenant creation on the
-                // pageserver is a no-op, but we shouldn't skip the command entirely.
-
-                let tenant_conf = PageServerNode::build_config(tenant_conf.clone())?;
-
-                let tenant_shard_id = TenantShardId {
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: ShardCount(cfg_shard_count),
-                    tenant_id,
-                };
-
-                let location_conf = LocationConfig {
-                    shard_count: cfg_shard_count,
-                    shard_number,
-                    shard_stripe_size: 32768,
-                    mode: LocationConfigMode::AttachedSingle,
-                    generation,
-                    secondary_conf: None,
-                    tenant_conf,
-                };
-                pageserver.location_config(tenant_shard_id, location_conf, None)?;
-                println!(
-                    "tenant {tenant_id} successfully created on pageserver {}",
-                    pageserver.conf.id
-                );
-            }
-
-            for shard_number in 0..shard_count {
-                let ps_conf = env.pageservers.get(shard_number as usize).unwrap();
-                let pageserver = PageServerNode::from_env(env, ps_conf);
-                let tenant_shard_id = TenantShardId {
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: ShardCount(cfg_shard_count),
-                    tenant_id,
-                };
-
-                pageserver.timeline_create(
-                    tenant_shard_id,
-                    Some(new_timeline_id),
-                    None,
-                    None,
-                    Some(pg_version),
-                    None,
-                )?;
-            }
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id,
+                None,
+                None,
+                Some(pg_version),
+                None,
+            )?;
+            let new_timeline_id = timeline_info.timeline_id;
+            let last_record_lsn = timeline_info.last_record_lsn;

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
@@ -501,7 +426,9 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                new_timeline_id,
            )?;

-            println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
+            println!(
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
+            );

            if create_match.get_flag("set-default") {
                println!("Setting tenant {tenant_id} as a default one");
@@ -521,8 +448,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();

-            // TODO: make command aware of multiple pageservers
-            let pageserver = get_default_pageserver(env);
            pageserver
                .tenant_config(tenant_id, tenant_conf)
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
@@ -566,7 +491,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            let new_timeline_id_opt = parse_timeline_id(create_match)?;

            let timeline_info = pageserver.timeline_create(
-                TenantShardId::unsharded(tenant_id),
+                tenant_id,
                new_timeline_id_opt,
                None,
                None,
@@ -629,7 +554,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
-                vec![DEFAULT_PAGESERVER_ID],
+                DEFAULT_PAGESERVER_ID,
            )?;
            println!("Done");
        }
@@ -654,7 +579,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
            let timeline_info = pageserver.timeline_create(
-                TenantShardId::unsharded(tenant_id),
+                tenant_id,
                None,
                start_lsn,
                Some(ancestor_timeline_id),
@@ -779,8 +704,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

-            let pageserver_ids = parse_ids_arg(sub_args, "endpoint-pageserver-id")?
-                .unwrap_or(vec![DEFAULT_PAGESERVER_ID]);
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -808,7 +738,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
-                pageserver_ids,
+                pageserver_id,
            )?;
        }
        "start" => {
@@ -816,14 +746,29 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let pageservers = parse_ids_arg(sub_args, "endpoint-pageserver-id")?
-                .unwrap_or(vec![DEFAULT_PAGESERVER_ID]);
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };

            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
-            let safekeepers = parse_ids_arg(sub_args, "safekeepers")?
-                .unwrap_or_else(|| env.safekeepers.iter().map(|sk| sk.id).collect());
+            let safekeepers =
+                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
+                    let mut safekeepers: Vec<NodeId> = Vec::new();
+                    for sk_id in safekeepers_str.split(',').map(str::trim) {
+                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
+                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
+                        })?);
+                        safekeepers.push(sk_id);
+                    }
+                    safekeepers
+                } else {
+                    env.safekeepers.iter().map(|sk| sk.id).collect()
+                };

            let endpoint = cplane
                .endpoints
@@ -836,8 +781,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                endpoint.timeline_id,
            )?;

-            // We assume that all pageservers have the same auth conf
-            let ps_conf = env.get_pageserver_conf(pageservers[0])?;
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

@@ -857,21 +801,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageserver_ids: Option<Result<Vec<NodeId>, _>> = sub_args
-                .get_many::<String>("endpoint-pageserver-id")
-                .map(|ids| {
-                    ids.map(|id_str| id_str.parse().context("while parsing pageserver id"))
-                        .map(|r| r.map(NodeId))
-                        .collect()
-                });
-
-            let pageserver_ids = match pageserver_ids {
-                Some(Ok(v)) => Ok(Some(v)),
-                Some(Err(e)) => Err(e),
-                None => Ok(None),
-            }?;
-
-            endpoint.reconfigure(pageserver_ids)?;
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
+                } else {
+                    None
+                };
+            endpoint.reconfigure(pageserver_id)?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -1375,7 +1313,6 @@ fn cli() -> Command {
                .arg(pg_version_arg.clone())
                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
                )
            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -67,7 +67,7 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
-    pageservers: Vec<NodeId>,
+    pageserver_id: NodeId,
 }

 //
@@ -82,33 +82,6 @@ pub struct ComputeControlPlane {
    env: LocalEnv,
 }

-fn load_pageservers(
-    env: &LocalEnv,
-    pageserver_ids: &Vec<NodeId>,
-) -> anyhow::Result<Vec<PageServerNode>> {
-    let mut pageservers = Vec::new();
-    for ps_id in pageserver_ids {
-        let pageserver = env
-            .get_pageserver_conf(*ps_id)
-            .map(|conf| PageServerNode::from_env(env, conf))?;
-        pageservers.push(pageserver);
-    }
-    Ok(pageservers)
-}
-
-fn build_pageserver_connstr(pageservers: &[PageServerNode]) -> String {
-    pageservers
-        .iter()
-        .map(|ps| {
-            let config = ps.pg_connection_config.clone();
-            let (host, port) = (config.host(), config.port());
-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        })
-        .collect::<Vec<_>>()
-        .join(",")
-}
-
 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
@@ -146,16 +119,19 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
-        pageservers: Vec<NodeId>,
+        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let pageserver =
+            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageservers: load_pageservers(&self.env, &pageservers)?,
+            pageserver,
            timeline_id,
            mode,
            tenant_id,
@@ -181,7 +157,7 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
-                pageservers,
+                pageserver_id,
            })?,
        )?;
        std::fs::write(
@@ -240,7 +216,7 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageservers: Vec<PageServerNode>,
+    pageserver: PageServerNode,

    // Optimizations
    skip_pg_catalog_updates: bool,
@@ -263,14 +239,15 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        let pageservers: Vec<PageServerNode> = load_pageservers(env, &conf.pageservers)?;
+        let pageserver =
+            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);

        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageservers,
+            pageserver,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
@@ -505,7 +482,13 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = build_pageserver_connstr(&self.pageservers);
+        let pageserver_connstring = {
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());
+
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
+        };
        let mut safekeeper_connstrings = Vec::new();
        if self.mode == ComputeMode::Primary {
            for sk_id in safekeepers {
@@ -675,7 +658,7 @@ impl Endpoint {
        }
    }

-    pub fn reconfigure(&self, pageservers: Option<Vec<NodeId>>) -> Result<()> {
+    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
            let file = std::fs::File::open(spec_path)?;
@@ -685,20 +668,23 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        if let Some(pageservers) = pageservers {
+        if let Some(pageserver_id) = pageserver_id {
            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
            let mut endpoint_conf: EndpointConf = {
                let file = std::fs::File::open(&endpoint_config_path)?;
                serde_json::from_reader(file)?
            };
-            endpoint_conf.pageservers = pageservers.clone();
+            endpoint_conf.pageserver_id = pageserver_id;
            std::fs::write(
                endpoint_config_path,
                serde_json::to_string_pretty(&endpoint_conf)?,
            )?;

-            let pageservers = load_pageservers(&self.env, &pageservers)?;
-            spec.pageserver_connstring = Some(build_pageserver_connstr(&pageservers));
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
        }

        let client = reqwest::blocking::Client::new();
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -340,8 +340,15 @@ impl PageServerNode {
            .json()?)
    }

-    pub fn build_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
-        Ok(models::TenantConfig {
+    pub fn tenant_create(
+        &self,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<TenantId> {
+        let mut settings = settings.clone();
+
+        let config = models::TenantConfig {
            checkpoint_distance: settings
                .remove("checkpoint_distance")
                .map(|x| x.parse::<u64>())
@@ -400,16 +407,8 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_feedback' as bool")?,
-        })
-    }
+        };

-    pub fn tenant_create(
-        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
-        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<TenantId> {
-        let config = Self::build_config(settings.clone())?;
        let request = models::TenantCreateRequest {
            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
@@ -522,18 +521,15 @@ impl PageServerNode {

    pub fn location_config(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
-        let req_body = TenantLocationConfigRequest {
-            tenant_shard_id,
-            config,
-        };
+        let req_body = TenantLocationConfigRequest { tenant_id, config };

        let path = format!(
            "{}/tenant/{}/location_config",
-            self.http_base_url, tenant_shard_id
+            self.http_base_url, tenant_id
        );
        let path = if let Some(flush_ms) = flush_ms {
            format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -564,7 +560,7 @@ impl PageServerNode {

    pub fn timeline_create(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        new_timeline_id: Option<TimelineId>,
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
@@ -576,7 +572,7 @@ impl PageServerNode {

        self.http_request(
            Method::POST,
-            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_shard_id),
+            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
        )?
        .json(&models::TimelineCreateRequest {
            new_timeline_id,
@@ -589,11 +585,11 @@ impl PageServerNode {
        .error_from_body()?
        .json::<Option<TimelineInfo>>()
        .with_context(|| {
-            format!("Failed to parse timeline creation response for tenant id: {tenant_shard_id}")
+            format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
        })?
        .with_context(|| {
            format!(
-                "No timeline id was found in the timeline creation response for tenant {tenant_shard_id}"
+                "No timeline id was found in the timeline creation response for tenant {tenant_id}"
            )
        })
    }
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,7 +11,6 @@ use crate::{
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
-use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -109,9 +108,6 @@ pub fn migrate_tenant(
        }
    }

-    // No support for sharding in this function yet
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
    let previous = attachment_service.inspect(tenant_id)?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
@@ -121,7 +117,7 @@ pub fn migrate_tenant(
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-            dest_ps.location_config(tenant_shard_id, dest_conf, None)?;
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
            println!("✅ Migration complete");
            return Ok(());
        }
@@ -130,7 +126,7 @@ pub fn migrate_tenant(

        let stale_conf =
            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
-        origin_ps.location_config(tenant_shard_id, stale_conf, Some(Duration::from_secs(10)))?;
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;

        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
    }
@@ -139,7 +135,7 @@ pub fn migrate_tenant(
    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_shard_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None)?;

    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
@@ -153,7 +149,7 @@ pub fn migrate_tenant(
                "🔁 Reconfiguring endpoint {} to use pageserver {}",
                endpoint_name, dest_ps.conf.id
            );
-            endpoint.reconfigure(Some(vec![dest_ps.conf.id]))?;
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
        }
    }

@@ -169,7 +165,7 @@ pub fn migrate_tenant(
        let found = other_ps_tenants
            .into_iter()
            .map(|t| t.id)
-            .any(|i| i == tenant_id);
+            .any(|i| i.tenant_id == tenant_id);
        if !found {
            continue;
        }
@@ -185,7 +181,7 @@ pub fn migrate_tenant(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_shard_id, secondary_conf, None)?;
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
    }

    println!(
@@ -193,7 +189,7 @@ pub fn migrate_tenant(
        dest_ps.conf.id
    );
    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-    dest_ps.location_config(tenant_shard_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None)?;

    println!("✅ Migration complete");

--- a/demo_sharding.sh
+++ b/demo_sharding.sh
@@ -1,21 +0,0 @@
-
-
-export RUST_LOG=DEBUG
-SHARDS=4
-PAGESERVERS=`seq -s , 1 $SHARDS`
-SCALE=10
-ARGS=--features=testing
-
-set -e
-
-set +e
-cargo neon $ARGS stop ; killall -9 storage_broker ; killall -9 safekeeper ; killall -9 pageserver ; killall -9 postgres ; killall -9 attachment_service ; rm -rf .neon
-set -e
-
-cargo build --package=pageserver && cargo neon $ARGS init --num-pageservers=$SHARDS && RUST_LOG=debug cargo neon $ARGS start && cargo neon $ARGS tenant create --shard-count=$SHARDS --tenant-id=1f359dd625e519a1a4e8d7509690f6fc --timeline-id=3d34095be52fec4c44a92e774c573b57 --set-default
-
-cargo neon $ARGS endpoint create --pageserver-id=$PAGESERVERS && cargo neon endpoint start --pageserver-id=$PAGESERVERS ep-main
-
-pgbench postgres -i -h 127.0.0.1 -p 55432 -U cloud_admin -s $SCALE
-
-du -sh .neon/local_fs_remote_storage/pageserver/tenants/1f359dd625e519a1a4e8d7509690f6fc*
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -207,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,6 +5,7 @@ use std::{
 };

 use byteorder::{BigEndian, ReadBytesExt};
+use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use strum_macros;
@@ -293,7 +294,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_shard_id: TenantShardId,
+    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -323,6 +324,7 @@ impl TenantConfigRequest {

 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
    pub config: TenantAttachConfig,
    #[serde(default)]
    pub generation: Option<u32>,
@@ -330,7 +332,7 @@ pub struct TenantAttachRequest {

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -356,7 +358,7 @@ pub enum TenantAttachmentStatus {

 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantId,
+    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
@@ -368,7 +370,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option<TimelineId>,
@@ -384,6 +386,9 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

+    /// The LSN from the start of the root timeline (never changes)
+    pub initdb_lsn: Lsn,
+
    pub current_logical_size: u64,
    pub current_logical_size_is_accurate: bool,

@@ -566,6 +571,7 @@ pub enum PagestreamFeMessage {
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
+    GetSlruSegment(PagestreamGetSlruSegmentRequest),
 }

 // Wrapped in libpq CopyData
@@ -575,6 +581,7 @@ pub enum PagestreamBeMessage {
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
+    GetSlruSegment(PagestreamGetSlruSegmentResponse),
 }

 #[derive(Debug, PartialEq, Eq)]
@@ -606,6 +613,14 @@ pub struct PagestreamDbSizeRequest {
    pub dbnode: u32,
 }

+#[derive(Debug, PartialEq, Eq)]
+pub struct PagestreamGetSlruSegmentRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub kind: u8,
+    pub segno: u32,
+}
+
 #[derive(Debug)]
 pub struct PagestreamExistsResponse {
    pub exists: bool,
@@ -621,6 +636,11 @@ pub struct PagestreamGetPageResponse {
    pub page: Bytes,
 }

+#[derive(Debug)]
+pub struct PagestreamGetSlruSegmentResponse {
+    pub segment: Bytes,
+}
+
 #[derive(Debug)]
 pub struct PagestreamErrorResponse {
    pub message: String,
@@ -673,6 +693,14 @@ impl PagestreamFeMessage {
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }
+
+            Self::GetSlruSegment(req) => {
+                bytes.put_u8(4);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u8(req.kind);
+                bytes.put_u32(req.segno);
+            }
        }

        bytes.into()
@@ -723,6 +751,14 @@ impl PagestreamFeMessage {
                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                dbnode: body.read_u32::<BigEndian>()?,
            })),
+            4 => Ok(PagestreamFeMessage::GetSlruSegment(
+                PagestreamGetSlruSegmentRequest {
+                    latest: body.read_u8()? != 0,
+                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                    kind: body.read_u8()?,
+                    segno: body.read_u32::<BigEndian>()?,
+                },
+            )),
            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
        }
    }
@@ -757,6 +793,12 @@ impl PagestreamBeMessage {
                bytes.put_u8(104); /* tag from pagestore_client.h */
                bytes.put_i64(resp.db_size);
            }
+
+            Self::GetSlruSegment(resp) => {
+                bytes.put_u8(105); /* tag from pagestore_client.h */
+                bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                bytes.put(&resp.segment[..]);
+            }
        }

        bytes.into()
@@ -822,7 +864,7 @@ mod tests {
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
@@ -839,7 +881,7 @@ mod tests {
        });

        let original_broken = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -108,9 +108,22 @@ impl RelTag {
 /// These files are divided into segments, which are divided into
 /// pages of the same BLCKSZ as used for relation files.
 ///
-#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    strum_macros::FromRepr,
+    Hash,
+    Serialize,
+    Deserialize,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+)]
+#[repr(u8)]
 pub enum SlruKind {
-    Clog,
+    Clog = 0,
    MultiXactMembers,
    MultiXactOffsets,
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -73,19 +73,33 @@ impl TenantShardId {
        )
    }

-    pub fn shard_slug(&self) -> String {
-        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+}
+
+/// Formatting helper: displays a shard's number and count as two zero-padded hex fields.
+struct ShardSlug<'a>(&'a TenantShardId);
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
    }
 }

 impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
-            write!(
-                f,
-                "{}-{:02x}{:02x}",
-                self.tenant_id, self.shard_number.0, self.shard_count.0
-            )
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
            // is distinct from the normal single shard case (shard count == 1).
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -16,10 +16,11 @@ aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,21 +1,24 @@
 //! Azure Blob Storage wrapper

+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
+use std::pin::Pin;
 use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::StatusCode;
-use tokio::io::AsyncRead;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -49,7 +52,8 @@ impl AzureBlobStorage {
            StorageCredentials::token_credential(Arc::new(token_credential))
        };

-        let builder = ClientBuilder::new(account, credentials);
+        // Disable the SDK's built-in retries: requests are already retried at an outer layer.
+        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());

        let client = builder.container_client(azure_config.container_name.to_owned());

@@ -116,7 +120,8 @@ impl AzureBlobStorage {
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
+
+        let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
            if let Some(blob_meta) = part.blob.metadata {
@@ -127,10 +132,10 @@ impl AzureBlobStorage {
                .collect()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
+            bufs.push(data);
        }
        Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
+            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
            metadata: Some(StorageMetadata(metadata)),
        })
    }
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(res)
    }
+
    async fn upload(
        &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Put).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
+        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+            Box::pin(from);
+
+        let from = NonSeekableStream::new(from, data_size_bytes);
+
+        let body = azure_core::Body::SeekableStream(Box::new(from));

        let mut builder = blob_client.put_block_blob(body);

@@ -312,3 +317,153 @@ impl RemoteStorage for AzureBlobStorage {
        Ok(())
    }
 }
+
+pin_project_lite::pin_project! {
+    /// Hack to work around the azure sdk not supporting bodies that can be streamed only once.
+    ///
+    /// Azure sdk clones streams around with the assumption that they are like
+    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
+    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
+    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
+    /// seekable, but it is easier to just retry the whole request.
+    #[project = NonSeekableStreamProj]
+    enum NonSeekableStream<S> {
+        /// The stream wrapper's initial form; it cannot be read, only cloned into `Actual`.
+        ///
+        /// The mutex lets `clone(&self)` move the reader out. If the sdk ever stops cloning
+        /// before the first request, reading from this variant must be implemented.
+        Initial {
+            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
+            len: usize,
+        },
+        /// The actually readable variant, produced by cloning the Initial variant.
+        ///
+        /// The sdk currently always clones once, even without retry policy.
+        Actual {
+            #[pin]
+            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
+            len: usize,
+            read_any: bool, // set by the first `poll_read`; `reset` is refused afterwards
+        },
+        /// Most likely unneeded, but left to make life easier, in case more clones are added.
+        Cloned {
+            len_was: usize, // length reported by the value this one was cloned from
+        }
+    }
+}
+
+impl<S> NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+{
+    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
+        use tokio_util::compat::TokioAsyncReadCompatExt;
+        // Stream of `Bytes` -> tokio `AsyncRead` (`StreamReader`) -> futures `AsyncRead` (`compat`).
+        let inner = tokio_util::io::StreamReader::new(inner).compat();
+        let inner = Some(inner); // `Option` so that `clone` can `take()` the reader out
+        let inner = std::sync::Mutex::new(inner);
+        NonSeekableStream::Initial { inner, len }
+    }
+}
+
+impl<S> std::fmt::Debug for NonSeekableStream<S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self { // no `S: Debug` bound: only the variant name and the length are printed
+            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
+            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
+            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
+        }
+    }
+}
+
+impl<S> futures::io::AsyncRead for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut [u8],
+    ) -> std::task::Poll<std::io::Result<usize>> {
+        match self.project() {
+            NonSeekableStreamProj::Actual {
+                inner, read_any, ..
+            } => {
+                *read_any = true; // set before polling, so even a pending read disables `reset`
+                inner.poll_read(cx, buf)
+            }
+            // `Initial` does not support reading: keeping the reader behind the mutex is much
+            // simpler when it never has to be polled there, or so it seemed originally. If an
+            // sdk upgrade changes when cloning happens, reading support has to be added here.
+            // `Cloned` holds no reader at all, so there is nothing to read from it.
+            //
+            // Including {self:?} in the message would be useful, but unsure how to unproject.
+            _ => std::task::Poll::Ready(Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "cloned or initial values cannot be read",
+            ))),
+        }
+    }
+}
+
+impl<S> Clone for NonSeekableStream<S> {
+    /// Not a real clone: the first clone of `Initial` moves the reader out into `Actual`, every
+    /// other clone yields an unreadable `Cloned`. Exists because the sdk clones before sending.
+    fn clone(&self) -> Self {
+        use NonSeekableStream::*;
+
+        match self {
+            Initial { inner, len } => {
+                if let Some(inner) = inner.lock().unwrap().take() {
+                    Actual {
+                        inner,
+                        len: *len,
+                        read_any: false,
+                    }
+                } else { // the reader was already handed out to an earlier clone
+                    Self::Cloned { len_was: *len }
+                }
+            }
+            Actual { len, .. } => Cloned { len_was: *len },
+            Cloned { len_was } => Cloned { len_was: *len_was },
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl<S> azure_core::SeekableStream for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
+{
+    async fn reset(&mut self) -> azure_core::error::Result<()> {
+        use NonSeekableStream::*;
+        // Reset succeeds (as a no-op) only while nothing was read or moved out; else it errors.
+        let msg = match self {
+            Initial { inner, .. } => {
+                if inner.get_mut().unwrap().is_some() {
+                    return Ok(());
+                } else {
+                    "reset after first clone is not supported"
+                }
+            }
+            Actual { read_any, .. } if !*read_any => return Ok(()),
+            Actual { .. } => "reset after reading is not supported",
+            Cloned { .. } => "reset after second clone is not supported",
+        };
+        Err(azure_core::error::Error::new(
+            azure_core::error::ErrorKind::Io,
+            std::io::Error::new(std::io::ErrorKind::Other, msg),
+        ))
+    }
+
+    // Note: the sdk does not document whether this should be the total or the remaining length;
+    // returning the total passes the tests.
+    fn len(&self) -> usize {
+        use NonSeekableStream::*;
+        match self {
+            Initial { len, .. } => *len,
+            Actual { len, .. } => *len,
+            Cloned { len_was, .. } => *len_was,
+        }
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
 use toml_edit::Item;
 use tracing::info;

@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -300,7 +302,7 @@ impl GenericRemoteStorage {

    pub async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,11 +7,14 @@
 use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -99,27 +102,35 @@ impl LocalFs {
        };

        // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory.  This is because
+        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
                    // It's not a directory: strip back to the parent
                    initial_dir.pop();
                }
-            }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
+                Err(e) if e.kind() == ErrorKind::NotFound => {
+                    // It's not a file that exists: strip the prefix back to the parent directory
+                    initial_dir.pop();
+                }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
            }
        }
-
        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs {

    async fn upload(
        &self,
-        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = data_size_bytes as u64;
+        let data = tokio_util::io::StreamReader::new(data);
+        let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
@@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = io::BufReader::new(
+            let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -340,16 +354,14 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = io::BufReader::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -363,11 +375,13 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
                },
                None => Download {
                    metadata,
-                    download_stream: Box::pin(source),
+                    download_stream: Box::pin(ReaderStream::new(source)),
                },
            })
        } else {
@@ -467,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

+    use bytes::Bytes;
    use camino_tempfile::tempdir;
+    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_assert_remote_file_contents(
@@ -477,7 +493,7 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-        let mut download = storage
+        let download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -486,13 +502,9 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-        let mut contents = String::new();
-        download
-            .download_stream
-            .read_to_string(&mut contents)
-            .await
-            .context("Failed to read remote file contents into string")?;
-        Ok(contents)
+        let contents = aggregate(download.download_stream).await?;
+
+        String::from_utf8(contents).map_err(anyhow::Error::new)
    }

    #[tokio::test]
@@ -521,25 +533,26 @@ mod fs_tests {
        let storage = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
-        let content = std::io::Cursor::new(b"12345");
+        let content = Bytes::from_static(b"12345");
+        let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-            .upload(Box::new(content.clone()), 0, &id, None)
+            .upload(content(), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-            .upload(Box::new(content.clone()), 4, &id, None)
+            .upload(content(), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-            .upload(Box::new(content.clone()), 6, &id, None)
+            .upload(content(), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-        storage.upload(Box::new(content), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None).await?;

        Ok(())
    }
@@ -587,7 +600,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_download = storage
+        let first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -595,21 +608,13 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut first_part_download.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(first_part_download.download_stream).await?;
        assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
+            first_part_local, first_part_remote,
            "First part bytes should be returned when requested"
        );

-        let mut second_part_download = storage
+        let second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -621,17 +626,9 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut second_part_download.download_stream,
-            &mut second_part_remote,
-        )
-        .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
+        let second_part_remote = aggregate(second_part_download.download_stream).await?;
        assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
+            second_part_local, second_part_remote,
            "Second part bytes should be returned when requested"
        );

@@ -721,17 +718,10 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut partial_download_with_metadata = storage
+        let partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut partial_download_with_metadata.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -807,16 +797,16 @@ mod fs_tests {
                )
            })?;

-        storage
-            .upload(Box::new(file), size, &relative_path, metadata)
-            .await?;
+        let file = tokio_util::io::ReaderStream::new(file);
+
+        storage.upload(file, size, &relative_path, metadata).await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
        path: &Utf8Path,
        contents: &str,
-    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+    ) -> anyhow::Result<(fs::File, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -826,7 +816,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+            fs::OpenOptions::new().read(true).open(&path).await?,
            file_size,
        ))
    }
@@ -840,4 +830,16 @@ mod fs_tests {
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
+
+    async fn aggregate(
+        stream: impl Stream<Item = std::io::Result<Bytes>>,
+    ) -> anyhow::Result<Vec<u8>> {
+        use futures::stream::StreamExt;
+        let mut out = Vec::new(); // all chunks are concatenated here, in arrival order
+        let mut stream = std::pin::pin!(stream); // pinned locally since `next()` requires `Unpin`
+        while let Some(res) = stream.next().await {
+            out.extend_from_slice(&res?[..]); // the first stream error aborts the aggregation
+        }
+        Ok(out)
+    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,9 +4,14 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{borrow::Cow, sync::Arc};
+use std::{
+    borrow::Cow,
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};

-use anyhow::Context;
+use anyhow::Context as _;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
@@ -28,11 +33,10 @@ use aws_smithy_async::rt::sleep::TokioSleep;

 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use bytes::Bytes;
+use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
-use tokio_util::io::ReaderStream;
-use tracing::debug;

 use super::StorageMetadata;
 use crate::{
@@ -63,7 +67,7 @@ struct GetObjectRequest {
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        debug!(
+        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );
@@ -225,12 +229,15 @@ impl S3Bucket {
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -243,29 +250,55 @@ impl S3Bucket {
    }
 }

+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream { // adapter: sdk `ByteStream` -> `Stream` of `io::Result<Bytes>`
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream // polled through `project()`
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner } // plain wrap; nothing is read from the body here
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // `into()` turns the sdk error into a `std::io::Error` of kind `Other`
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // `size_hint` is intentionally not forwarded: the inner one reports remaining *bytes*, while
+    // `Stream::size_hint` counts remaining *items* (chunks), so the numbers are not comparable.
+}
+
 pin_project_lite::pin_project! {
-    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    /// A `Stream` adapter which carries a permit for the lifetime of the value.
+    struct PermitCarrying<S> {
         permit: tokio::sync::OwnedSemaphorePermit,
         #[pin]
         inner: S,
     }
 }

-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
     fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner } // the permit is only stored, so it lives as long as this value
     }
 }

-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+    // Pure pass-through: the permit is only held alongside, never consulted while polling.
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }

@@ -285,7 +318,7 @@ pin_project_lite::pin_project! {
    }
 }

-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
@@ -295,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
    }
 }

-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
         let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));

-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx)); // returns `Pending` early, outcome untouched
+        match &res {
+            Some(Ok(_)) => {} // a chunk arrived, the download is still in progress
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok, // end of stream without error
         }

-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }

@@ -403,7 +437,7 @@ impl RemoteStorage for S3Bucket {

    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -413,7 +447,7 @@ impl RemoteStorage for S3Bucket {

        let started_at = start_measuring_requests(kind);

-        let body = Body::wrap_stream(ReaderStream::new(from));
+        let body = Body::wrap_stream(from);
        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let res = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {

    async fn upload(
        &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());

-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());

-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
        let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
        Ok(buf)
    }
    // Normal download request
    let dl = ctx.client.download(&path).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    // Full range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    // partial range (end specified)
    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);

    // partial range (end beyond real end)
    let dl = ctx
        .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);

    // Partial range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);

    // Full range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
@@ -504,11 +500,8 @@ async fn upload_azure_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -589,11 +582,8 @@ async fn upload_simple_azure_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -622,3 +612,32 @@ async fn upload_simple_azure_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
+
+// FIXME: copy-pasted from test_real_s3; find a way to share a module between test files without
+// it being compiled into a test binary of its own
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -432,11 +425,9 @@ async fn upload_s3_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -517,11 +508,9 @@ async fn upload_simple_s3_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -550,3 +539,30 @@ async fn upload_simple_s3_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,16 +1,14 @@
-use std::sync::Arc;
-
-use tokio::sync::{mpsc, Mutex};
+use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(mpsc::Sender<()>);
+pub struct Completion(TaskTrackerToken);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
-pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+pub struct Barrier(TaskTracker);

 impl Default for Barrier {
    fn default() -> Self {
@@ -21,7 +19,7 @@ impl Default for Barrier {

 impl Barrier {
    pub async fn wait(self) {
-        self.0.lock().await.recv().await;
+        self.0.wait().await;
    }

    pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -33,8 +31,7 @@ impl Barrier {

 impl PartialEq for Barrier {
    fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
+        TaskTracker::ptr_eq(&self.0, &other.0)
    }
 }

@@ -42,8 +39,10 @@ impl Eq for Barrier {}

 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
-    let (tx, rx) = mpsc::channel::<()>(1);
-    let rx = Mutex::new(rx);
-    let rx = Arc::new(rx);
-    (Completion(tx), Barrier(rx))
+    let tracker = TaskTracker::new();
+    // the tracker must be closed, otherwise `wait` never completes
+    tracker.close();
+
+    let token = tracker.token();
+    (Completion(token), Barrier(tracker))
 }
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]

 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;

 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
    value: V,

    /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
    /// that the channel is closed. Updaters can use this to wait out until the
    /// RcuCell has been dropped, i.e. until the old value is no longer in use.
    ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }

 impl<V> RcuCell<V> {
    fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
        RcuCell {
            value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
        }
    }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
            // the watches for any that do.
            self.inner.old_cells.retain(|weak| {
                if let Some(cell) = weak.upgrade() {
-                    watches.push(cell.watch.0.clone());
+                    watches.push(cell.watch.subscribe());
                    true
                } else {
                    false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);

 impl RcuWaitList {
    ///
    /// Wait for old readers to finish.
    ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
        // after all the old_cells are no longer in use, we're done
        for w in self.0.iter_mut() {
            // This will block until the Receiver is closed. That happens when
            // the RcuCell is dropped.
            #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                Err(_) => {
                    // closed, which means that the cell has been dropped, and
                    // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
    use std::time::Duration;

-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
        let rcu = Rcu::new(1);

        let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
        assert_eq!(*read1, 1);

        let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
        let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
            log_clone.lock().unwrap().push("wait2 done");
        });
        let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
            log_clone.lock().unwrap().push("wait3 done");
        });

        // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;

        // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() in 'task3' would still block.
        log.lock().unwrap().push("dropping read1");
        drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();

-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;

        // Release second reader, and finish second writer.
        log.lock().unwrap().push("dropping read2");
        drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();

        assert_eq!(
            log.lock().unwrap().as_slice(),
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -30,18 +30,32 @@ async fn warn_if_stuck<Fut: std::future::Future>(

    let mut fut = std::pin::pin!(fut);

-    loop {
+    let mut warned = false;
+    let ret = loop {
        match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
+            Ok(ret) => break ret,
            Err(_) => {
                tracing::warn!(
                    gate = name,
                    elapsed_ms = started.elapsed().as_millis(),
                    "still waiting, taking longer than expected..."
                );
+                warned = true;
            }
        }
+    };
+
+    // If we emitted a warning for slowness, also emit a message when we complete, so that
+    // someone debugging a shutdown can know for sure whether we have moved past this operation.
+    if warned {
+        tracing::info!(
+            gate = name,
+            elapsed_ms = started.elapsed().as_millis(),
+            "completed, after taking longer than expected"
+        )
    }
+
+    ret
 }

 #[derive(Debug)]
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -139,6 +139,8 @@ where
    async fn send_tarball(mut self) -> anyhow::Result<()> {
        // TODO include checksum

+        let on_demand_slru_download = true; // TODO: should this be a feature flag, a config parameter, or something else?
+
        // Create pgdata subdirs structure
        for dir in PGDATA_SUBDIRS.iter() {
            let header = new_tar_header_dir(dir)?;
@@ -165,19 +167,20 @@ where
                    .context("could not add config file to basebackup tarball")?;
            }
        }
-
-        // Gather non-relational files from object storage pages.
-        for kind in [
-            SlruKind::Clog,
-            SlruKind::MultiXactOffsets,
-            SlruKind::MultiXactMembers,
-        ] {
-            for segno in self
-                .timeline
-                .list_slru_segments(kind, self.lsn, self.ctx)
-                .await?
-            {
-                self.add_slru_segment(kind, segno).await?;
+        if !on_demand_slru_download {
+            // Gather non-relational files from object storage pages.
+            for kind in [
+                SlruKind::Clog,
+                SlruKind::MultiXactOffsets,
+                SlruKind::MultiXactMembers,
+            ] {
+                for segno in self
+                    .timeline
+                    .list_slru_segments(kind, self.lsn, self.ctx)
+                    .await?
+                {
+                    self.add_slru_segment(kind, segno).await?;
+                }
            }
        }

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -425,7 +425,6 @@ fn start_pageserver(
    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -560,7 +559,6 @@ fn start_pageserver(
    }

    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
        let metrics_ctx = RequestContext::todo_child(
            TaskKind::MetricsCollection,
            // This task itself shouldn't download anything.
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -269,24 +269,38 @@ async fn calculate_synthetic_size_worker(
            }
        };

-        for (tenant_id, tenant_state) in tenants {
+        for (tenant_shard_id, tenant_state) in tenants {
            if tenant_state != TenantState::Active {
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+            if !tenant_shard_id.is_zero() {
+                // We only send consumption metrics from shard 0, so don't waste time calculating
+                // synthetic size on other shards.
+                continue;
+            }
+
+            if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
-                    if let Some(PageReconstructError::Cancelled) =
-                        e.downcast_ref::<PageReconstructError>()
-                    {
-                        return Ok(());
+                    // This error can be returned if the timeline is shutting down, but it does
+                    // not mean the synthetic size worker should terminate. We do not need any
+                    // checks in this function because `mgr::get_tenant` will error out after
+                    // shutdown has progressed to shutting down tenants.
+                    let is_cancelled = matches!(
+                        e.downcast_ref::<PageReconstructError>(),
+                        Some(PageReconstructError::Cancelled)
+                    );
+
+                    if !is_cancelled {
+                        error!(
+                            "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
+                        );
                    }
-                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
            }
        }
@@ -299,7 +313,7 @@ async fn calculate_synthetic_size_worker(

        let res = tokio::time::timeout_at(
            started_at + synthetic_size_calculation_interval,
-            task_mgr::shutdown_token().cancelled(),
+            cancel.cancelled(),
        )
        .await;
        if res.is_ok() {
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics(
    };

    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-        if state != TenantState::Active {
+        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
                .ok()
-                .map(|tenant| (id, tenant))
+                .map(|tenant| (id.tenant_id, tenant))
        }
    });

@@ -229,11 +228,6 @@ where
    while let Some((tenant_id, tenant)) = tenants.next().await {
        let mut tenant_resident_size = 0;

-        // Sharded tenants report all consumption metrics from shard zero
-        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
-            continue;
-        }
-
        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;

--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -312,7 +312,18 @@ impl ListWriter {
                for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
                    if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                        if attached_gen.previous() == tenant_list.generation {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Updating gen on recovered list");
                            tenant_list.generation = *attached_gen;
+                        } else {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Encountered stale generation on recovered list");
                        }
                    }
                }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -84,7 +84,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    get:
      description: Get tenant status
      responses:
@@ -181,7 +180,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    get:
      description: Get timelines for tenant
      responses:
@@ -232,7 +230,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -338,7 +335,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -401,7 +397,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -469,7 +464,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: timeline_id
        in: path
        required: true
@@ -523,7 +517,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    post:
      description: |
        Schedules attach operation to happen in the background for the given tenant.
@@ -631,7 +624,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: flush_ms
        in: query
        required: false
@@ -724,7 +716,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: detach_ignored
        in: query
        required: false
@@ -784,7 +775,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    post:
      description: |
        Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -833,7 +823,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    post:
      description: |
        Schedules an operation that attempts to load a tenant from the local disk and
@@ -890,7 +879,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    get:
      description: |
        Calculate tenant's synthetic size
@@ -933,7 +921,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
      - name: inputs_only
        in: query
        required: false
@@ -1003,7 +990,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    post:
      description: |
        Create a timeline. Returns new timeline id on success.\
@@ -1137,7 +1123,6 @@ paths:
            application/json:
              schema:
                type: string
-                format: hex
        "400":
          description: Malformed tenant create request
          content:
@@ -1234,7 +1219,6 @@ paths:
        required: true
        schema:
          type: string
-          format: hex
    get:
      description: |
        Returns tenant's config description: specific config overrides a tenant has
@@ -1340,7 +1324,6 @@ components:
          properties:
            new_tenant_id:
              type: string
-              format: hex
            generation:
              type: integer
              description: Attachment generation number.
@@ -1369,7 +1352,6 @@ components:
          properties:
            tenant_id:
              type: string
-              format: hex
    TenantLocationConfigRequest:
      type: object
      required:
@@ -1377,7 +1359,6 @@ components:
      properties:
        tenant_id:
          type: string
-          format: hex
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1446,7 +1427,6 @@ components:
          format: hex
        tenant_id:
          type: string
-          format: hex
        last_record_lsn:
          type: string
          format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -319,6 +319,7 @@ async fn build_timeline_info_common(
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+    let initdb_lsn = timeline.initdb_lsn;
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -352,14 +353,14 @@ async fn build_timeline_info_common(
    let walreceiver_status = timeline.walreceiver_status();

    let info = TimelineInfo {
-        // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
-        tenant_id: timeline.tenant_shard_id.tenant_id,
+        tenant_id: timeline.tenant_shard_id,
        timeline_id: timeline.timeline_id,
        ancestor_timeline_id,
        ancestor_lsn,
        disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
        remote_consistent_lsn: remote_consistent_lsn_projected,
        remote_consistent_lsn_visible,
+        initdb_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -480,15 +481,15 @@ async fn timeline_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -507,7 +508,9 @@ async fn timeline_list_handler(
        }
        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
    }
-    .instrument(info_span!("timeline_list", %tenant_id))
+    .instrument(info_span!("timeline_list",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
    .await?;

    json_response(StatusCode::OK, response_data)
@@ -517,17 +520,17 @@ async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -544,7 +547,10 @@ async fn timeline_detail_handler(

        Ok::<_, ApiError>(timeline_info)
    }
-    .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_detail",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
    .await?;

    json_response(StatusCode::OK, timeline_info)
@@ -554,8 +560,15 @@ async fn get_lsn_by_timestamp_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    if !tenant_shard_id.is_zero() {
+        // Requires SLRU contents, which are only stored on shard zero
+        return Err(ApiError::BadRequest(anyhow!(
+            "LSN lookup by timestamp is only available on shard zero"
+        )));
+    }

    let version: Option<u8> = parse_query_param(&request, "version")?;

@@ -567,7 +580,7 @@ async fn get_lsn_by_timestamp_handler(
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -602,8 +615,15 @@ async fn get_timestamp_of_lsn_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    if !tenant_shard_id.is_zero() {
+        // Requires SLRU contents, which are only stored on shard zero
+        return Err(ApiError::BadRequest(anyhow!(
+            "Timestamp lookup by LSN is only available on shard zero"
+        )));
+    }

    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;

@@ -613,7 +633,7 @@ async fn get_timestamp_of_lsn_handler(
        .map_err(ApiError::BadRequest)?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -709,6 +729,26 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

+async fn tenant_reset_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    state
+        .tenant_manager
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -785,11 +825,11 @@ async fn tenant_status(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -799,13 +839,15 @@ async fn tenant_status(

        let state = tenant.current_state();
        Result::<_, ApiError>::Ok(TenantInfo {
-            id: tenant_id,
+            id: tenant_shard_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
            attachment_status: state.attachment_status(),
        })
    }
-    .instrument(info_span!("tenant_status_handler", %tenant_id))
+    .instrument(info_span!("tenant_status_handler",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
    .await?;

    json_response(StatusCode::OK, tenant_info)
@@ -824,7 +866,7 @@ async fn tenant_delete_handler(
    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
-            shard = tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
        ))
        .await?;

@@ -848,14 +890,20 @@ async fn tenant_size_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+
+    if !tenant_shard_id.is_zero() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Size calculations are only available on shard zero"
+        )));
+    }

    // this can be long operation
    let inputs = tenant
@@ -907,7 +955,7 @@ async fn tenant_size_handler(
    json_response(
        StatusCode::OK,
        TenantHistorySize {
-            id: tenant_id,
+            id: tenant_shard_id.tenant_id,
            size: sizes.as_ref().map(|x| x.total_size),
            segment_sizes: sizes.map(|x| x.segments),
            inputs,
@@ -919,14 +967,14 @@ async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
        parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);

-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
@@ -936,13 +984,12 @@ async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let downloaded = timeline
        .download_layer(layer_file_name)
        .await
@@ -953,7 +1000,7 @@ async fn layer_download_handler(
        Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
        None => json_response(
            StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
        ),
    }
 }
@@ -962,12 +1009,12 @@ async fn evict_timeline_layer_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;

-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let evicted = timeline
        .evict_layer(layer_file_name)
        .await
@@ -978,7 +1025,7 @@ async fn evict_timeline_layer_handler(
        Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
        None => json_response(
            StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
        ),
    }
 }
@@ -1110,10 +1157,10 @@ async fn get_tenant_config_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, false)?;

    let response = HashMap::from([
        (
@@ -1173,7 +1220,7 @@ async fn put_tenant_location_config_handler(
            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                .instrument(info_span!("tenant_detach",
                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard = tenant_shard_id.shard_slug()
+                    shard = %tenant_shard_id.shard_slug()
                ))
                .await
        {
@@ -1207,9 +1254,9 @@ async fn handle_tenant_break(
    r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;

-    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1250,14 +1297,15 @@ async fn timeline_gc_handler(
    mut request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done =
+        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -1272,9 +1320,9 @@ async fn timeline_compact_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1282,14 +1330,14 @@ async fn timeline_compact_handler(
    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
 }

@@ -1298,9 +1346,9 @@ async fn timeline_checkpoint_handler(
    request: Request<Body>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1308,7 +1356,7 @@ async fn timeline_checkpoint_handler(
    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
        timeline
            .freeze_and_flush()
            .await
@@ -1320,7 +1368,7 @@ async fn timeline_checkpoint_handler(

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
 }

@@ -1328,12 +1376,12 @@ async fn timeline_download_remote_layers_handler_post(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    match timeline.spawn_download_all_remote_layers(body).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1344,11 +1392,11 @@ async fn timeline_download_remote_layers_handler_get(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;

-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
@@ -1394,9 +1442,9 @@ async fn getpage_at_lsn_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    struct Key(crate::repository::Key);

@@ -1415,7 +1463,7 @@ async fn getpage_at_lsn_handler(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;

        let page = timeline.get(key.0, lsn, &ctx).await?;

@@ -1427,7 +1475,7 @@ async fn getpage_at_lsn_handler(
                .unwrap(),
        )
    }
-    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
 }

@@ -1435,9 +1483,9 @@ async fn timeline_collect_keyspace(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    struct Partitioning {
        keys: crate::keyspace::KeySpace,
@@ -1506,7 +1554,7 @@ async fn timeline_collect_keyspace(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -1515,15 +1563,15 @@ async fn timeline_collect_keyspace(

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
-    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
 }

 async fn active_timeline_of_active_tenant(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1800,23 +1848,25 @@ pub fn make_router(
        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
-        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .get("/v1/tenant/:tenant_shard_id", |r| {
+            api_handler(r, tenant_status)
+        })
        .delete("/v1/tenant/:tenant_shard_id", |r| {
            api_handler(r, tenant_delete_handler)
        })
-        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
+        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
-        .get("/v1/tenant/:tenant_id/config", |r| {
+        .get("/v1/tenant/:tenant_shard_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline", |r| {
+        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
        .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
@@ -1828,53 +1878,59 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
+        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
+            api_handler(r, tenant_reset_handler)
+        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
-        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            api_handler(r, timeline_gc_handler)
-        })
-        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
-            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
-        })
        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
+            |r| api_handler(r, timeline_gc_handler),
+        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
+            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
+        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
        )
        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_post),
        )
        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            api_handler(r, layer_map_info_handler)
-        })
        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
+            |r| api_handler(r, layer_map_info_handler),
+        )
+        .get(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, layer_download_handler),
        )
        .delete(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
        .put("/v1/disk_usage_eviction/run", |r| {
@@ -1883,18 +1939,19 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .put("/v1/tenant/:tenant_id/break", |r| {
+        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
        .get("/v1/panic", |r| api_handler(r, always_panic_handler))
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
-            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
-        })
        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
+            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
+        )
+        .get(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
        )
        .any(handler_404))
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,9 +2,8 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
+use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
-use std::pin::Pin;
-use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
 use async_compression::tokio::bufread::ZstdDecoder;
@@ -13,7 +12,8 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::fs::{File, OpenOptions};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -629,70 +629,16 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    Ok(Bytes::from(buf))
 }

-/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
-///
-/// The number of yields is bounded by above by the number of times poll_write is called,
-/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
-/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
-/// breathing room between units of CPU intensive preparation of buffers to be written.
-/// Once a write call is issued, the whole buffer has been prepared already, so there is no
-/// gain in splitting up the memcopy further.
-struct YieldingVec {
-    yield_budget: usize,
-    // the buffer written into
-    buf: Vec<u8>,
-}
+pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tmp_path)
+        .await
+        .with_context(|| format!("tempfile creation {tmp_path}"))?;

-impl YieldingVec {
-    fn new() -> Self {
-        Self {
-            yield_budget: 0,
-            buf: Vec::new(),
-        }
-    }
-    // Whether we should yield for a read operation of given size
-    fn should_yield(&mut self, add_buf_len: usize) -> bool {
-        // Set this limit to a small value so that we are a
-        // good async citizen and yield repeatedly (but not
-        // too often for many small writes to cause many yields)
-        const YIELD_DIST: usize = 1024;
-
-        let target_buf_len = self.buf.len() + add_buf_len;
-        let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
-        if self.yield_budget < target_buf_len {
-            self.yield_budget += add_buf_len;
-        }
-        ret
-    }
-}
-
-impl AsyncWrite for YieldingVec {
-    fn poll_write(
-        mut self: Pin<&mut Self>,
-        cx: &mut task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<std::io::Result<usize>> {
-        if self.should_yield(buf.len()) {
-            cx.waker().wake_by_ref();
-            return Poll::Pending;
-        }
-        self.get_mut().buf.extend_from_slice(buf);
-        Poll::Ready(Ok(buf.len()))
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        _cx: &mut task::Context<'_>,
-    ) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-}
-
-pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    let mut paths = Vec::new();
    for entry in WalkDir::new(pgdata_path) {
        let entry = entry?;
@@ -707,7 +653,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
-        YieldingVec::new(),
+        file,
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
@@ -725,13 +671,14 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    }
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
-    let compressed = zstd.into_inner();
-    let compressed_len = compressed.buf.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
    }
-    Ok(compressed.buf)
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
 }

 pub async fn extract_tar_zst(
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -205,7 +205,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -213,7 +213,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -222,7 +222,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -285,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

+pub(crate) mod page_cache_eviction_metrics {
+    use std::num::NonZeroUsize;
+
+    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
+    use once_cell::sync::Lazy;
+
+    #[derive(Clone, Copy)]
+    pub(crate) enum Outcome {
+        FoundSlotUnused { iters: NonZeroUsize },
+        FoundSlotEvicted { iters: NonZeroUsize },
+        ItersExceeded { iters: NonZeroUsize },
+    }
+
+    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_iters_total",
+            "Counter for the number of iterations in the find_victim loop",
+            &["outcome"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_calls",
+            "Incremented at the end of each find_victim() call. \
+             Filter by outcome to get e.g., eviction rate.",
+            &["outcome"]
+        )
+        .expect("failed to define a metric")
+    });
+
+    pub(crate) fn observe(outcome: Outcome) {
+        macro_rules! dry {
+            ($label:literal, $iters:expr) => {{
+                static LABEL: &'static str = $label;
+                static ITERS_TOTAL: Lazy<IntCounter> =
+                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
+                static CALLS: Lazy<IntCounter> =
+                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
+                ITERS_TOTAL.inc_by(($iters.get()) as u64);
+                CALLS.inc();
+            }};
+        }
+        match outcome {
+            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
+            Outcome::FoundSlotEvicted { iters } => {
+                dry!("found_evicted", iters)
+            }
+            Outcome::ItersExceeded { iters } => {
+                dry!("err_iters_exceeded", iters);
+                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
+            }
+        }
+    }
+}
+
 pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -294,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_page_cache_find_victim_iters_total",
-        "Counter for the number of iterations in the find_victim loop",
-    )
-    .expect("failed to define a metric")
-});
-
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "page_cache_errors_total",
@@ -601,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
        "pageserver_evictions_with_low_residence_duration",
        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
         Residence duration is determined using the `residence_duration_data_source`.",
-        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+        &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
    )
    .expect("failed to define a metric")
 });
@@ -665,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder {
        }
    }

-    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+    fn build(
+        &self,
+        tenant_id: &str,
+        shard_id: &str,
+        timeline_id: &str,
+    ) -> EvictionsWithLowResidenceDuration {
        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
            .get_metric_with_label_values(&[
                tenant_id,
+                shard_id,
                timeline_id,
                self.data_source,
                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -699,21 +754,24 @@ impl EvictionsWithLowResidenceDuration {
    pub fn change_threshold(
        &mut self,
        tenant_id: &str,
+        shard_id: &str,
        timeline_id: &str,
        new_threshold: Duration,
    ) {
        if new_threshold == self.threshold {
            return;
        }
-        let mut with_new =
-            EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
-                .build(tenant_id, timeline_id);
+        let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
+            self.data_source,
+            new_threshold,
+        )
+        .build(tenant_id, shard_id, timeline_id);
        std::mem::swap(self, &mut with_new);
-        with_new.remove(tenant_id, timeline_id);
+        with_new.remove(tenant_id, shard_id, timeline_id);
    }

    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
-    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+    fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
        let Some(_counter) = self.counter.take() else {
            return;
        };
@@ -722,6 +780,7 @@ impl EvictionsWithLowResidenceDuration {

        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
            tenant_id,
+            shard_id,
            timeline_id,
            self.data_source,
            &threshold,
@@ -774,6 +833,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 )]
 pub(crate) enum StorageIoOperation {
    Open,
+    OpenAfterReplace,
    Close,
    CloseByReplace,
    Read,
@@ -787,6 +847,7 @@ impl StorageIoOperation {
    pub fn as_str(&self) -> &'static str {
        match self {
            StorageIoOperation::Open => "open",
+            StorageIoOperation::OpenAfterReplace => "open-after-replace",
            StorageIoOperation::Close => "close",
            StorageIoOperation::CloseByReplace => "close-by-replace",
            StorageIoOperation::Read => "read",
@@ -841,6 +902,25 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) mod virtual_file_descriptor_cache {
+    use super::*;
+
+    pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
+        register_uint_gauge!(
+            "pageserver_virtual_file_descriptor_cache_size_max",
+            "Maximum number of open file descriptors in the cache."
+        )
+        .unwrap()
+    });
+
+    // SIZE_CURRENT is not exported here; derive it as opens minus closes, like so:
+    // ```
+    // sum(pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$"})
+    // -ignoring(operation)
+    // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"})
+    // ```
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
    global: Histogram,
@@ -881,6 +961,7 @@ pub enum SmgrQueryType {
    GetRelSize,
    GetPageAtLsn,
    GetDbSize,
+    GetSlruSegment,
 }

 #[derive(Debug)]
@@ -950,6 +1031,7 @@ mod smgr_query_time_tests {
            (GetRelSize, "get_rel_size"),
            (GetPageAtLsn, "get_page_at_lsn"),
            (GetDbSize, "get_db_size"),
+            (GetSlruSegment, "get_slru_segment"),
        ];
        for (op, expect) in expect {
            let actual: &'static str = op.into();
@@ -1176,7 +1258,7 @@ pub(crate) struct WalIngestMetrics {
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
    records_received: register_int_counter!(
        "pageserver_wal_ingest_records_received",
-        "Number of WAL records received from safekeeper"
+        "Number of WAL records received from safekeepers"
    )
    .expect("failed to define a metric"),
    records_committed: register_int_counter!(
@@ -1533,6 +1615,7 @@ impl StorageTimeMetrics {
 #[derive(Debug)]
 pub struct TimelineMetrics {
    tenant_id: String,
+    shard_id: String,
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
@@ -1553,11 +1636,12 @@ pub struct TimelineMetrics {

 impl TimelineMetrics {
    pub fn new(
-        tenant_id: &TenantId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
    ) -> Self {
-        let tenant_id = tenant_id.to_string();
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1594,11 +1678,12 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let evictions_with_low_residence_duration =
-            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
+        let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
+            .build(&tenant_id, &shard_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
+            shard_id,
            timeline_id,
            flush_time_histo,
            compact_time_histo,
@@ -1644,6 +1729,7 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
+        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1657,7 +1743,7 @@ impl Drop for TimelineMetrics {
        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
-            .remove(tenant_id, timeline_id);
+            .remove(tenant_id, shard_id, timeline_id);

        // The following metrics are born outside of the TimelineMetrics lifecycle but still
        // removed at the end of it. The idea is to have the metrics outlive the
@@ -2118,6 +2204,8 @@ pub fn preinitialize_metrics() {
    // Tenant manager stats
    Lazy::force(&TENANT_MANAGER);

+    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -28,7 +28,7 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is  [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! The cache key for **materialized pages** is  [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
 //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
 //!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,12 +83,14 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};

-use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{
+    context::RequestContext,
+    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
+};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -150,7 +152,13 @@ enum CacheKey {

 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MaterializedPageHashKey {
-    tenant_id: TenantId,
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
+    /// this not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    key: Key,
 }
@@ -374,7 +382,7 @@ impl PageCache {
    /// returned page.
    pub async fn lookup_materialized_page(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key: &Key,
        lsn: Lsn,
@@ -391,7 +399,7 @@ impl PageCache {

        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                timeline_id,
                key: *key,
            },
@@ -432,7 +440,7 @@ impl PageCache {
    ///
    pub async fn memorize_materialized_page(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key: Key,
        lsn: Lsn,
@@ -440,7 +448,7 @@ impl PageCache {
    ) -> anyhow::Result<()> {
        let cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                timeline_id,
                key,
            },
@@ -897,8 +905,10 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            crate::metrics::page_cache_errors_inc(
-                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            page_cache_eviction_metrics::observe(
+                                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                                    iters: iters.try_into().unwrap(),
+                                },
                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
@@ -909,8 +919,18 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                } else {
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
                }
-                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,7 +19,8 @@ use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
-    PagestreamNblocksRequest, PagestreamNblocksResponse,
+    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
+    PagestreamNblocksResponse,
 };
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
@@ -64,12 +65,13 @@ use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

+use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may wait for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

 /// Read the end of a tar archive.
 ///
@@ -518,6 +520,16 @@ impl PageServerHandler {
                        span,
                    )
                }
+                PagestreamFeMessage::GetSlruSegment(req) => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetSlruSegment);
+                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_slru_segment_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
+                }
            };

            if let Err(e) = &response {
@@ -862,6 +874,25 @@ impl PageServerHandler {
        }))
    }

+    /// Serve a whole SLRU segment; fails with "invalid SLRU kind" if `req.kind` is unknown.
+    async fn handle_get_slru_segment_request(
+        &self,
+        timeline: &Timeline,
+        req: &PagestreamGetSlruSegmentRequest,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<PagestreamBeMessage> {
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+        let kind = SlruKind::from_repr(req.kind)
+            .ok_or_else(|| anyhow::anyhow!("invalid SLRU kind"))?;
+        let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
+        Ok(PagestreamBeMessage::GetSlruSegment(
+            PagestreamGetSlruSegmentResponse { segment },
+        ))
+    }
+
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
-use bytes::{Buf, Bytes};
+use bytes::{Buf, Bytes, BytesMut};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -314,6 +314,25 @@ impl Timeline {
        }
    }

+    /// Get the whole SLRU segment at `lsn`: all of its blocks, concatenated in block order.
+    pub async fn get_slru_segment(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        let n_blocks = self.get_slru_segment_size(kind, segno, lsn, ctx).await?;
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for blkno in 0..n_blocks {
+            let block = self
+                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
+                .await?;
+            segment.extend_from_slice(&block[..BLCKSZ as usize]);
+        }
+        Ok(segment.freeze())
+    }
+
    /// Look up given SLRU page version.
    pub async fn get_slru_page_at_lsn(
        &self,
@@ -822,10 +841,7 @@ impl<'a> DatadirModification<'a> {
        self.put(DBDIR_KEY, Value::Image(buf.into()));

        // Create AuxFilesDirectory
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        self.init_aux_dir()?;

        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
@@ -933,10 +949,7 @@ impl<'a> DatadirModification<'a> {
            self.put(DBDIR_KEY, Value::Image(buf.into()));

            // Create AuxFilesDirectory as well
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: HashMap::new(),
-            })?;
-            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            self.init_aux_dir()?;
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1261,6 +1274,14 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        Ok(())
+    }
+
    pub async fn put_file(
        &mut self,
        path: &str,
@@ -1767,6 +1788,13 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+/// Returns true for every key except `AUX_FILES_KEY`. AUX_FILES currently stores only data for
+/// logical replication (slots etc.), and we don't preserve these on a branch because safekeepers
+/// can't follow a timeline switch (and generally it likely should be optional), so ignore these.
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};

 use futures::FutureExt;
+use pageserver_api::shard::TenantShardId;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
@@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn};

 use once_cell::sync::Lazy;

-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 use crate::shutdown_pageserver;

@@ -317,7 +318,7 @@ struct PageServerTask {

    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,

    mutable: Mutex<MutableTaskState>,
@@ -329,7 +330,7 @@ struct PageServerTask {
 pub fn spawn<F>(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
    shutdown_process_on_error: bool,
@@ -345,7 +346,7 @@ where
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
-        tenant_id,
+        tenant_shard_id,
        timeline_id,
        mutable: Mutex::new(MutableTaskState { join_handle: None }),
    });
@@ -424,28 +425,28 @@ async fn task_finish(
            Ok(Err(err)) => {
                if shutdown_process_on_error {
                    error!(
-                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
-                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                    );
                }
            }
            Err(err) => {
                if shutdown_process_on_error {
                    error!(
-                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
-                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                    );
                }
            }
@@ -467,11 +468,11 @@ async fn task_finish(
 ///
 /// Or to shut down all tasks for given timeline:
 ///
-///   shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
+///   shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
 ///
 pub async fn shutdown_tasks(
    kind: Option<TaskKind>,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
 ) {
    let mut victim_tasks = Vec::new();
@@ -480,35 +481,35 @@ pub async fn shutdown_tasks(
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
-                    task.tenant_id,
+                    task.tenant_shard_id,
                    task.timeline_id,
                ));
            }
        }
    }

-    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();

-    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
+    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
-                if tenant_id.is_none() {
+                if tenant_shard_id.is_none() {
                    // there are quite few of these
                    info!(name = task.name, kind = ?task_kind, "stopping global task");
                } else {
                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -517,12 +518,13 @@ pub async fn shutdown_tasks(
            {
                // allow some time to elapse before logging to cut down the number of log
                // lines.
-                info!("waiting for {} to shut down", task.name);
+                info!("waiting for task {} to shut down", task.name);
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
                // - task errors are already logged in the wrapper
                let _ = join_handle.await;
+                info!("task {} completed", task.name);
            }
        } else {
            // Possibly one of:
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,7 +12,6 @@
 //!

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
@@ -69,6 +68,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::InitializationOrder;
@@ -608,7 +608,7 @@ impl Tenant {
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            TaskKind::Attach,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
            None,
            "attach tenant",
            false,
@@ -1917,7 +1917,7 @@ impl Tenant {
        //
        // this will additionally shutdown and await all timeline tasks.
        tracing::debug!("Waiting for tasks...");
-        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await;
+        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;

        // Wait for any in-flight operations to complete
        self.gate.close().await;
@@ -2515,7 +2515,7 @@ impl Tenant {
            }
        }

-        info!("persisting tenantconf to {config_path}");
+        debug!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2550,7 +2550,7 @@ impl Tenant {
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
-        info!("persisting tenantconf to {target_config_path}");
+        debug!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2949,10 +2949,10 @@ impl Tenant {
        };
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.
+
+        let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
        let pgdata_path = path_with_suffix_extension(
-            self.conf
-                .timelines_path(&self.tenant_shard_id)
-                .join(format!("basebackup-{timeline_id}")),
+            timelines_path.join(format!("basebackup-{timeline_id}")),
            TEMP_FILE_SUFFIX,
        );

@@ -2983,31 +2983,43 @@ impl Tenant {
                )
                .await
                .context("download initdb tar")?;
-            let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
+            let buf_read =
+                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;

-            if initdb_tar_zst_path.exists() {
-                tokio::fs::remove_file(&initdb_tar_zst_path)
-                    .await
-                    .context("tempfile removal")?;
-            }
+            tokio::fs::remove_file(&initdb_tar_zst_path)
+                .await
+                .or_else(|e| {
+                    if e.kind() == std::io::ErrorKind::NotFound {
+                        // If something else already removed the file, ignore the error
+                        Ok(())
+                    } else {
+                        Err(e)
+                    }
+                })
+                .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
        } else {
            // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

            // Upload the created data dir to S3
            if let Some(storage) = &self.remote_storage {
-                let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
-                let pgdata_zstd = Bytes::from(pgdata_zstd);
+                let temp_path = timelines_path.join(format!(
+                    "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
+                ));
+
+                let (pgdata_zstd, tar_zst_size) =
+                    import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
                backoff::retry(
                    || async {
                        self::remote_timeline_client::upload_initdb_dir(
                            storage,
                            &self.tenant_shard_id.tenant_id,
                            &timeline_id,
-                            pgdata_zstd.clone(),
+                            pgdata_zstd.try_clone().await?,
+                            tar_zst_size,
                        )
                        .await
                    },
@@ -3019,6 +3031,18 @@ impl Tenant {
                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
                )
                .await?;
+
+                tokio::fs::remove_file(&temp_path)
+                    .await
+                    .or_else(|e| {
+                        if e.kind() == std::io::ErrorKind::NotFound {
+                            // If something else already removed the file, ignore the error
+                            Ok(())
+                        } else {
+                            Err(e)
+                        }
+                    })
+                    .with_context(|| format!("tempfile removal {temp_path}"))?;
            }
        }
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -77,8 +77,10 @@ async fn create_remote_delete_mark(
    let data: &[u8] = &[];
    backoff::retry(
        || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
-                .upload(data, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None)
                .await
        },
        |_e| false,
@@ -461,7 +463,7 @@ impl DeleteTenantFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
            None,
            "tenant_delete",
            false,
@@ -548,7 +550,7 @@ impl DeleteTenantFlow {
                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
                let barrier = {
                    let mut locked = tenants.write().unwrap();
-                    let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
+                    let removed = locked.remove(tenant.tenant_shard_id);

                    // FIXME: we should not be modifying this from outside of mgr.rs.
                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -98,33 +98,6 @@ pub(crate) enum TenantsMap {
    ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }

-/// Helper for mapping shard-unaware functions to a sharding-aware map
-/// TODO(sharding): all users of this must be made shard-aware.
-fn exactly_one_or_none<'a>(
-    map: &'a BTreeMap<TenantShardId, TenantSlot>,
-    tenant_id: &TenantId,
-) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
-    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
-
-    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
-    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
-    let slot_a = slots.next();
-    let slot_b = slots.next();
-    match (slot_a, slot_b) {
-        (None, None) => None,
-        (Some(slot), None) => {
-            // Exactly one matching slot
-            Some(slot)
-        }
-        (Some(_slot_a), Some(_slot_b)) => {
-            // Multiple shards for this tenant: cannot handle this yet.
-            // TODO(sharding): callers of get() should be shard-aware.
-            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
-        }
-        (None, Some(_)) => unreachable!(),
-    }
-}
-
 pub(crate) enum TenantsMapRemoveResult {
    Occupied(TenantSlot),
    Vacant,
@@ -147,12 +120,11 @@ impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                // TODO(sharding): callers of get() should be shard-aware.
-                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
+                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
            }
        }
    }
@@ -204,25 +176,19 @@ impl TenantsMap {
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
    /// slot if the enclosed tenant is shutdown.
-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
+    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
        use std::collections::btree_map::Entry;
        match self {
            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
-                match key {
-                    Some(key) => match m.entry(key) {
-                        Entry::Occupied(entry) => match entry.get() {
-                            TenantSlot::InProgress(barrier) => {
-                                TenantsMapRemoveResult::InProgress(barrier.clone())
-                            }
-                            _ => TenantsMapRemoveResult::Occupied(entry.remove()),
-                        },
-                        Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
-                    },
-                    None => TenantsMapRemoveResult::Vacant,
-                }
-            }
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
+                Entry::Occupied(entry) => match entry.get() {
+                    TenantSlot::InProgress(barrier) => {
+                        TenantsMapRemoveResult::InProgress(barrier.clone())
+                    }
+                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                },
+                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+            },
        }
    }

@@ -270,49 +236,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

-/// Create a directory, including parents.  This does no fsyncs and makes
-/// no guarantees about the persistence of the resulting metadata: for
-/// use when creating dirs for use as cache.
-async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
-    let mut dirs_to_create = Vec::new();
-    let mut path: &Utf8Path = path.as_ref();
-
-    // Figure out which directories we need to create.
-    loop {
-        let meta = tokio::fs::metadata(path).await;
-        match meta {
-            Ok(metadata) if metadata.is_dir() => break,
-            Ok(_) => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {path}"),
-                ));
-            }
-            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(e),
-        }
-
-        dirs_to_create.push(path);
-
-        match path.parent() {
-            Some(parent) => path = parent,
-            None => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::InvalidInput,
-                    format!("can't find parent of path '{path}'"),
-                ));
-            }
-        }
-    }
-
-    // Create directories from parent to child.
-    for &path in dirs_to_create.iter().rev() {
-        tokio::fs::create_dir(path).await?;
-    }
-
-    Ok(())
-}
-
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -646,7 +569,13 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!("Attaching tenant {tenant_shard_id}");
+    info!(
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        generation = ?location_conf.location.generation,
+        attach_mode = ?location_conf.location.attach_mode,
+        "Attaching tenant"
+    );
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
@@ -859,14 +788,16 @@ pub(crate) async fn set_new_tenant_config(
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
 ) -> Result<(), SetNewTenantConfigError> {
+    // Legacy API: does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_id, true)?;
+    let tenant = get_tenant(tenant_shard_id, true)?;

    // This is a legacy API that only operates on attached tenants: the preferred
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
    let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
        .await
@@ -1035,7 +966,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                unsafe_create_dir_all(&tenant_path)
+                tokio::fs::create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -1051,7 +982,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                unsafe_create_dir_all(&timelines_path)
+                tokio::fs::create_dir_all(&timelines_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -1081,6 +1012,81 @@ impl TenantManager {

        Ok(())
    }
+
+    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
+    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
+    /// dropped before re-attaching.
+    ///
+    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
+    /// where an issue is identified that would go away with a restart of the tenant.
+    ///
+    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
+    /// to respect the cancellation tokens used in normal shutdown().
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
+    pub(crate) async fn reset_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+        drop_cache: bool,
+        ctx: RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let Some(old_slot) = slot_guard.get_old_value() else {
+            anyhow::bail!("Tenant not found when trying to reset");
+        };
+
+        let Some(tenant) = old_slot.get_attached() else {
+            slot_guard.revert();
+            anyhow::bail!("Tenant is not in attached state");
+        };
+
+        let (_guard, progress) = utils::completion::channel();
+        match tenant.shutdown(progress, false).await {
+            Ok(()) => {
+                slot_guard.drop_old_value()?;
+            }
+            Err(_barrier) => {
+                slot_guard.revert();
+                anyhow::bail!("Cannot reset Tenant, already shutting down");
+            }
+        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+        if drop_cache {
+            tracing::info!("Dropping local file cache");
+
+            match tokio::fs::read_dir(&timelines_path).await {
+                Err(e) => {
+                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
+                }
+                Ok(mut entries) => {
+                    while let Some(entry) = entries.next_entry().await? {
+                        tokio::fs::remove_dir_all(entry.path()).await?;
+                    }
+                }
+            }
+        }
+
+        let shard_identity = config.shard;
+        let tenant = tenant_spawn(
+            self.conf,
+            tenant_shard_id,
+            &tenant_path,
+            self.resources.clone(),
+            AttachedTenantConf::try_from(config)?,
+            shard_identity,
+            None,
+            self.tenants,
+            SpawnMode::Normal,
+            &ctx,
+        )?;
+
+        slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+        Ok(())
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1105,14 +1111,11 @@ pub(crate) enum GetTenantError {
 ///
 /// This method is cancel-safe.
 pub(crate) fn get_tenant(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
    let locked = TENANTS.read().unwrap();

-    // TODO(sharding): make all callers of get_tenant shard-aware
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

    match peek_slot {
@@ -1124,14 +1127,18 @@ pub(crate) fn get_tenant(
            TenantState::Active => Ok(Arc::clone(tenant)),
            _ => {
                if active_only {
-                    Err(GetTenantError::NotActive(tenant_id))
+                    Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
                } else {
                    Ok(Arc::clone(tenant))
                }
            }
        },
-        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
-        None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
+        Some(TenantSlot::InProgress(_)) => {
+            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+        }
+        None | Some(TenantSlot::Secondary) => {
+            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+        }
    }
 }

@@ -1289,8 +1296,7 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let mut slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1505,7 +1511,8 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
+{
    let tenants = TENANTS.read().unwrap();
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1513,12 +1520,10 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
    };
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
+            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
            TenantSlot::Secondary => None,
            TenantSlot::InProgress(_) => None,
        })
-        // TODO(sharding): make callers of this function shard-aware
-        .map(|(k, v)| (k.tenant_id, v))
        .collect())
 }

@@ -1617,9 +1622,10 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
+    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1679,9 +1685,9 @@ impl SlotGuard {
        }
    }

-    /// Take any value that was present in the slot before we acquired ownership
+    /// Get any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&mut self) -> &Option<TenantSlot> {
+    fn get_old_value(&self) -> &Option<TenantSlot> {
        &self.old_value
    }

@@ -1899,7 +1905,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
@@ -2051,21 +2057,19 @@ use {
 };

 pub(crate) async fn immediate_gc(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
    cancel: CancellationToken,
    ctx: &RequestContext,
 ) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
    let guard = TENANTS.read().unwrap();
-    let tenant = guard
-        .get(&tenant_id)
-        .map(Arc::clone)
-        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;

-    // TODO(sharding): make callers of this function shard-aware
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let tenant = guard
+        .get(&tenant_shard_id)
+        .map(Arc::clone)
+        .with_context(|| format!("tenant {tenant_shard_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
@@ -2078,9 +2082,9 @@ pub(crate) async fn immediate_gc(
    task_mgr::spawn(
        &tokio::runtime::Handle::current(),
        TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
        Some(timeline_id),
-        &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
+        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
        false,
        async move {
            fail::fail_point!("immediate_gc_task_pre");
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -254,6 +254,9 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";

+/// Default buffer size when interfacing with [`tokio::fs::File`].
+pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
+
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -1220,7 +1223,7 @@ impl RemoteTimelineClient {
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id.tenant_id),
+                Some(self.tenant_shard_id),
                Some(self.timeline_id),
                "remote upload",
                false,
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -75,12 +75,11 @@ pub async fn download_layer_file<'a>(

    let (mut destination_file, bytes_amount) = download_retry(
        || async {
-            // TODO: this doesn't use the cached fd for some reason?
-            let mut destination_file = fs::File::create(&temp_file_path)
+            let destination_file = tokio::fs::File::create(&temp_file_path)
                .await
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
-            let mut download = storage
+            let download = storage
                .download(&remote_path)
                .await
                .with_context(|| {
@@ -90,9 +89,14 @@ pub async fn download_layer_file<'a>(
                })
                .map_err(DownloadError::Other)?;

+            let mut destination_file =
+                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
+
+            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
            let bytes_amount = tokio::time::timeout(
                MAX_DOWNLOAD_DURATION,
-                tokio::io::copy(&mut download.download_stream, &mut destination_file),
+                tokio::io::copy_buf(&mut reader, &mut destination_file),
            )
            .await
            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
@@ -103,6 +107,8 @@ pub async fn download_layer_file<'a>(
            })
            .map_err(DownloadError::Other)?;

+            let destination_file = destination_file.into_inner();
+
            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
@@ -220,20 +226,22 @@ async fn do_download_index_part(
    index_generation: Generation,
    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
+    use futures::stream::StreamExt;
+
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
        || async {
-            let mut index_part_download = storage.download(&remote_path).await?;
+            let index_part_download = storage.download(&remote_path).await?;

            let mut index_part_bytes = Vec::new();
-            tokio::io::copy(
-                &mut index_part_download.download_stream,
-                &mut index_part_bytes,
-            )
-            .await
-            .with_context(|| format!("download index part at {remote_path:?}"))
-            .map_err(DownloadError::Other)?;
+            let mut stream = std::pin::pin!(index_part_download.download_stream);
+            while let Some(chunk) = stream.next().await {
+                let chunk = chunk
+                    .with_context(|| format!("download index part at {remote_path:?}"))
+                    .map_err(DownloadError::Other)?;
+                index_part_bytes.extend_from_slice(&chunk[..]);
+            }
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
@@ -394,11 +402,13 @@ pub(crate) async fn download_initdb_tar_zst(
            .with_context(|| format!("timeline dir creation {timeline_path}"))
            .map_err(DownloadError::Other)?;
    }
-    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
+    let temp_path = timeline_path.join(format!(
+        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
+    ));

    let file = download_retry(
        || async {
-            let mut file = OpenOptions::new()
+            let file = OpenOptions::new()
                .create(true)
                .truncate(true)
                .read(true)
@@ -408,13 +418,17 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let mut download = storage.download(&remote_path).await?;
+            let download = storage.download(&remote_path).await?;
+            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
+            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

-            tokio::io::copy(&mut download.download_stream, &mut file)
+            tokio::io::copy_buf(&mut download, &mut writer)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
                .map_err(DownloadError::Other)?;

+            let mut file = writer.into_inner();
+
            file.seek(std::io::SeekFrom::Start(0))
                .await
                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
@@ -426,10 +440,10 @@ pub(crate) async fn download_initdb_tar_zst(
    )
    .await
    .map_err(|e| {
-        if temp_path.exists() {
-            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
-            // We don't have async here nor do we want to pile on any extra errors.
-            if let Err(e) = std::fs::remove_file(&temp_path) {
+        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
+        // We don't have async here nor do we want to pile on any extra errors.
+        if let Err(e) = std::fs::remove_file(&temp_path) {
+            if e.kind() != std::io::ErrorKind::NotFound {
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,12 +1,11 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
 use std::io::ErrorKind;
-use tokio::fs;
+use tokio::fs::{self, File};

 use super::Generation;
 use crate::{
@@ -41,11 +40,15 @@ pub(super) async fn upload_index_part<'a>(
        .to_s3_bytes()
        .context("serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
-    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
+    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
-        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
+            index_part_size,
+            &remote_path,
+        )
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
@@ -101,8 +104,10 @@ pub(super) async fn upload_timeline_layer<'a>(
    let fs_size = usize::try_from(fs_size)
        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;

+    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
+
    storage
-        .upload(source_file, fs_size, &storage_path, None)
+        .upload(reader, fs_size, &storage_path, None)
        .await
        .with_context(|| format!("upload layer from local path '{source_path}'"))?;

@@ -114,16 +119,16 @@ pub(crate) async fn upload_initdb_dir(
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-    initdb_dir: Bytes,
+    initdb_tar_zst: File,
+    size: u64,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading initdb dir");

-    let size = initdb_dir.len();
-    let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
+    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
    storage
-        .upload_storage_object(bytes, size, &remote_path)
+        .upload_storage_object(file, size as usize, &remote_path)
        .await
        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,7 +4,7 @@ pub mod delta_layer;
 mod filename;
 pub mod image_layer;
 mod inmemory_layer;
-mod layer;
+pub(crate) mod layer;
 mod layer_desc;

 use crate::context::{AccessStatsBehavior, RequestContext};
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -222,8 +222,8 @@ impl Layer {
    ///
    /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
    /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
-    pub(crate) fn garbage_collect_on_drop(&self) {
-        self.0.garbage_collect_on_drop();
+    pub(crate) fn delete_on_drop(&self) {
+        self.0.delete_on_drop();
    }

    /// Return data needed to reconstruct given page at LSN.
@@ -331,10 +331,10 @@ impl Layer {
        Ok(())
    }

-    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
+    /// Waits until this layer has been dropped (and if needed, local file deletion and remote
    /// deletion scheduling has completed).
    ///
-    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
+    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
    /// separatedly.
    #[cfg(feature = "testing")]
    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -423,8 +423,8 @@ struct LayerInner {
    /// Initialization and deinitialization are done while holding a permit.
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,

-    /// Do we want to garbage collect this when `LayerInner` is dropped
-    wanted_garbage_collected: AtomicBool,
+    /// Do we want to delete this locally and remotely when `LayerInner` is dropped
+    wanted_deleted: AtomicBool,

    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -438,10 +438,6 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
-    ///
-    /// If in future we need to implement "wait until layer instances are gone and done", carrying
-    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
-    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -483,14 +479,14 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        if !*self.wanted_garbage_collected.get_mut() {
+        if !*self.wanted_deleted.get_mut() {
            // should we try to evict if the last wish was for eviction?
            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
            // run drops during shutdown (yet)
            return;
        }

-        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().filename();
@@ -517,8 +513,8 @@ impl Drop for LayerInner {
                    false
                }
                Err(e) => {
-                    tracing::error!("failed to remove garbage collected layer: {e}");
-                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                    false
                }
            };
@@ -540,15 +536,15 @@ impl Drop for LayerInner {
                        } else {
                            tracing::warn!("scheduling deletion on drop failed: {e:#}");
                        }
-                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
+                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                    } else {
-                        LAYER_IMPL_METRICS.inc_completed_gcs();
+                        LAYER_IMPL_METRICS.inc_completed_deletes();
                    }
                }
            } else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
            }
        });
    }
@@ -583,7 +579,7 @@ impl LayerInner {
            timeline: Arc::downgrade(timeline),
            have_remote_client: timeline.remote_client.is_some(),
            access_stats,
-            wanted_garbage_collected: AtomicBool::new(false),
+            wanted_deleted: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -594,16 +590,13 @@ impl LayerInner {
        }
    }

-    fn garbage_collect_on_drop(&self) {
-        let res = self.wanted_garbage_collected.compare_exchange(
-            false,
-            true,
-            Ordering::Release,
-            Ordering::Relaxed,
-        );
+    fn delete_on_drop(&self) {
+        let res =
+            self.wanted_deleted
+                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

        if res.is_ok() {
-            LAYER_IMPL_METRICS.inc_started_gcs();
+            LAYER_IMPL_METRICS.inc_started_deletes();
        }
    }

@@ -671,6 +664,10 @@ impl LayerInner {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

+                // count cancellations, which currently remain largely unexpected
+                let init_cancelled =
+                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
                // no need to make the evict_and_wait wait for the actual download to complete
                drop(self.status.send(Status::Downloaded));

@@ -679,6 +676,8 @@ impl LayerInner {
                    .upgrade()
                    .ok_or_else(|| DownloadError::TimelineShutdown)?;

+                // FIXME: grab a gate
+
                let can_ever_evict = timeline.remote_client.as_ref().is_some();

                // check if we really need to be downloaded; could have been already downloaded by a
@@ -739,6 +738,8 @@ impl LayerInner {
                    tracing::info!(waiters, "completing the on-demand download for other tasks");
                }

+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+
                Ok((ResidentOrWantedEvicted::Resident(res), permit))
            };

@@ -836,7 +837,7 @@ impl LayerInner {
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(self.desc.tenant_shard_id.tenant_id),
+            Some(self.desc.tenant_shard_id),
            Some(self.desc.timeline_id),
            &task_name,
            false,
@@ -867,14 +868,13 @@ impl LayerInner {
                    match res {
                        (Ok(()), _) => {
                            // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded
-                            // or redownload.
+                            // else requests the layer, they'll find it already downloaded.
                            //
-                            // however, could be that we should consider marking the layer
-                            // for eviction? alas, cannot: because only DownloadedLayer
-                            // will handle that.
-                            tracing::info!("layer file download completed after requester had cancelled");
-                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
+                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
+                            //
+                            // FIXME(#6028): however, could be that we should consider marking the
+                            // layer for eviction? alas, cannot: because only DownloadedLayer will
+                            // handle that.
                        },
                        (Err(e), _) => {
                            // our caller is cancellation safe, but we might be racing with
@@ -994,12 +994,15 @@ impl LayerInner {

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
+        let delete = self.wanted_deleted.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;

-        if gc {
-            // do nothing now, only in LayerInner::drop
+        if delete {
+            // do nothing now, only in LayerInner::drop -- this was originally implemented because
+            // we could have already scheduled the deletion at the time.
+            //
+            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
        } else if can_evict && evict {
            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

@@ -1014,7 +1017,7 @@ impl LayerInner {
            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                let _g = span.entered();

-                // if LayerInner is already dropped here, do nothing because the garbage collection
+                // if LayerInner is already dropped here, do nothing because the delete on drop
                // has already ran while we were in queue
                let Some(this) = this.upgrade() else {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1405,35 +1408,37 @@ impl From<ResidentLayer> for Layer {
    }
 }

-use metrics::{IntCounter, IntCounterVec};
+use metrics::IntCounter;

-struct LayerImplMetrics {
+pub(crate) struct LayerImplMetrics {
    started_evictions: IntCounter,
    completed_evictions: IntCounter,
-    cancelled_evictions: IntCounterVec,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

-    started_gcs: IntCounter,
-    completed_gcs: IntCounter,
-    failed_gcs: IntCounterVec,
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

-    rare_counters: IntCounterVec,
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
 }

 impl Default for LayerImplMetrics {
    fn default() -> Self {
-        let evictions = metrics::register_int_counter_vec!(
-            "pageserver_layer_evictions_count",
-            "Evictions started and completed in the Layer implementation",
-            &["state"]
+        use enum_map::Enum;
+
+        // reminder: these will be pageserver_layer_* with "_total" suffix
+
+        let started_evictions = metrics::register_int_counter!(
+            "pageserver_layer_started_evictions",
+            "Evictions started in the Layer implementation"
+        )
+        .unwrap();
+        let completed_evictions = metrics::register_int_counter!(
+            "pageserver_layer_completed_evictions",
+            "Evictions completed in the Layer implementation"
        )
        .unwrap();
-
-        let started_evictions = evictions
-            .get_metric_with_label_values(&["started"])
-            .unwrap();
-        let completed_evictions = evictions
-            .get_metric_with_label_values(&["completed"])
-            .unwrap();

        let cancelled_evictions = metrics::register_int_counter_vec!(
            "pageserver_layer_cancelled_evictions_count",
@@ -1442,24 +1447,36 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

-        // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
-        let gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_gcs_count",
-            "Garbage collections started and completed in the Layer implementation",
-            &["state"]
+        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = EvictionCancelled::from_usize(i);
+            let s = reason.as_str();
+            cancelled_evictions.with_label_values(&[s])
+        }));
+
+        let started_deletes = metrics::register_int_counter!(
+            "pageserver_layer_started_deletes",
+            "Deletions on drop pending in the Layer implementation"
+        )
+        .unwrap();
+        let completed_deletes = metrics::register_int_counter!(
+            "pageserver_layer_completed_deletes",
+            "Deletions on drop completed in the Layer implementation"
        )
        .unwrap();

-        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
-        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
-
-        let failed_gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_gcs_count",
-            "Different reasons for garbage collections to have failed",
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
            &["reason"]
        )
        .unwrap();

+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
        let rare_counters = metrics::register_int_counter_vec!(
            "pageserver_layer_assumed_rare_count",
            "Times unexpected or assumed rare event happened",
@@ -1467,16 +1484,29 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
        Self {
            started_evictions,
            completed_evictions,
            cancelled_evictions,

-            started_gcs,
-            completed_gcs,
-            failed_gcs,
+            started_deletes,
+            completed_deletes,
+            failed_deletes,

            rare_counters,
+            inits_cancelled,
        }
    }
 }
@@ -1489,57 +1519,33 @@ impl LayerImplMetrics {
        self.completed_evictions.inc();
    }
    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc()
+        self.cancelled_evictions[reason].inc()
    }

-    fn inc_started_gcs(&self) {
-        self.started_gcs.inc();
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
    }
-    fn inc_completed_gcs(&self) {
-        self.completed_gcs.inc();
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
    }
-    fn inc_gcs_failed(&self, reason: GcFailed) {
-        self.failed_gcs
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc();
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
    }

-    /// Counted separatedly from failed gcs because we will complete the gc attempt regardless of
-    /// failure to delete local file.
-    fn inc_gc_removes_failed(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["gc_remove_failed"])
-            .unwrap()
-            .inc();
+    /// Counted separately from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    }

-    /// Expected rare because requires a race with `evict_blocking` and
-    /// `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["retried_gomd"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    }

-    /// Expected rare because cancellations are unexpected
-    fn inc_download_completed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_completed_without"])
-            .unwrap()
-            .inc();
-    }
-
-    /// Expected rare because cancellations are unexpected
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
    fn inc_download_failed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_failed_without"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    }

    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1547,37 +1553,30 @@ impl LayerImplMetrics {
    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    /// Option.
    fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["raced_wanted_evicted"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    }

-    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
    /// running with remote storage.
    fn inc_init_needed_no_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["init_needed_no_download"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
    }

    /// Expected rare because all layer files should be readable and good
    fn inc_permanent_loading_failures(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["permanent_loading_failure"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    }

    fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
    }
 }

+#[derive(enum_map::Enum)]
 enum EvictionCancelled {
    LayerGone,
    TimelineGone,
@@ -1606,19 +1605,47 @@ impl EvictionCancelled {
    }
 }

-enum GcFailed {
+#[derive(enum_map::Enum)]
+enum DeleteFailed {
    TimelineGone,
    DeleteSchedulingFailed,
 }

-impl GcFailed {
+impl DeleteFailed {
    fn as_str(&self) -> &'static str {
        match self {
-            GcFailed::TimelineGone => "timeline_gone",
-            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
 }

-static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+#[derive(enum_map::Enum)]
+enum RareEvent {
+    RemoveOnDropFailed,
+    RetriedGetOrMaybeDownload,
+    DownloadFailedWithoutRequester,
+    UpgradedWantedEvicted,
+    InitWithoutDownload,
+    PermanentLoadingFailure,
+    EvictAndWaitLagged,
+}
+
+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            RetriedGetOrMaybeDownload => "retried_gomd",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictAndWaitLagged => "broadcast_lagged",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -87,13 +87,13 @@ pub fn start_background_loops(
    tenant: &Arc<Tenant>,
    background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_id = tenant.tenant_shard_id.tenant_id;
+    let tenant_shard_id = tenant.tenant_shard_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
-        Some(tenant_id),
+        Some(tenant_shard_id),
        None,
-        &format!("compactor for tenant {tenant_id}"),
+        &format!("compactor for tenant {tenant_shard_id}"),
        false,
        {
            let tenant = Arc::clone(tenant);
@@ -105,7 +105,7 @@ pub fn start_background_loops(
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                compaction_loop(tenant, cancel)
-                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
@@ -114,9 +114,9 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
        None,
-        &format!("garbage collector for tenant {tenant_id}"),
+        &format!("garbage collector for tenant {tenant_shard_id}"),
        false,
        {
            let tenant = Arc::clone(tenant);
@@ -128,7 +128,7 @@ pub fn start_background_loops(
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                gc_loop(tenant, cancel)
-                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,7 +66,7 @@ use crate::metrics::{
    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
 use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
 use pageserver_api::reltag::RelTag;
@@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp;
 use utils::{
    completion,
    generation::Generation,
-    id::{TenantId, TimelineId},
+    id::TimelineId,
    lsn::{AtomicLsn, Lsn, RecordLsn},
    seqwait::SeqWait,
    simple_rcu::{Rcu, RcuReadGuard},
@@ -478,7 +478,7 @@ impl Timeline {
            .map(|ancestor| ancestor.timeline_id)
    }

-    /// Lock and get timeline's GC cuttof
+    /// Lock and get timeline's GC cutoff
    pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
        self.latest_gc_cutoff_lsn.read()
    }
@@ -926,7 +926,7 @@ impl Timeline {
        tracing::debug!("Waiting for WalReceiverManager...");
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
        )
        .await;
@@ -977,7 +977,7 @@ impl Timeline {
        // Shut down the layer flush task before the remote client, as one depends on the other
        task_mgr::shutdown_tasks(
            Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
        )
        .await;
@@ -995,12 +995,7 @@ impl Timeline {

        tracing::debug!("Waiting for tasks...");

-        task_mgr::shutdown_tasks(
-            None,
-            Some(self.tenant_shard_id.tenant_id),
-            Some(self.timeline_id),
-        )
-        .await;
+        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;

        // Finally wait until any gate-holders are complete
        self.gate.close().await;
@@ -1314,16 +1309,20 @@ impl Timeline {
                &self.conf.default_tenant_conf,
            );

-            // TODO(sharding): make evictions state shard aware
-            // (https://github.com/neondatabase/neon/issues/5953)
            let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());

            let timeline_id_str = self.timeline_id.to_string();
            self.metrics
                .evictions_with_low_residence_duration
                .write()
                .unwrap()
-                .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold);
+                .change_threshold(
+                    &tenant_id_str,
+                    &shard_id_str,
+                    &timeline_id_str,
+                    new_threshold,
+                );
        }
    }

@@ -1395,7 +1394,7 @@ impl Timeline {
                ancestor_lsn: metadata.ancestor_lsn(),

                metrics: TimelineMetrics::new(
-                    &tenant_shard_id.tenant_id,
+                    &tenant_shard_id,
                    &timeline_id,
                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
                        "mtime",
@@ -1496,7 +1495,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "layer flush task",
            false,
@@ -1847,7 +1846,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "initial size calculation",
            false,
@@ -2020,7 +2019,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "ondemand logical size calculation",
            false,
@@ -2279,7 +2278,7 @@ impl Timeline {
            }

            // Recurse into ancestor if needed
-            if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+            if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
                trace!(
                    "going into ancestor {}, cont_lsn is {}",
                    timeline.ancestor_lsn,
@@ -2461,13 +2460,7 @@ impl Timeline {
        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
        let (lsn, read_guard) = cache
-            .lookup_materialized_page(
-                self.tenant_shard_id.tenant_id,
-                self.timeline_id,
-                key,
-                lsn,
-                ctx,
-            )
+            .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
            .await?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
@@ -2524,7 +2517,7 @@ impl Timeline {
        Ok(())
    }

-    fn finish_write(&self, new_lsn: Lsn) {
+    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());

        self.metrics.last_record_gauge.set(new_lsn.0 as i64);
@@ -3209,7 +3202,7 @@ impl DurationRecorder {
 #[derive(Default)]
 struct CompactLevel0Phase1StatsBuilder {
    version: Option<u64>,
-    tenant_id: Option<TenantId>,
+    tenant_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
@@ -3226,7 +3219,7 @@ struct CompactLevel0Phase1StatsBuilder {
 #[derive(serde::Serialize)]
 struct CompactLevel0Phase1Stats {
    version: u64,
-    tenant_id: TenantId,
+    tenant_id: TenantShardId,
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -3745,7 +3738,7 @@ impl Timeline {
            let ctx = ctx.attached_child();
            let mut stats = CompactLevel0Phase1StatsBuilder {
                version: Some(2),
-                tenant_id: Some(self.tenant_shard_id.tenant_id),
+                tenant_id: Some(self.tenant_shard_id),
                timeline_id: Some(self.timeline_id),
                ..Default::default()
            };
@@ -3971,7 +3964,7 @@ impl Timeline {
        // for details. This will block until the old value is no longer in use.
        //
        // The GC cutoff should only ever move forwards.
-        {
+        let waitlist = {
            let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
            ensure!(
                *write_guard <= new_gc_cutoff,
@@ -3979,8 +3972,9 @@ impl Timeline {
                *write_guard,
                new_gc_cutoff
            );
-            write_guard.store_and_unlock(new_gc_cutoff).wait();
-        }
+            write_guard.store_and_unlock(new_gc_cutoff)
+        };
+        waitlist.wait().await;

        info!("GC starting");

@@ -4206,7 +4200,7 @@ impl Timeline {
                    let cache = page_cache::get();
                    if let Err(e) = cache
                        .memorize_materialized_page(
-                            self.tenant_shard_id.tenant_id,
+                            self.tenant_shard_id,
                            self.timeline_id,
                            key,
                            last_rec_lsn,
@@ -4250,7 +4244,7 @@ impl Timeline {
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "download all remote layers task",
            false,
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    // Shut down the layer flush task before the remote client, as one depends on the other
    task_mgr::shutdown_tasks(
        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;
@@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    info!("waiting for timeline tasks to shutdown");
    task_mgr::shutdown_tasks(
        None,
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;
@@ -528,7 +528,7 @@ impl DeleteTimelineFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
            false,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            &format!(
                "layer eviction for {}/{}",
@@ -343,7 +343,7 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -243,7 +243,7 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-        layer.garbage_collect_on_drop();
+        layer.delete_on_drop();
    }

    pub(crate) fn contains(&self, layer: &Layer) -> bool {
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

+use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
@@ -41,7 +42,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::id::TenantTimelineId;
+use utils::id::TimelineId;

 use self::connection_manager::ConnectionManagerStatus;

@@ -60,7 +61,8 @@ pub struct WalReceiverConf {
 }

 pub struct WalReceiver {
-    timeline: TenantTimelineId,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

@@ -71,7 +73,7 @@ impl WalReceiver {
        mut broker_client: BrokerClientChannel,
        ctx: &RequestContext,
    ) -> Self {
-        let tenant_id = timeline.tenant_shard_id.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -81,9 +83,9 @@ impl WalReceiver {
        task_mgr::spawn(
            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
-            Some(tenant_id),
+            Some(timeline.tenant_shard_id),
            Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
            false,
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
@@ -117,11 +119,12 @@ impl WalReceiver {
                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
        );

        Self {
-            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            tenant_shard_id,
+            timeline_id,
            manager_status,
        }
    }
@@ -129,8 +132,8 @@ impl WalReceiver {
    pub async fn stop(self) {
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
-            Some(self.timeline.tenant_id),
-            Some(self.timeline.timeline_id),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
        )
        .await;
    }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
    task_mgr::spawn(
        WALRECEIVER_RUNTIME.handle(),
        TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
        "walreceiver connection",
        false,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -288,6 +288,9 @@ impl VirtualFile {
        }
        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

+        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
+        // where our caller doesn't get to use the returned VirtualFile before its
+        // slot gets re-used by someone else.
        let file = STORAGE_IO_TIME_METRIC
            .get(StorageIoOperation::Open)
            .observe_closure_duration(|| open_options.open(path))?;
@@ -311,6 +314,9 @@ impl VirtualFile {
            timeline_id,
        };

+        // TODO: Under pressure, it's likely the slot will get re-used and
+        // the underlying file closed before they get around to using it.
+        // => https://github.com/neondatabase/neon/issues/6065
        slot_guard.file.replace(file);

        Ok(vfile)
@@ -421,9 +427,12 @@ impl VirtualFile {
        // now locked in write-mode. Find a free slot to put it in.
        let (handle, mut slot_guard) = open_files.find_victim_slot();

-        // Open the physical file
+        // Re-open the physical file.
+        // NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this
+        // case from StorageIoOperation::Open. This helps with identifying thrashing
+        // of the virtual file descriptor cache.
        let file = STORAGE_IO_TIME_METRIC
-            .get(StorageIoOperation::Open)
+            .get(StorageIoOperation::OpenAfterReplace)
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
@@ -610,9 +619,11 @@ impl Drop for VirtualFile {
            slot.recently_used.store(false, Ordering::Relaxed);
            // there is also operation "close-by-replace" for closes done on eviction for
            // comparison.
-            STORAGE_IO_TIME_METRIC
-                .get(StorageIoOperation::Close)
-                .observe_closure_duration(|| drop(slot_guard.file.take()));
+            if let Some(fd) = slot_guard.file.take() {
+                STORAGE_IO_TIME_METRIC
+                    .get(StorageIoOperation::Close)
+                    .observe_closure_duration(|| drop(fd));
+            }
        }
    }
 }
@@ -643,6 +654,7 @@ pub fn init(num_slots: usize) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
+    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

 const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -372,10 +372,11 @@ impl<'a> WalIngest<'a> {
            let key_is_local = self.shard.is_key_local(&key);

            tracing::debug!(
-                "ingest: shard decision {} (checkpoint={}) for key {}",
+                lsn=%lsn,
+                key=%key,
+                "ingest: shard decision {} (checkpoint={})",
                if !key_is_local { "drop" } else { "keep" },
-                self.checkpoint_modified,
-                key
+                self.checkpoint_modified
            );

            if !key_is_local {
@@ -402,13 +403,14 @@ impl<'a> WalIngest<'a> {
        if modification.is_empty() {
            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
            WAL_INGEST.records_filtered.inc();
-            return Ok(());
+            modification.tline.finish_write(lsn);
+        } else {
+            WAL_INGEST.records_committed.inc();
+            modification.commit(ctx).await?;
        }

        // Now that this record has been fully handled, including updating the
-        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        WAL_INGEST.records_committed.inc();
-        modification.commit(ctx).await?;
+        // checkpoint data, let the repository know that it is up-to-date to this LSN.

        Ok(())
    }
@@ -456,8 +458,10 @@ impl<'a> WalIngest<'a> {
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
            && (decoded.xl_info == pg_constants::XLOG_FPI
                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-        // compression of WAL is not yet supported: fall back to storing the original WAL record
+            // compression of WAL is not yet supported: fall back to storing the original WAL record
            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
+            // do not materialize null pages because they will most likely soon be replaced with real data
+            && blk.bimg_len != 0
        {
            // Extract page image from FPI record
            let img_len = blk.bimg_len as usize;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -41,6 +41,17 @@ libwalproposer.a: $(WALPROP_OBJS)
 	rm -f $@
 	$(AR) $(AROPT) $@ $^

+# needs vars:
+# FIND_TYPEDEF pointing to find_typedef
+# INDENT pointing to pg_bsd_indent
+# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
+#   pgindent will pick it up as pg_bsd_indent path).
+.PHONY: pgindent
+pgindent:
+	+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
+	$(FIND_TYPEDEF) . > neon.typedefs
+	INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
+
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;

 /* Curl structures for sending the HTTP requests */
-static CURL * CurlHandle;
+static CURL *CurlHandle;
 static struct curl_slist *ContentHeader = NULL;

 /*
@@ -54,7 +54,7 @@ typedef enum
 {
 	Op_Set,						/* An upsert: Either a creation or an alter */
 	Op_Delete,
-}			OpType;
+} OpType;

 typedef struct
 {
@@ -62,7 +62,7 @@ typedef struct
 	Oid			owner;
 	char		old_name[NAMEDATALEN];
 	OpType		type;
-}			DbEntry;
+} DbEntry;

 typedef struct
 {
@@ -70,7 +70,7 @@ typedef struct
 	char		old_name[NAMEDATALEN];
 	const char *password;
 	OpType		type;
-}			RoleEntry;
+} RoleEntry;

 /*
 * We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
 	struct DdlHashTable *prev_table;
 	HTAB	   *db_table;
 	HTAB	   *role_table;
-}			DdlHashTable;
+} DdlHashTable;

 static DdlHashTable RootTable;
-static DdlHashTable * CurrentDdlTable = &RootTable;
+static DdlHashTable *CurrentDdlTable = &RootTable;

 static void
 PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
 {
 	char		str[ERROR_SIZE];
 	size_t		size;
-}			ErrorString;
+} ErrorString;

 static size_t
 ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
@@ -478,7 +478,7 @@ NeonXactCallback(XactEvent event, void *arg)
 static bool
 RoleIsNeonSuperuser(const char *role_name)
 {
-    return strcmp(role_name, "neon_superuser") == 0;
+	return strcmp(role_name, "neon_superuser") == 0;
 }

 static void
@@ -509,6 +509,7 @@ HandleCreateDb(CreatedbStmt *stmt)
 	if (downer && downer->arg)
 	{
 		const char *owner_name = defGetString(downer);
+
 		if (RoleIsNeonSuperuser(owner_name))
 			elog(ERROR, "can't create a database with owner neon_superuser");
 		entry->owner = get_role_oid(owner_name, false);
@@ -536,6 +537,7 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 	const char *new_owner = get_rolespec_name(stmt->newowner);
+
 	if (RoleIsNeonSuperuser(new_owner))
 		elog(ERROR, "can't alter owner to neon_superuser");
 	entry->owner = get_role_oid(new_owner, false);
@@ -633,6 +635,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	DefElem    *dpass = NULL;
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;
+
 	if (RoleIsNeonSuperuser(role_name))
 		elog(ERROR, "can't ALTER neon_superuser");

--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -25,79 +25,81 @@

 #include <curl/curl.h>

-static int extension_server_port = 0;
+static int	extension_server_port = 0;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

-// to download all SQL (and data) files for an extension:
-// curl -X POST http://localhost:8080/extension_server/postgis
-// it covers two possible extension files layouts:
-// 1. extension_name--version--platform.sql
-// 2. extension_name/extension_name--version.sql
-//    extension_name/extra_files.csv
-//
-// to download specific library file:
-// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+/*
+  * to download all SQL (and data) files for an extension:
+  * curl -X POST http://localhost:8080/extension_server/postgis
+  * it covers two possible extension files layouts:
+  * 1. extension_name--version--platform.sql
+  * 2. extension_name/extension_name--version.sql
+  *    extension_name/extra_files.csv
+  * to download specific library file:
+  * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+  */
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-    CURL *curl;
-    CURLcode res;
-    char *compute_ctl_url;
-    char *postdata;
-    bool ret = false;
+	CURL	   *curl;
+	CURLcode	res;
+	char	   *compute_ctl_url;
+	char	   *postdata;
+	bool		ret = false;

-    if ((curl = curl_easy_init()) == NULL)
-    {
-        elog(ERROR, "Failed to initialize curl handle");
-    }
+	if ((curl = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}

-    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+							   extension_server_port, filename, is_library ? "?is_library=true" : "");

-    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

-    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
+	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );

-    if (curl)
-    {
-        /* Perform the request, res will get the return code */
-        res = curl_easy_perform(curl);
-        /* Check for errors */
-        if (res == CURLE_OK)
-        {
-            ret = true;
-        }
-        else
-        {
-            // Don't error here because postgres will try to find the file
-            // and will fail with some proper error message if it's not found.
-            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-        }
+	if (curl)
+	{
+		/* Perform the request, res will get the return code */
+		res = curl_easy_perform(curl);
+		/* Check for errors */
+		if (res == CURLE_OK)
+		{
+			ret = true;
+		}
+		else
+		{
+			/* Don't error here because postgres will try to find the file */
+			/* and will fail with some proper error message if it's not found. */
+			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
+		}

-        /* always cleanup */
-        curl_easy_cleanup(curl);
-    }
+		/* always cleanup */
+		curl_easy_cleanup(curl);
+	}

-    return ret;
+	return ret;
 }

-void pg_init_extension_server()
+void
+pg_init_extension_server()
 {
-    // Port to connect to compute_ctl on localhost
-    // to request extension files.
-    DefineCustomIntVariable("neon.extension_server_port",
-                            "connection string to the compute_ctl",
-                            NULL,
-                            &extension_server_port,
-                            0, 0, INT_MAX,
-                            PGC_POSTMASTER,
-                            0, /* no flags required */
-                            NULL, NULL, NULL);
+	/* Port to connect to compute_ctl on localhost */
+	/* to request extension files. */
+	DefineCustomIntVariable("neon.extension_server_port",
+							"connection string to the compute_ctl",
+							NULL,
+							&extension_server_port,
+							0, 0, INT_MAX,
+							PGC_POSTMASTER,
+							0,	/* no flags required */
+							NULL, NULL, NULL);

-    // set download_extension_file_hook
-    prev_download_extension_file_hook = download_extension_file_hook;
-    download_extension_file_hook = neon_download_extension_file_http;
+	/* set download_extension_file_hook */
+	prev_download_extension_file_hook = download_extension_file_hook;
+	download_extension_file_hook = neon_download_extension_file_http;
 }
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -67,32 +67,34 @@
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
-	uint32      hash;
+	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK/32];
-	dlist_node	lru_node; /* LRU list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
+	dlist_node	lru_node;		/* LRU list node */
 } FileCacheEntry;

 typedef struct FileCacheControl
 {
-	uint64 generation; /* generation is needed to handle correct hash reenabling */
-	uint32 size; /* size of cache file in chunks */
-	uint32 used; /* number of used chunks */
-	uint32 limit; /* shared copy of lfc_size_limit */
-	uint64 hits;
-	uint64 misses;
-	uint64 writes;
-	dlist_head lru; /* double linked list for LRU replacement algorithm */
+	uint64		generation;		/* generation is needed to handle correct hash
+								 * reenabling */
+	uint32		size;			/* size of cache file in chunks */
+	uint32		used;			/* number of used chunks */
+	uint32		limit;			/* shared copy of lfc_size_limit */
+	uint64		hits;
+	uint64		misses;
+	uint64		writes;
+	dlist_head	lru;			/* double linked list for LRU replacement
+								 * algorithm */
 } FileCacheControl;

-static HTAB* lfc_hash;
-static int   lfc_desc = 0;
+static HTAB *lfc_hash;
+static int	lfc_desc = 0;
 static LWLockId lfc_lock;
-static int   lfc_max_size;
-static int   lfc_size_limit;
-static char* lfc_path;
-static  FileCacheControl* lfc_ctl;
+static int	lfc_max_size;
+static int	lfc_size_limit;
+static char *lfc_path;
+static FileCacheControl *lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
@@ -100,7 +102,7 @@ static shmem_request_hook_type prev_shmem_request_hook;

 #define LFC_ENABLED() (lfc_ctl->limit != 0)

-void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
+void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);

 /*
 * Local file cache is optional and Neon can work without it.
@@ -109,9 +111,10 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
 * All cache content should be invalidated to avoid reading of stale or corrupted data
 */
 static void
-lfc_disable(char const* op)
+lfc_disable(char const *op)
 {
-	int fd;
+	int			fd;
+
 	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

 	/* Invalidate hash */
@@ -120,7 +123,7 @@ lfc_disable(char const* op)
 	if (LFC_ENABLED())
 	{
 		HASH_SEQ_STATUS status;
-		FileCacheEntry* entry;
+		FileCacheEntry *entry;

 		hash_seq_init(&status, lfc_hash);
 		while ((entry = hash_seq_search(&status)) != NULL)
@@ -135,16 +138,24 @@ lfc_disable(char const* op)

 		if (lfc_desc > 0)
 		{
-			/* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
-			int rc = ftruncate(lfc_desc, 0);
+			/*
+			 * If the reason of error is ENOSPC, then truncation of file may
+			 * help to reclaim some space
+			 */
+			int			rc = ftruncate(lfc_desc, 0);
+
 			if (rc < 0)
 				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
 		}
 	}
-	/* We need to use unlink to to avoid races in LFC write, because it is not protectedby */
+
+	/*
+	 * We need to use unlink to avoid races in LFC write, because it is not
+	 * protected by a lock (NOTE: original comment was truncated here; confirm)
+	 */
 	unlink(lfc_path);

-	fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
+	fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 	if (fd < 0)
 		elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
 	else
@@ -170,13 +181,15 @@ lfc_maybe_disabled(void)
 static bool
 lfc_ensure_opened(void)
 {
-	bool enabled = !lfc_maybe_disabled();
+	bool		enabled = !lfc_maybe_disabled();
+
 	/* Open cache file if not done yet */
 	if (lfc_desc <= 0 && enabled)
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR);

-		if (lfc_desc < 0) {
+		if (lfc_desc < 0)
+		{
 			lfc_disable("open");
 			return false;
 		}
@@ -187,7 +200,7 @@ lfc_ensure_opened(void)
 static void
 lfc_shmem_startup(void)
 {
-	bool found;
+	bool		found;
 	static HASHCTL info;

 	if (prev_shmem_startup_hook)
@@ -197,17 +210,22 @@ lfc_shmem_startup(void)

 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

-	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
 	{
-		int fd;
-		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
-		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
+		int			fd;
+		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+
+		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
+
+		/*
+		 * lfc_size+1 because we add new element to hash table before eviction
+		 * of victim
+		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
-								 lfc_size+1, lfc_size+1,
+								 lfc_size + 1, lfc_size + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -219,7 +237,7 @@ lfc_shmem_startup(void)
 		dlist_init(&lfc_ctl->lru);

 		/* Recreate file cache on restart */
-		fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
+		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
 		{
 			elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
@@ -242,7 +260,7 @@ lfc_shmem_request(void)
 		prev_shmem_request_hook();
 #endif

-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
 	RequestNamedLWLockTranche("lfc_lock", 1);
 }

@@ -250,9 +268,11 @@ static bool
 is_normal_backend(void)
 {
 	/*
-	 * Stats collector detach shared memory, so we should not try to access shared memory here.
-	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
-	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
+	 * The stats collector detaches shared memory, so we should not try to
+	 * access shared memory here. Parallel workers first assign the default
+	 * value (0), so do not perform truncation in parallel workers. The
+	 * postmaster can handle SIGHUP and it has access to shared memory
+	 * (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
 	return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
 }
@@ -271,7 +291,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 static void
 lfc_change_limit_hook(int newval, void *extra)
 {
-	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
+	uint32		new_size = SIZE_MB_TO_CHUNKS(newval);

 	if (!is_normal_backend())
 		return;
@@ -283,18 +303,22 @@ lfc_change_limit_hook(int newval, void *extra)

 	while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
 	{
-		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
-		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		/*
+		 * Shrink cache by throwing away least recently accessed chunks and
+		 * returning their space to file system
+		 */
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+
 		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
-		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
-			elog(LOG, "Failed to punch hole in file: %m");
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
+			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
-	elog(DEBUG1, "set local file cache limit to %d", new_size);
+	neon_log(DEBUG1, "set local file cache limit to %d", new_size);

 	LWLockRelease(lfc_lock);
 }
@@ -307,14 +331,14 @@ lfc_init(void)
 	 * shared_preload_libraries.
 	 */
 	if (!process_shared_preload_libraries_in_progress)
-		elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
+		neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");


 	DefineCustomIntVariable("neon.max_file_cache_size",
 							"Maximal size of Neon local file cache",
 							NULL,
 							&lfc_max_size,
-							0, /* disabled by default */
+							0,	/* disabled by default */
 							0,
 							INT_MAX,
 							PGC_POSTMASTER,
@@ -327,7 +351,7 @@ lfc_init(void)
 							"Current limit for size of Neon local file cache",
 							NULL,
 							&lfc_size_limit,
-							0, /* disabled by default */
+							0,	/* disabled by default */
 							0,
 							INT_MAX,
 							PGC_SIGHUP,
@@ -367,18 +391,18 @@ lfc_init(void)
 bool
 lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	bool found = false;
-	uint32 hash;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	bool		found = false;
+	uint32		hash;

-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -397,13 +421,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 void
 lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	bool found;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	uint32 hash;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	bool		found;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	uint32		hash;

-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -438,9 +462,10 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	 */
 	if (entry->bitmap[chunk_offs >> 5] == 0)
 	{
-		bool has_remaining_pages;
+		bool		has_remaining_pages = false;	/* stays false when no bitmap word is set */

-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
+		{
 			if (entry->bitmap[i] != 0)
 			{
 				has_remaining_pages = true;
@@ -449,8 +474,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		}

 		/*
-		 * Put the entry at the position that is first to be reclaimed when
-		 * we have no cached pages remaining in the chunk
+		 * Put the entry at the position that is first to be reclaimed when we
+		 * have no cached pages remaining in the chunk
 		 */
 		if (!has_remaining_pages)
 		{
@@ -476,16 +501,16 @@ bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 char *buffer)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	ssize_t rc;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	bool result = true;
-	uint32 hash;
-	uint64 generation;
-	uint32 entry_offset;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	ssize_t		rc;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	bool		result = true;
+	uint32		hash;
+	uint64		generation;
+	uint32		entry_offset;

-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

 	if (!lfc_ensure_opened())
@@ -493,7 +518,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -520,7 +545,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	LWLockRelease(lfc_lock);

-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("read");
@@ -551,30 +576,29 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
 void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-		  char *buffer)
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
-		  const void *buffer)
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
 #endif
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	ssize_t rc;
-	bool found;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	uint32 hash;
-	uint64 generation;
-	uint32 entry_offset;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	ssize_t		rc;
+	bool		found;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	uint32		hash;
+	uint64		generation;
+	uint32		entry_offset;

-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

 	if (!lfc_ensure_opened())
 		return;

 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	hash = get_hash_value(lfc_hash, &tag);

@@ -590,33 +614,46 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	if (found)
 	{
-		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		/*
+		 * Unlink entry from LRU list to pin it for the duration of IO
+		 * operation
+		 */
 		if (entry->access_count++ == 0)
 			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
 		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
-		 * 1. Wait until some of this operation is completed and pages is unpinned
-		 * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
-		 * As far as probability of such event (that all pages are pinned) is considered to be very very small:
-		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
-		 * we prefer not to complicate code and use second approach.
+		 * We have two choices if all cache pages are pinned (i.e. used in IO
+		 * operations):
+		 *
+		 * 1) Wait until some of these operations complete and pages are
+		 * unpinned.
+		 *
+		 * 2) Allocate one more chunk, so that the specified cache size is
+		 * more of a recommendation than a hard limit.
+		 *
+		 * The probability of such an event (all pages being pinned) is
+		 * considered to be very small: it would require a very large number
+		 * of concurrent IO operations, and those are limited by
+		 * max_connections. So we prefer not to complicate the code and use
+		 * the second approach.
 		 */
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-			elog(DEBUG2, "Swap file cache page");
+			neon_log(DEBUG2, "Swap file cache page");
 		}
 		else
 		{
 			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
+			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
+												 * of file */
 		}
 		entry->access_count = 1;
 		entry->hash = hash;
@@ -628,7 +665,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	lfc_ctl->writes += 1;
 	LWLockRelease(lfc_lock);

-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("write");
@@ -665,13 +702,13 @@ Datum
 neon_get_lfc_stats(PG_FUNCTION_ARGS)
 {
 	FuncCallContext *funcctx;
-	NeonGetStatsCtx* fctx;
+	NeonGetStatsCtx *fctx;
 	MemoryContext oldcontext;
 	TupleDesc	tupledesc;
 	Datum		result;
 	HeapTuple	tuple;
-	char const* key;
-	uint64      value;
+	char const *key;
+	uint64		value;
 	Datum		values[NUM_NEON_GET_STATS_COLS];
 	bool		nulls[NUM_NEON_GET_STATS_COLS];

@@ -683,7 +720,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

 		/* Create a user function context for cross-call persistence */
-		fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));
+		fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));

 		/* Construct a tuple descriptor for the result rows. */
 		tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
@@ -704,7 +741,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 	funcctx = SRF_PERCALL_SETUP();

 	/* Get the saved state */
-	fctx = (NeonGetStatsCtx*) funcctx->user_fctx;
+	fctx = (NeonGetStatsCtx *) funcctx->user_fctx;

 	switch (funcctx->call_cntr)
 	{
@@ -792,9 +829,9 @@ local_cache_pages(PG_FUNCTION_ARGS)

 	if (SRF_IS_FIRSTCALL())
 	{
-        HASH_SEQ_STATUS status;
-		FileCacheEntry* entry;
-		uint32 n_pages = 0;
+		HASH_SEQ_STATUS status;
+		FileCacheEntry *entry;
+		uint32		n_pages = 0;

 		funcctx = SRF_FIRSTCALL_INIT();

@@ -813,10 +850,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		 * wrong) function definition though.
 		 */
 		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
-			elog(ERROR, "return type must be a row type");
+			neon_log(ERROR, "return type must be a row type");

 		if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
-			elog(ERROR, "incorrect number of output arguments");
+			neon_log(ERROR, "incorrect number of output arguments");

 		/* Construct a tuple descriptor for the result rows. */
 		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
@@ -851,7 +888,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
+					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
@@ -870,10 +907,11 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		if (n_pages != 0)
 		{
 			/*
-			 * Scan through all the cache entries, saving the relevant fields in the
-			 * fctx->record structure.
+			 * Scan through all the cache entries, saving the relevant fields
+			 * in the fctx->record structure.
 			 */
-			uint32 n = 0;
+			uint32		n = 0;
+
 			hash_seq_init(&status, lfc_hash);
 			while ((entry = hash_seq_search(&status)) != NULL)
 			{
@@ -881,7 +919,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				{
 					if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 					{
-						fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
+						fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
 						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -165,7 +165,7 @@ load_shard_map(shardno_t shard_no, char* connstr)

 		n_shards = shard_map->n_shards;
 		if (shard_no >= n_shards)
-			elog(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);
+			neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);

 		if (connstr)
 			strncpy(connstr, shard_map->shard_connstr[shard_no], MAX_PS_CONNSTR_LEN);
@@ -175,17 +175,18 @@ load_shard_map(shardno_t shard_no, char* connstr)
 		   || begin_update_counter != pg_atomic_read_u64(&shard_map->begin_update_counter)
 		   || end_update_counter != pg_atomic_read_u64(&shard_map->end_update_counter));

+
 	if (shard_map_update_counter != end_update_counter)
- 	{
+	{
 		/* Reset all connections if connection strings are changed */
- 		for (shardno_t i = 0; i < max_attached_shard_no; i++)
- 		{
- 			if (page_servers[i].conn)
- 				pageserver_disconnect(i);
- 		}
+		for (shardno_t i = 0; i < max_attached_shard_no; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
 		max_attached_shard_no = 0;
 		shard_map_update_counter = end_update_counter;
-    }
+	}

 	return n_shards;
 }
@@ -230,11 +231,10 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
 	 * variable was set, use that as the password.
 	 *
-	 * The connection options are parsed in the order they're given, so
-	 * when we set the password before the connection string, the
-	 * connection string can override the password from the env variable.
-	 * Seems useful, although we don't currently use that capability
-	 * anywhere.
+	 * The connection options are parsed in the order they're given, so when
+	 * we set the password before the connection string, the connection string
+	 * can override the password from the env variable. Seems useful, although
+	 * we don't currently use that capability anywhere.
 	 */
 	n = 0;
 	if (neon_auth_token)
@@ -259,7 +259,7 @@ pageserver_connect(shardno_t shard_no, int elevel)

 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
 		return false;
 	}
@@ -268,15 +268,15 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	if (ret != 1)
 	{
 		PQfinish(conn);
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}

 	wes = CreateWaitEventSet(TopMemoryContext, 3);
 	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
-			  MyLatch, NULL);
+					  MyLatch, NULL);
 	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-			  NULL, NULL);
+					  NULL, NULL);
 	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

 	while (PQisBusy(conn))
@@ -299,14 +299,14 @@ pageserver_connect(shardno_t shard_no, int elevel)
 				PQfinish(conn);
 				FreeWaitEventSet(wes);

-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
+				neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+							   msg);
 				return false;
 			}
 		}
 	}

-	neon_log(LOG, "libpagestore: connected to '%s'", connstr);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
 	page_servers[shard_no].conn = conn;
 	page_servers[shard_no].wes = wes;
 	max_attached_shard_no = Max(shard_no+1, max_attached_shard_no);
@@ -341,7 +341,8 @@ retry:
 			if (!PQconsumeInput(pageserver_conn))
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -366,7 +367,7 @@ pageserver_disconnect(shardno_t shard_no)
 	 */
 	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
 		PQfinish(page_servers[shard_no].conn);
 		page_servers[shard_no].conn = NULL;

@@ -380,7 +381,7 @@ pageserver_disconnect(shardno_t shard_no)
 }

 static bool
-pageserver_send(shardno_t shard_no, NeonRequest * request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
 	PGconn* pageserver_conn = page_servers[shard_no].conn;
@@ -388,7 +389,7 @@ pageserver_send(shardno_t shard_no, NeonRequest * request)
 	/* If the connection was lost for some reason, reconnect */
 	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
 	}

@@ -396,10 +397,12 @@ pageserver_send(shardno_t shard_no, NeonRequest * request)

 	/*
 	 * If pageserver is stopped, the connections from compute node are broken.
-	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
-	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
-	 * See https://github.com/neondatabase/neon/issues/1138
-	 * So try to reestablish connection in case of failure.
+	 * The compute node doesn't notice that immediately, but it will cause the
+	 * next request to fail, usually on the next query. That causes
+	 * user-visible errors if pageserver is restarted, or the tenant is moved
+	 * from one pageserver to another. See
+	 * https://github.com/neondatabase/neon/issues/1138
+	 * So try to reestablish the connection in case of failure.
 	 */
 	if (!page_servers[shard_no].conn)
 	{
@@ -426,7 +429,7 @@ pageserver_send(shardno_t shard_no, NeonRequest * request)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 		pageserver_disconnect(shard_no);
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -438,7 +441,7 @@ pageserver_send(shardno_t shard_no, NeonRequest * request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);

-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
@@ -470,31 +473,32 @@ pageserver_receive(shardno_t shard_no)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);

-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
-			char* msg = pchomp(PQerrorMessage(pageserver_conn));
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 			pageserver_disconnect(shard_no);
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
 			pageserver_disconnect(shard_no);
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
@@ -510,15 +514,16 @@ pageserver_flush(shardno_t shard_no)
 	PGconn* pageserver_conn = page_servers[shard_no].conn;
 	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 			pageserver_disconnect(shard_no);
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -548,8 +553,8 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	 * Load shard map only at Postmaster.
 	 * If old page server is not available, then backends can be blocked in attempts to reconnect to it and do not reload config in this loop
 	 */
-	if (shard_map != NULL && (MyProcPid == PostmasterPid || shard_map->n_shards == 0))
- 	{
+	if (shard_map != NULL && UsedShmemSegAddr != NULL && (MyProcPid == PostmasterPid || shard_map->n_shards == 0))
+	{
 		char const* shard_connstr = newval;
 		char const* sep;
 		size_t connstr_len;
@@ -563,12 +568,12 @@ AssignPageserverConnstring(const char *newval, void *extra)
 				break; /* trailing comma */
 			if (i >= MAX_SHARDS)
 			{
-				elog(LOG, "Too many shards");
+				neon_log(LOG, "Too many shards");
 				return;
 			}
 			if (connstr_len >= MAX_PS_CONNSTR_LEN)
 			{
-				elog(LOG, "Connection  string too long");
+				neon_log(LOG, "Connection  string too long");
 				return;
 			}
 			if (i >= shard_map->n_shards ||
@@ -587,7 +592,7 @@ AssignPageserverConnstring(const char *newval, void *extra)

 		if (i == 0)
 		{
-			elog(LOG, "No shards were specified");
+			neon_log(LOG, "No shards were specified");
 			return;
 		}
 		if (shard_map_changed)
@@ -686,7 +691,10 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;

-	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
+	/*
+	 * Retrieve the auth token to use when connecting to pageserver and
+	 * safekeepers
+	 */
 	neon_auth_token = getenv("NEON_AUTH_TOKEN");
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -48,9 +48,11 @@ _PG_init(void)

 	pg_init_extension_server();

-	// Important: This must happen after other parts of the extension
-	// are loaded, otherwise any settings to GUCs that were set before
-	// the extension was loaded will be removed.
+	/*
+	 * Important: This must happen after other parts of the extension are
+	 * loaded, otherwise any settings to GUCs that were set before the
+	 * extension was loaded will be removed.
+	 */
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
 * block_id; false otherwise.
 */
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -59,7 +59,7 @@

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers

-#else /* major version >= 16 */
+#else							/* major version >= 16 */

 #define USE_RELFILELOCATOR

@@ -109,4 +109,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

-#endif //NEON_PGVERSIONCOMPAT_H
+#endif							/* NEON_PGVERSIONCOMPAT_H */
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -16,6 +16,7 @@
 #include "postgres.h"
 #include "neon_pgversioncompat.h"

+#include "access/slru.h"
 #include "access/xlogdefs.h"
 #include RELFILEINFO_HDR
 #include "storage/block.h"
@@ -37,6 +38,7 @@ typedef enum
 	T_NeonNblocksRequest,
 	T_NeonGetPageRequest,
 	T_NeonDbSizeRequest,
+	T_NeonGetSlruSegmentRequest,

 	/* pagestore -> pagestore_client */
 	T_NeonExistsResponse = 100,
@@ -44,13 +46,14 @@ typedef enum
 	T_NeonGetPageResponse,
 	T_NeonErrorResponse,
 	T_NeonDbSizeResponse,
-}			NeonMessageTag;
+	T_NeonGetSlruSegmentResponse,
+} NeonMessageTag;

 /* base struct for c-style inheritance */
 typedef struct
 {
 	NeonMessageTag tag;
-}			NeonMessage;
+} NeonMessage;

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

@@ -58,6 +61,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
@@ -71,27 +77,27 @@ typedef struct
 	NeonMessageTag tag;
 	bool		latest;			/* if true, request latest page version */
 	XLogRecPtr	lsn;			/* request page version @ this LSN */
-}			NeonRequest;
+} NeonRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-}			NeonExistsRequest;
+} NeonExistsRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-}			NeonNblocksRequest;
+} NeonNblocksRequest;

 typedef struct
 {
 	NeonRequest req;
 	Oid			dbNode;
-}			NeonDbSizeRequest;
+} NeonDbSizeRequest;

 typedef struct
 {
@@ -99,31 +105,38 @@ typedef struct
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber blkno;
-}			NeonGetPageRequest;
+} NeonGetPageRequest;
+
+typedef struct
+{
+	NeonRequest req;			/* common request header: tag, latest, lsn */
+	SlruKind kind;				/* kind of SLRU being requested */
+	int      segno;				/* SLRU segment number being requested */
+} NeonGetSlruSegmentRequest;

 /* supertype of all the Neon*Response structs below */
 typedef struct
 {
 	NeonMessageTag tag;
-}			NeonResponse;
+} NeonResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	bool		exists;
-}			NeonExistsResponse;
+} NeonExistsResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	uint32		n_blocks;
-}			NeonNblocksResponse;
+} NeonNblocksResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		page[FLEXIBLE_ARRAY_MEMBER];
-}			NeonGetPageResponse;
+} NeonGetPageResponse;

 #define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))

@@ -131,18 +144,26 @@ typedef struct
 {
 	NeonMessageTag tag;
 	int64		db_size;
-}			NeonDbSizeResponse;
+} NeonDbSizeResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
 												 * message */
-}			NeonErrorResponse;
+} NeonErrorResponse;

-extern StringInfoData nm_pack_request(NeonRequest * msg);
-extern NeonResponse * nm_unpack_response(StringInfo s);
-extern char *nm_to_string(NeonMessage * msg);
+typedef struct
+{
+	NeonMessageTag tag;			/* response message tag */
+	int         n_blocks;		/* presumably number of valid blocks in data -- TODO confirm */
+	char		data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];	/* sized for one full SLRU segment */
+} NeonGetSlruSegmentResponse;
+
+
+extern StringInfoData nm_pack_request(NeonRequest *msg);
+extern NeonResponse *nm_unpack_response(StringInfo s);
+extern char *nm_to_string(NeonMessage *msg);

 /*
 * API
@@ -155,17 +176,17 @@ typedef struct
 	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
 	NeonResponse *(*receive) (shardno_t shard_no);
 	bool		(*flush) (shardno_t shard_no);
-}			page_server_api;
+} page_server_api;

 extern void prefetch_on_ps_disconnect(void);

-extern page_server_api * page_server;
+extern page_server_api *page_server;

 extern char *page_server_connstring;
-extern int flush_every_n_requests;
-extern int readahead_buffer_size;
+extern int	flush_every_n_requests;
+extern int	readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
-extern int seqscan_prefetch_distance;
+extern int	seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern bool wal_redo;
@@ -202,14 +223,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-							 XLogRecPtr request_lsn, bool request_latest, char *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-							 XLogRecPtr request_lsn, bool request_latest, void *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp)
 	if (wp->propTermHistory.entries != NULL)
 		pfree(wp->propTermHistory.entries);
 	wp->propTermHistory.entries = NULL;
-	
+
 	pfree(wp);
 }

@@ -275,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
 											   wp->config->safekeeper_connection_timeout))
 				{
 					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-						 sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
+								sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -395,7 +395,7 @@ ResetConnection(Safekeeper *sk)
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
 		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk));
+					sk->host, sk->port, wp->api.conn_error_message(sk));

 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -489,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_OFFLINE:
 			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
-				 sk->host, sk->port);
+						sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */

@@ -525,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_VOTING:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -554,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_IDLE:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -580,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	{
 		case WP_CONN_POLLING_OK:
 			walprop_log(LOG, "connected with node %s:%s", sk->host,
-				 sk->port);
+						sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 			/*
@@ -604,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)

 		case WP_CONN_POLLING_FAILED:
 			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk));
+						sk->host, sk->port, wp->api.conn_error_message(sk));

 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -641,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
 		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk));
+					sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -678,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)

 		case WP_EXEC_FAILED:
 			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk));
+						sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;

@@ -689,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
 			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
-				 sk->host, sk->port);
+						sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -758,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	{
 		/* Another compute with higher term is running. */
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			 sk->host, sk->port,
-			 sk->greetResponse.term, wp->propTerm);
+					sk->host, sk->port,
+					sk->greetResponse.term, wp->propTerm);
 	}

 	/*
@@ -817,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	walprop_log(LOG,
-		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-		 LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-		 LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -832,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			 sk->host, sk->port,
-			 sk->voteResponse.term, wp->propTerm);
+					sk->host, sk->port,
+					sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);

@@ -877,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
 	if (wp->truncateLsn < wp->propEpochStartLsn)
 	{
 		walprop_log(LOG,
-			 "start recovery because truncateLsn=%X/%X is not "
-			 "equal to epochStartLsn=%X/%X",
-			 LSN_FORMAT_ARGS(wp->truncateLsn),
-			 LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+					"start recovery because truncateLsn=%X/%X is not "
+					"equal to epochStartLsn=%X/%X",
+					LSN_FORMAT_ARGS(wp->truncateLsn),
+					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 		/* Perform recovery */
 		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
 			walprop_log(FATAL, "Failed to recover state");
@@ -990,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
 					walprop_log(WARNING,
-						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-						 LSN_FORMAT_ARGS(wp->timelineStartLsn),
-						 LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+								LSN_FORMAT_ARGS(wp->timelineStartLsn),
+								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -1038,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

 	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		 wp->quorum,
-		 wp->propTerm,
-		 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-		 wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-		 LSN_FORMAT_ARGS(wp->truncateLsn));
+				wp->quorum,
+				wp->propTerm,
+				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+				LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1070,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
 											walprop_shared->mineLastElectedTerm)))
 			{
 				walprop_log(PANIC,
-					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-					 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}

 	/*
-	 * WalProposer has just elected itself and initialized history, so
-	 * we can call election callback. Usually it updates truncateLsn to
-	 * fetch WAL for logical replication.
+	 * WalProposer has just elected itself and initialized history, so we can
+	 * call election callback. Usually it updates truncateLsn to fetch WAL for
+	 * logical replication.
 	 */
 	wp->api.after_election(wp);
 }
@@ -1155,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
 			sk->startStreamingAt = wp->truncateLsn;

 			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-				 sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-				 LSN_FORMAT_ARGS(sk->startStreamingAt));
+						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+						LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
 	}
 	else
@@ -1190,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)

 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
 	walprop_log(LOG,
-		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1355,11 +1355,11 @@ SendAppendRequests(Safekeeper *sk)
 		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

 		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-						req->endLsn - req->beginLsn,
-						LSN_FORMAT_ARGS(req->beginLsn),
-						LSN_FORMAT_ARGS(req->endLsn),
-						LSN_FORMAT_ARGS(req->commitLsn),
-						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+					req->endLsn - req->beginLsn,
+					LSN_FORMAT_ARGS(req->beginLsn),
+					LSN_FORMAT_ARGS(req->endLsn),
+					LSN_FORMAT_ARGS(req->commitLsn),
+					LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

 		resetStringInfo(&sk->outbuf);

@@ -1398,8 +1398,8 @@ SendAppendRequests(Safekeeper *sk)

 			case PG_ASYNC_WRITE_FAIL:
 				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					 sk->host, sk->port, FormatSafekeeperState(sk->state),
-					 wp->api.conn_error_message(sk));
+							sk->host, sk->port, FormatSafekeeperState(sk->state),
+							wp->api.conn_error_message(sk));
 				ShutdownConnection(sk);
 				return false;
 			default:
@@ -1438,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
 			break;

 		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-						sk->appendResponse.term,
-						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-						sk->host, sk->port);
+					sk->appendResponse.term,
+					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+					sk->host, sk->port);

 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/* Another compute with higher term is running. */
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-				 sk->host, sk->port,
-				 sk->appendResponse.term, wp->propTerm);
+						sk->host, sk->port,
+						sk->appendResponse.term, wp->propTerm);
 		}

 		readAnything = true;
@@ -1493,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-				 rf->currentClusterSize);
+						rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
@@ -1501,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->last_received_lsn));
+						LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
@@ -1509,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
@@ -1517,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1530,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
 				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-					 rf->replytime, replyTimeStr);
+							rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
 			}
@@ -1700,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)

 		case PG_ASYNC_READ_FAIL:
 			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1740,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	if (tag != anymsg->tag)
 	{
 		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-			 sk->port, FormatSafekeeperState(sk->state));
+					sk->port, FormatSafekeeperState(sk->state));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1816,8 +1816,8 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
 		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-			 sk->host, sk->port, FormatSafekeeperState(sk->state),
-			 wp->api.conn_error_message(sk));
+					sk->host, sk->port, FormatSafekeeperState(sk->state),
+					wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1863,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
 			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1902,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
 			return false;
 		case -1:
 			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -2008,7 +2008,7 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * and then an assertion that's guaranteed to fail.
 		 */
 		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			 FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2111,7 +2111,7 @@ FormatEvents(WalProposer *wp, uint32 events)
 	if (events & (~all_flags))
 	{
 		walprop_log(WARNING, "Event formatting found unexpected component %d",
-			 events & (~all_flags));
+					events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -356,7 +356,8 @@ typedef struct Safekeeper


 	/* postgres-specific fields */
-	#ifndef WALPROPOSER_LIB
+#ifndef WALPROPOSER_LIB
+
 	/*
 	 * postgres protocol connection to the WAL acceptor
 	 *
@@ -374,17 +375,18 @@ typedef struct Safekeeper
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
-	#endif
+#endif


 	/* WalProposer library specifics */
-	#ifdef WALPROPOSER_LIB
+#ifdef WALPROPOSER_LIB
+
 	/*
 	 * Buffer for incoming messages. Usually Rust vector is stored here.
 	 * Caller is responsible for freeing the buffer.
 	 */
 	StringInfoData inbuf;
-	#endif
+#endif
 } Safekeeper;

 /* Re-exported PostgresPollingStatusType */
@@ -472,7 +474,7 @@ typedef struct walproposer_api
 	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);

 	/* Start the connection, aka PQconnectStart. */
-	void (*conn_connect_start) (Safekeeper *sk);
+	void		(*conn_connect_start) (Safekeeper *sk);

 	/* Poll an asynchronous connection, aka PQconnectPoll. */
 	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
@@ -490,7 +492,7 @@ typedef struct walproposer_api
 	void		(*conn_finish) (Safekeeper *sk);

 	/*
-	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData. 
+	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
 	 *
 	 * On success, the data is placed in *buf. It is valid until the next call
 	 * to this function.
@@ -510,7 +512,7 @@ typedef struct walproposer_api
 	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

 	/* Allocate WAL reader. */
-	void (*wal_reader_allocate) (Safekeeper *sk);
+	void		(*wal_reader_allocate) (Safekeeper *sk);

 	/* Deallocate event set. */
 	void		(*free_event_set) (WalProposer *wp);
@@ -572,7 +574,7 @@ typedef struct walproposer_api
 	/*
 	 * Called right after the proposer was elected, but before it started
 	 * recovery and sent ProposerElected message to the safekeepers.
-	 * 
+	 *
 	 * Used by logical replication to update truncateLsn.
 	 */
 	void		(*after_election) (WalProposer *wp);
@@ -626,10 +628,10 @@ typedef struct WalProposerConfig
 	uint64		systemId;

 	/* Will be passed to safekeepers in greet request. */
-	TimeLineID  pgTimeline;
+	TimeLineID	pgTimeline;

 #ifdef WALPROPOSER_LIB
-	void *callback_data;
+	void	   *callback_data;
 #endif
 } WalProposerConfig;

@@ -710,10 +712,11 @@ extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);


-#define WPEVENT		1337	/* special log level for walproposer internal events */
+#define WPEVENT		1337		/* special log level for walproposer internal
+								 * events */

 #ifdef WALPROPOSER_LIB
-void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
+extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
 #define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
 #define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -9,8 +9,9 @@
 #include "utils/datetime.h"
 #include "miscadmin.h"

-void ExceptionalCondition(const char *conditionName,
-						  const char *fileName, int lineNumber)
+void
+ExceptionalCondition(const char *conditionName,
+					 const char *fileName, int lineNumber)
 {
 	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
 			fileName, lineNumber, conditionName);
@@ -169,17 +170,18 @@ timestamptz_to_str(TimestampTz t)

 bool
 TimestampDifferenceExceeds(TimestampTz start_time,
-								TimestampTz stop_time,
-								int msec)
+						   TimestampTz stop_time,
+						   int msec)
 {
 	TimestampTz diff = stop_time - start_time;
+
 	return (diff >= msec * INT64CONST(1000));
 }

 void
-WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
 {
-	char buf[1024];
+	char		buf[1024];
 	va_list		args;

 	fmt = _(fmt);
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -637,8 +637,8 @@ walprop_connect_start(Safekeeper *sk)
 	 */
 	sk->conn = palloc(sizeof(WalProposerConn));
 	sk->conn->pg_conn = pg_conn;
-	sk->conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
+	sk->conn->is_nonblocking = false;	/* connections always start in
+										 * blocking mode */
 	sk->conn->recvbuf = NULL;
 }

@@ -1291,10 +1291,11 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
 	/*
 	 * Apart from walproposer, basebackup LSN page is also written out by
 	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
-	 * here to avoid dummy page overwriting correct one we download here. Ugly,
-	 * but alternatives are about the same ugly. We won't need that if we switch
-	 * to on-demand WAL download from safekeepers, without writing to disk.
+	 * inherently dummy (only safekeepers have historic WAL). Update WAL
+	 * buffers here to avoid dummy page overwriting correct one we download
+	 * here. Ugly, but alternatives are about the same ugly. We won't need
+	 * that if we switch to on-demand WAL download from safekeepers, without
+	 * writing to disk.
 	 *
 	 * https://github.com/neondatabase/neon/issues/5749
 	 */
@@ -1681,17 +1682,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 static void
 walprop_pg_after_election(WalProposer *wp)
 {
-	FILE* f;
-	XLogRecPtr lrRestartLsn;
+	FILE	   *f;
+	XLogRecPtr	lrRestartLsn;

-	/* We don't need to do anything in syncSafekeepers mode.*/
+	/* We don't need to do anything in syncSafekeepers mode. */
 	if (wp->config->syncSafekeepers)
 		return;

 	/*
-	 * If there are active logical replication subscription we need
-	 * to provide enough WAL for their WAL senders based on th position
-	 * of their replication slots.
+	 * If there are active logical replication subscriptions we need to provide
+	 * enough WAL for their WAL senders based on the position of their
+	 * replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
 	if (f != NULL && !wp->config->syncSafekeepers)
@@ -1700,8 +1701,12 @@ walprop_pg_after_election(WalProposer *wp)
 		fclose(f);
 		if (lrRestartLsn != InvalidXLogRecPtr)
 		{
-			elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
-			/* start from the beginning of the segment to fetch page headers verifed by XLogReader */
+			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+
+			/*
+			 * start from the beginning of the segment to fetch page headers
+			 * verified by XLogReader
+			 */
 			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
 			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
 		}
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]

 [[package]]
 name = "aiopg"
-version = "1.3.4"
+version = "1.4.0"
 description = "Postgres integration with asyncio."
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
-    {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
+    {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
+    {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
 ]

 [package.dependencies]
 async-timeout = ">=3.0,<5.0"
-psycopg2-binary = ">=2.8.4"
+psycopg2-binary = ">=2.9.5"

 [package.extras]
 sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
@@ -160,64 +160,71 @@ pluggy = ">=0.4.0"

 [[package]]
 name = "async-timeout"
-version = "4.0.2"
+version = "4.0.3"
 description = "Timeout context manager for asyncio programs"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
-    {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
+    {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
+    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
 ]

 [[package]]
 name = "asyncpg"
-version = "0.27.0"
+version = "0.29.0"
 description = "An asyncio PostgreSQL driver"
 optional = false
-python-versions = ">=3.7.0"
+python-versions = ">=3.8.0"
 files = [
-    {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
-    {file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
-    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
-    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
-    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
-    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
-    {file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
-    {file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
-    {file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
-    {file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
-    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
-    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
-    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
-    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
-    {file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
-    {file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
-    {file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
-    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
-    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
-    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
-    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
-    {file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
-    {file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
-    {file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
-    {file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
-    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
-    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
-    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
-    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
-    {file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
-    {file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
-    {file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
+    {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
+    {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
+    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
+    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
+    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
+    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
+    {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
+    {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
+    {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
+    {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
+    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
+    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
+    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
+    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
+    {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
+    {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
+    {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
+    {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
+    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
+    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
+    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
+    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
+    {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
+    {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
+    {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
+    {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
+    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
+    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
+    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
+    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
+    {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
+    {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
+    {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
+    {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
+    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
+    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
+    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
+    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
+    {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
+    {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
+    {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
 ]

+[package.dependencies]
+async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}
+
 [package.extras]
-dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
-docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
-test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
+docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
+test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]

 [[package]]
 name = "attrs"
@@ -2476,6 +2483,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2697,4 +2714,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
+content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

+[features]
+default = []
+testing = []
+
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
@@ -57,6 +61,7 @@ thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
+tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -69,13 +74,12 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
+postgres-protocol.workspace = true
 smol_str.workspace = true

 workspace_hack.workspace = true
-tokio-util.workspace = true

 [dev-dependencies]
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
-postgres-protocol.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,9 +3,11 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
+use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;

 use crate::auth::credentials::check_peer_addr_is_in_list;
+use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
@@ -24,31 +26,12 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::net::IpAddr;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};

-/// A product of successful authentication.
-pub struct AuthSuccess<T> {
-    /// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client?
-    pub reported_auth_ok: bool,
-    /// Something to be considered a positive result.
-    pub value: T,
-}
-
-impl<T> AuthSuccess<T> {
-    /// Very similar to [`std::option::Option::map`].
-    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
-    /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
-        AuthSuccess {
-            reported_auth_ok: self.reported_auth_ok,
-            value: f(self.value),
-        }
-    }
-}
-
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -61,9 +44,11 @@ pub enum BackendType<'a, T> {
    /// Current Cloud API (V2).
    Console(Cow<'a, console::provider::neon::Api>, T),
    /// Local mock of Cloud API (V2).
+    #[cfg(feature = "testing")]
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
+    #[cfg(test)]
    /// Test backend.
    Test(&'a dyn TestBackend),
 }
@@ -78,8 +63,10 @@ impl std::fmt::Display for BackendType<'_, ()> {
        use BackendType::*;
        match self {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
+            #[cfg(feature = "testing")]
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            #[cfg(test)]
            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
@@ -92,8 +79,10 @@ impl<T> BackendType<'_, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(Cow::Borrowed(c), x),
+            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
+            #[cfg(test)]
            Test(x) => Test(*x),
        }
    }
@@ -107,8 +96,10 @@ impl<'a, T> BackendType<'a, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(c, f(x)),
+            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
+            #[cfg(test)]
            Test(x) => Test(x),
        }
    }
@@ -121,51 +112,87 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
        use BackendType::*;
        match self {
            Console(c, x) => x.map(|x| Console(c, x)),
+            #[cfg(feature = "testing")]
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
+            #[cfg(test)]
            Test(x) => Ok(Test(x)),
        }
    }
 }

-pub enum ComputeCredentials {
+pub struct ComputeCredentials<T> {
+    pub info: ComputeUserInfo,
+    pub keys: T,
+}
+
+pub struct ComputeUserInfoNoEndpoint {
+    pub user: SmolStr,
+    pub peer_addr: IpAddr,
+    pub cache_key: SmolStr,
+}
+
+pub struct ComputeUserInfo {
+    pub endpoint: SmolStr,
+    pub inner: ComputeUserInfoNoEndpoint,
+}
+
+pub enum ComputeCredentialKeys {
+    #[cfg(feature = "testing")]
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
 }

+impl TryFrom<ClientCredentials> for ComputeUserInfo {
+    // credentials without an endpoint; returned when no endpoint (project) was supplied
+    type Error = ComputeUserInfoNoEndpoint;
+
+    fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
+        let inner = ComputeUserInfoNoEndpoint {
+            user: creds.user,
+            peer_addr: creds.peer_addr,
+            cache_key: creds.cache_key,
+        };
+        match creds.project {
+            None => Err(inner),
+            Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
+        }
+    }
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks_creds(
+///
+/// All authentication flows will emit an AuthenticationOk message if successful.
+async fn auth_quirks(
    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
+    extra: &ConsoleReqExtra,
+    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
-    let maybe_success = if creds.project.is_none() {
-        // Password will be checked by the compute node later.
-        Some(hacks::password_hack(creds, client, latency_timer).await?)
-    } else {
-        None
+    let (info, unauthenticated_password) = match creds.try_into() {
+        Err(info) => {
+            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
+            (res.info, Some(res.keys))
+        }
+        Ok(info) => (info, None),
    };

-    // Password hack should set the project name.
-    // TODO: make `creds.project` more type-safe.
-    assert!(creds.project.is_some());
    info!("fetching user's authentication info");
    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
    let AuthInfo {
        secret,
        allowed_ips,
-    } = api.get_auth_info(extra, creds).await?;
+    } = api.get_auth_info(extra, &info).await?;

    // check allowed list
-    if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
+    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
    let secret = secret.unwrap_or_else(|| {
@@ -173,36 +200,49 @@ async fn auth_quirks_creds(
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
-        AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
    });

-    if let Some(success) = maybe_success {
-        return Ok(success);
+    if let Some(password) = unauthenticated_password {
+        let auth_outcome = validate_password_and_exchange(&password, secret)?;
+        let keys = match auth_outcome {
+            crate::sasl::Outcome::Success(key) => key,
+            crate::sasl::Outcome::Failure(reason) => {
+                info!("auth backend failed with an error: {reason}");
+                return Err(auth::AuthError::auth_failed(&*info.inner.user));
+            }
+        };
+
+        // we have authenticated the password
+        client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;
+
+        return Ok(ComputeCredentials { info, keys });
    }

+    // -- the remaining flows are self-authenticating --
+
    // Perform cleartext auth if we're allowed to do that.
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
-        // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(client, latency_timer).await;
+        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(creds, client, config, latency_timer, secret).await
+    classic::authenticate(info, client, config, latency_timer, secret).await
 }

-/// True to its name, this function encapsulates our current auth trade-offs.
-/// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
+/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
+/// only if authentication was successful.
+async fn auth_and_wake_compute(
    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
+    extra: &ConsoleReqExtra,
+    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    let auth_stuff = auth_quirks_creds(
+) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
+    let compute_credentials = auth_quirks(
        api,
        extra,
        creds,
@@ -215,7 +255,7 @@ async fn auth_quirks(

    let mut num_retries = 0;
    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
+        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -232,27 +272,27 @@ async fn auth_quirks(
        tokio::time::sleep(wait_duration).await;
    };

-    match auth_stuff.value {
-        ComputeCredentials::Password(password) => node.config.password(password),
-        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    match compute_credentials.keys {
+        #[cfg(feature = "testing")]
+        ComputeCredentialKeys::Password(password) => node.config.password(password),
+        ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
    };

-    Ok(AuthSuccess {
-        reported_auth_ok: auth_stuff.reported_auth_ok,
-        value: node,
-    })
+    Ok((node, compute_credentials.info))
 }

-impl BackendType<'_, ClientCredentials<'_>> {
+impl<'a> BackendType<'a, ClientCredentials> {
    /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<String> {
+    pub fn get_endpoint(&self) -> Option<SmolStr> {
        use BackendType::*;

        match self {
            Console(_, creds) => creds.project.clone(),
+            #[cfg(feature = "testing")]
            Postgres(_, creds) => creds.project.clone(),
-            Link(_) => Some("link".to_owned()),
-            Test(_) => Some("test".to_owned()),
+            Link(_) => Some("link".into()),
+            #[cfg(test)]
+            Test(_) => Some("test".into()),
        }
    }

@@ -261,9 +301,11 @@ impl BackendType<'_, ClientCredentials<'_>> {
        use BackendType::*;

        match self {
-            Console(_, creds) => creds.user,
-            Postgres(_, creds) => creds.user,
+            Console(_, creds) => &creds.user,
+            #[cfg(feature = "testing")]
+            Postgres(_, creds) => &creds.user,
            Link(_) => "link",
+            #[cfg(test)]
            Test(_) => "test",
        }
    }
@@ -271,26 +313,25 @@ impl BackendType<'_, ClientCredentials<'_>> {
    /// Authenticate the client via the requested backend, possibly using credentials.
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
-        &mut self,
-        extra: &ConsoleReqExtra<'_>,
+        self,
+        extra: &ConsoleReqExtra,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        latency_timer: &mut LatencyTimer,
-    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
        use BackendType::*;

        let res = match self {
            Console(api, creds) => {
                info!(
-                    user = creds.user,
+                    user = &*creds.user,
                    project = creds.project(),
                    "performing authentication using the console"
                );

-                let api = api.as_ref();
-                auth_quirks(
-                    api,
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
                    extra,
                    creds,
                    client,
@@ -298,18 +339,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    config,
                    latency_timer,
                )
-                .await?
+                .await?;
+                (cache_info, BackendType::Console(api, user_info))
            }
+            #[cfg(feature = "testing")]
            Postgres(api, creds) => {
                info!(
-                    user = creds.user,
+                    user = &*creds.user,
                    project = creds.project(),
                    "performing authentication using a local postgres instance"
                );

-                let api = api.as_ref();
-                auth_quirks(
-                    api,
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
                    extra,
                    creds,
                    client,
@@ -317,16 +359,21 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    config,
                    latency_timer,
                )
-                .await?
+                .await?;
+                (cache_info, BackendType::Postgres(api, user_info))
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

-                link::authenticate(url, client)
-                    .await?
-                    .map(CachedNodeInfo::new_uncached)
+                let node_info = link::authenticate(&url, client).await?;
+
+                (
+                    CachedNodeInfo::new_uncached(node_info),
+                    BackendType::Link(url),
+                )
            }
+            #[cfg(test)]
            Test(_) => {
                unreachable!("this function should never be called in the test backend")
            }
@@ -335,16 +382,20 @@ impl BackendType<'_, ClientCredentials<'_>> {
        info!("user successfully authenticated");
        Ok(res)
    }
+}

+impl BackendType<'_, ComputeUserInfo> {
    pub async fn get_allowed_ips(
        &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        use BackendType::*;
        match self {
            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
+            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
            Link(_) => Ok(Arc::new(vec![])),
+            #[cfg(test)]
            Test(x) => x.get_allowed_ips(),
        }
    }
@@ -353,14 +404,16 @@ impl BackendType<'_, ClientCredentials<'_>> {
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
    pub async fn wake_compute(
        &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
    ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
        use BackendType::*;

        match self {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
+            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
+            #[cfg(test)]
            Test(x) => x.wake_compute().map(Some),
        }
    }
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,6 +1,6 @@
-use super::{AuthSuccess, ComputeCredentials};
+use super::{ComputeCredentials, ComputeUserInfo};
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, backend::ComputeCredentialKeys, AuthFlow},
    compute,
    config::AuthenticationConfig,
    console::AuthSecret,
@@ -12,14 +12,15 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

 pub(super) async fn authenticate(
-    creds: &ClientCredentials<'_>,
+    creds: ComputeUserInfo,
    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
    secret: AuthSecret,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    let flow = AuthFlow::new(client);
    let scram_keys = match secret {
+        #[cfg(feature = "testing")]
        AuthSecret::Md5(_) => {
            info!("auth endpoint chooses MD5");
            return Err(auth::AuthError::bad_auth_method("MD5"));
@@ -53,7 +54,7 @@ pub(super) async fn authenticate(
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
-                    return Err(auth::AuthError::auth_failed(creds.user));
+                    return Err(auth::AuthError::auth_failed(&*creds.inner.user));
                }
            };

@@ -64,9 +65,9 @@ pub(super) async fn authenticate(
        }
    };

-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+    Ok(ComputeCredentials {
+        info: creds,
+        keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
            scram_keys,
        )),
    })
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,7 +1,11 @@
-use super::{AuthSuccess, ComputeCredentials};
+use super::{
+    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
+};
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, AuthFlow},
+    console::AuthSecret,
    proxy::LatencyTimer,
+    sasl,
    stream::{self, Stream},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -11,35 +15,42 @@ use tracing::{info, warn};
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
-pub async fn cleartext_hack(
+pub async fn authenticate_cleartext(
+    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+    secret: AuthSecret,
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    warn!("cleartext auth flow override is enabled, proceeding");

    // pause the timer while we communicate with the client
    let _paused = latency_timer.pause();

-    let password = AuthFlow::new(client)
-        .begin(auth::CleartextPassword)
+    let auth_outcome = AuthFlow::new(client)
+        .begin(auth::CleartextPassword(secret))
        .await?
        .authenticate()
        .await?;

-    // Report tentative success; compute node will check the password anyway.
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::Password(password),
-    })
+    let keys = match auth_outcome {
+        sasl::Outcome::Success(key) => key,
+        sasl::Outcome::Failure(reason) => {
+            info!("auth backend failed with an error: {reason}");
+            return Err(auth::AuthError::auth_failed(&*info.inner.user));
+        }
+    };
+
+    Ok(ComputeCredentials { info, keys })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
-/// Very similar to [`cleartext_hack`], but there's a specific password format.
-pub async fn password_hack(
-    creds: &mut ClientCredentials<'_>,
+/// Similar to [`authenticate_cleartext`], but there's a specific password format,
+/// and passwords are not yet validated (we don't know how to validate them!)
+pub async fn password_hack_no_authentication(
+    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<Vec<u8>>> {
    warn!("project not specified, resorting to the password hack auth flow");

    // pause the timer while we communicate with the client
@@ -48,15 +59,17 @@ pub async fn password_hack(
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
        .await?
-        .authenticate()
+        .get_password()
        .await?;

-    info!(project = &payload.endpoint, "received missing parameter");
-    creds.project = Some(payload.endpoint);
+    info!(project = &*payload.endpoint, "received missing parameter");

    // Report tentative success; compute node will check the password anyway.
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::Password(payload.password),
+    Ok(ComputeCredentials {
+        info: ComputeUserInfo {
+            inner: info,
+            endpoint: payload.endpoint,
+        },
+        keys: payload.password,
    })
 }
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,4 +1,3 @@
-use super::AuthSuccess;
 use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
@@ -57,7 +56,7 @@ pub fn new_psql_session_id() -> String {
 pub(super) async fn authenticate(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<NodeInfo>> {
+) -> auth::Result<NodeInfo> {
    let psql_session_id = new_psql_session_id();
    let span = info_span!("link", psql_session_id = &psql_session_id);
    let greeting = hello_message(link_uri, &psql_session_id);
@@ -102,12 +101,9 @@ pub(super) async fn authenticate(
        config.password(password.as_ref());
    }

-    Ok(AuthSuccess {
-        reported_auth_ok: true,
-        value: NodeInfo {
-            config,
-            aux: db_info.aux,
-            allow_self_signed_compute: false, // caller may override
-        },
+    Ok(NodeInfo {
+        config,
+        aux: db_info.aux,
+        allow_self_signed_compute: false, // caller may override
    })
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -3,14 +3,12 @@
 use crate::{
    auth::password_hack::parse_endpoint_param,
    error::UserFacingError,
-    proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{
-    collections::HashSet,
-    net::{IpAddr, SocketAddr},
-};
+use smol_str::SmolStr;
+use std::{collections::HashSet, net::IpAddr};
 use thiserror::Error;
 use tracing::{info, warn};

@@ -24,7 +22,7 @@ pub enum ClientCredsParseError {
         SNI ('{}') and project option ('{}').",
        .domain, .option,
    )]
-    InconsistentProjectNames { domain: String, option: String },
+    InconsistentProjectNames { domain: SmolStr, option: SmolStr },

    #[error(
        "Common name inferred from SNI ('{}') is not known",
@@ -33,7 +31,7 @@ pub enum ClientCredsParseError {
    UnknownCommonName { cn: String },

    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(String),
+    MalformedProjectName(SmolStr),
 }

 impl UserFacingError for ClientCredsParseError {}
@@ -41,34 +39,34 @@ impl UserFacingError for ClientCredsParseError {}
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ClientCredentials<'a> {
-    pub user: &'a str,
+pub struct ClientCredentials {
+    pub user: SmolStr,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<String>,
+    pub project: Option<SmolStr>,

-    pub cache_key: String,
-    pub peer_addr: SocketAddr,
+    pub cache_key: SmolStr,
+    pub peer_addr: IpAddr,
 }

-impl ClientCredentials<'_> {
+impl ClientCredentials {
    #[inline]
    pub fn project(&self) -> Option<&str> {
        self.project.as_deref()
    }
 }

-impl<'a> ClientCredentials<'a> {
+impl ClientCredentials {
    pub fn parse(
-        params: &'a StartupMessageParams,
+        params: &StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<HashSet<String>>,
-        peer_addr: SocketAddr,
+        peer_addr: IpAddr,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user = get_param("user")?;
+        let user = get_param("user")?.into();

        // Project name might be passed via PG's command-line options.
        let project_option = params
@@ -82,7 +80,7 @@ impl<'a> ClientCredentials<'a> {
                    .at_most_one()
                    .ok()?
            })
-            .map(|name| name.to_string());
+            .map(|name| name.into());

        let project_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
@@ -121,7 +119,7 @@ impl<'a> ClientCredentials<'a> {
        }
        .transpose()?;

-        info!(user, project = project.as_deref(), "credentials");
+        info!(%user, project = project.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
@@ -142,8 +140,9 @@ impl<'a> ClientCredentials<'a> {
        let cache_key = format!(
            "{}{}",
            project.as_deref().unwrap_or(""),
-            neon_options(params).unwrap_or("".to_string())
-        );
+            neon_options_str(params)
+        )
+        .into();

        Ok(Self {
            user,
@@ -206,10 +205,10 @@ fn project_name_valid(name: &str) -> bool {
    name.chars().all(|c| c.is_alphanumeric() || c == '-')
 }

-fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
+fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
    sni.strip_suffix(common_name)?
        .strip_suffix('.')
-        .map(str::to_owned)
+        .map(SmolStr::from)
 }

 #[cfg(test)]
@@ -221,7 +220,7 @@ mod tests {
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);
@@ -236,7 +235,7 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);
@@ -251,7 +250,7 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -267,7 +266,7 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -282,7 +281,7 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -300,7 +299,7 @@ mod tests {
            ),
        ]);

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());
@@ -315,7 +314,7 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());
@@ -330,7 +329,7 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -344,13 +343,13 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

@@ -365,7 +364,7 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
            .expect_err("should fail");
        match err {
@@ -384,7 +383,7 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
            .expect_err("should fail");
        match err {
@@ -404,13 +403,10 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("project"));
-        assert_eq!(
-            creds.cache_key,
-            "projectneon_endpoint_type:read_write neon_lsn:0/2"
-        );
+        assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");

        Ok(())
    }
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -1,8 +1,9 @@
 //! Main authentication flow.

-use super::{AuthErrorImpl, PasswordHackPayload};
+use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
 use crate::{
    config::TlsServerEndPoint,
+    console::AuthSecret,
    sasl, scram,
    stream::{PqStream, Stream},
 };
@@ -50,7 +51,7 @@ impl AuthMethod for PasswordHack {

 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword;
+pub struct CleartextPassword(pub AuthSecret);

 impl AuthMethod for CleartextPassword {
    #[inline(always)]
@@ -98,7 +99,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {

 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
    /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
+    pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
@@ -117,13 +118,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {

 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
    /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
+    pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        Ok(password.to_vec())
+        let outcome = validate_password_and_exchange(password, self.state.0)?;
+
+        if let sasl::Outcome::Success(_) = &outcome {
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
+        Ok(outcome)
    }
 }

@@ -152,6 +159,49 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
            ))
            .await?;

+        if let sasl::Outcome::Success(_) = &outcome {
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
        Ok(outcome)
    }
 }
+
+pub(super) fn validate_password_and_exchange(
+    password: &[u8],
+    secret: AuthSecret,
+) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
+    match secret {
+        #[cfg(feature = "testing")]
+        AuthSecret::Md5(_) => {
+            // test only
+            Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
+                password.to_owned(),
+            )))
+        }
+        // perform scram authentication as both client and server to validate the keys
+        AuthSecret::Scram(scram_secret) => {
+            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
+            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
+            let outcome = crate::scram::exchange(
+                &scram_secret,
+                sasl_client,
+                crate::config::TlsServerEndPoint::Undefined,
+            )?;
+
+            let client_key = match outcome {
+                sasl::Outcome::Success(client_key) => client_key,
+                sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)),
+            };
+
+            let keys = crate::compute::ScramKeys {
+                client_key: client_key.as_bytes(),
+                server_key: scram_secret.server_key.as_bytes(),
+            };
+
+            Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
+                tokio_postgres::config::AuthKeys::ScramSha256(keys),
+            )))
+        }
+    }
+}
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,9 +4,10 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.

 use bstr::ByteSlice;
+use smol_str::SmolStr;

 pub struct PasswordHackPayload {
-    pub endpoint: String,
+    pub endpoint: SmolStr,
    pub password: Vec<u8>,
 }

@@ -18,7 +19,7 @@ impl PasswordHackPayload {
            if let Some((endpoint, password)) = bytes.split_once_str(sep) {
                let endpoint = endpoint.to_str().ok()?;
                return Some(Self {
-                    endpoint: parse_endpoint_param(endpoint)?.to_owned(),
+                    endpoint: parse_endpoint_param(endpoint)?.into(),
                    password: password.to_owned(),
                });
            }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc};
 use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
+use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;

 use anyhow::{anyhow, bail, ensure, Context};
@@ -20,7 +21,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::sync::CancellationToken;
 use utils::{project_git_version, sentry_init::init_sentry};

-use tracing::{error, info, warn, Instrument};
+use tracing::{error, info, Instrument};

 project_git_version!(GIT_VERSION);

@@ -151,63 +152,39 @@ async fn task_main(
    // will be inherited by all accepted client sockets.
    socket2::SockRef::from(&listener).set_keepalive(true)?;

-    let mut connections = tokio::task::JoinSet::new();
+    let connections = tokio_util::task::task_tracker::TaskTracker::new();

-    loop {
-        tokio::select! {
-            accept_result = listener.accept() => {
-                let (socket, peer_addr) = accept_result?;
+    while let Some(accept_result) =
+        run_until_cancelled(listener.accept(), &cancellation_token).await
+    {
+        let (socket, peer_addr) = accept_result?;

-                let session_id = uuid::Uuid::new_v4();
-                let tls_config = Arc::clone(&tls_config);
-                let dest_suffix = Arc::clone(&dest_suffix);
+        let session_id = uuid::Uuid::new_v4();
+        let tls_config = Arc::clone(&tls_config);
+        let dest_suffix = Arc::clone(&dest_suffix);

-                connections.spawn(
-                    async move {
-                        socket
-                            .set_nodelay(true)
-                            .context("failed to set socket option")?;
+        connections.spawn(
+            async move {
+                socket
+                    .set_nodelay(true)
+                    .context("failed to set socket option")?;

-                        info!(%peer_addr, "serving");
-                        handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
-                    }
-                    .unwrap_or_else(|e| {
-                        // Acknowledge that the task has finished with an error.
-                        error!("per-client task finished with an error: {e:#}");
-                    })
-                    .instrument(tracing::info_span!("handle_client", ?session_id))
-                );
+                info!(%peer_addr, "serving");
+                handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
            }
-            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
-            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
-            // This only counts for this loop and it will be enabled again on next `select!`.
-            //
-            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
-            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
-            // not get called again, even if there are more connections to remove.
-            Some(res) = connections.join_next() => {
-                if let Err(e) = res {
-                    if !e.is_panic() && !e.is_cancelled() {
-                        warn!("unexpected error from joined connection task: {e:?}");
-                    }
-                }
-            }
-            _ = cancellation_token.cancelled() => {
-                drop(listener);
-                break;
-            }
-        }
+            .unwrap_or_else(|e| {
+                // Acknowledge that the task has finished with an error.
+                error!("per-client task finished with an error: {e:#}");
+            })
+            .instrument(tracing::info_span!("handle_client", ?session_id)),
+        );
    }

-    // Drain connections
-    info!("waiting for all client connections to finish");
-    while let Some(res) = connections.join_next().await {
-        if let Err(e) = res {
-            if !e.is_panic() && !e.is_cancelled() {
-                warn!("unexpected error from joined connection task: {e:?}");
-            }
-        }
-    }
+    connections.close();
+    drop(listener);
+
+    connections.wait().await;
+
    info!("all client connections have finished");
    Ok(())
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -30,6 +30,7 @@ use clap::{Parser, ValueEnum};
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackend {
    Console,
+    #[cfg(feature = "testing")]
    Postgres,
    Link,
 }
@@ -289,6 +290,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            let api = console::provider::neon::Api::new(endpoint, caches, locks);
            auth::BackendType::Console(Cow::Owned(api), ())
        }
+        #[cfg(feature = "testing")]
        AuthBackend::Postgres => {
            let url = args.auth_endpoint.parse()?;
            let api = console::provider::mock::Api::new(url);
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,6 +1,6 @@
 use crate::{
    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    error::UserFacingError, proxy::is_neon_param,
+    error::UserFacingError, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -275,7 +275,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
+        .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -1,9 +1,10 @@
+#[cfg(feature = "testing")]
 pub mod mock;
 pub mod neon;

 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::ClientCredentials,
+    auth::backend::ComputeUserInfo,
    cache::{timed_lru, TimedLru},
    compute, scram,
 };
@@ -195,16 +196,28 @@ pub mod errors {
 }

 /// Extra query params we'd like to pass to the console.
-pub struct ConsoleReqExtra<'a> {
+pub struct ConsoleReqExtra {
    /// A unique identifier for a connection.
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
-    pub application_name: Option<&'a str>,
-    pub options: Option<&'a str>,
+    pub application_name: String,
+    pub options: Vec<(String, String)>,
+}
+
+impl ConsoleReqExtra {
+    // https://swagger.io/docs/specification/serialization/ DeepObject format
+    // paramName[prop1]=value1&paramName[prop2]=value2&....
+    pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
+        self.options
+            .iter()
+            .map(|(k, v)| (format!("options[{}]", k), v.to_string()))
+            .collect()
+    }
 }

 /// Auth secret which is managed by the cloud.
 pub enum AuthSecret {
+    #[cfg(feature = "testing")]
    /// Md5 hash of user's password.
    Md5([u8; 16]),

@@ -246,21 +259,21 @@ pub trait Api {
    /// Get the client's auth secret for authentication.
    async fn get_auth_info(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, errors::GetAuthInfoError>;

    async fn get_allowed_ips(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }

--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -6,7 +6,7 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
+use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use thiserror::Error;
@@ -47,7 +47,7 @@ impl Api {

    async fn do_get_auth_info(
        &self,
-        creds: &ClientCredentials<'_>,
+        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        let (secret, allowed_ips) = async {
            // Perhaps we could persist this connection, but then we'd have to
@@ -60,7 +60,7 @@ impl Api {
            let secret = match get_execute_postgres_query(
                &client,
                "select rolpassword from pg_catalog.pg_authid where rolname = $1",
-                &[&creds.user],
+                &[&&*creds.inner.user],
                "rolpassword",
            )
            .await?
@@ -71,14 +71,14 @@ impl Api {
                    secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
                }
                None => {
-                    warn!("user '{}' does not exist", creds.user);
+                    warn!("user '{}' does not exist", creds.inner.user);
                    None
                }
            };
            let allowed_ips = match get_execute_postgres_query(
                &client,
                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                &[&creds.project.clone().unwrap_or_default().as_str()],
+                &[&creds.endpoint.as_str()],
                "allowed_ips",
            )
            .await?
@@ -144,16 +144,16 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_auth_info(
        &self,
-        _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        _extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        self.do_get_auth_info(creds).await
    }

    async fn get_allowed_ips(
        &self,
-        _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        _extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
    }
@@ -161,8 +161,8 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
-        _extra: &ConsoleReqExtra<'_>,
-        _creds: &ClientCredentials,
+        _extra: &ConsoleReqExtra,
+        _creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        self.do_wake_compute()
            .map_ok(CachedNodeInfo::new_uncached)
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -5,12 +5,8 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::{
-    auth::ClientCredentials,
-    compute, http,
-    proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
-    scram,
-};
+use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
+use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
@@ -52,8 +48,8 @@ impl Api {

    async fn do_get_auth_info(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        let request_id = uuid::Uuid::new_v4().to_string();
        async {
@@ -64,9 +60,9 @@ impl Api {
                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
-                    ("application_name", extra.application_name),
-                    ("project", Some(creds.project().expect("impossible"))),
-                    ("role", Some(creds.user)),
+                    ("application_name", extra.application_name.as_str()),
+                    ("project", creds.endpoint.as_str()),
+                    ("role", creds.inner.user.as_str()),
                ])
                .build()?;

@@ -105,24 +101,28 @@ impl Api {

    async fn do_wake_compute(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<NodeInfo, WakeComputeError> {
-        let project = creds.project().expect("impossible");
        let request_id = uuid::Uuid::new_v4().to_string();
        async {
-            let request = self
+            let mut request_builder = self
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
-                    ("application_name", extra.application_name),
-                    ("project", Some(project)),
-                    ("options", extra.options),
-                ])
-                .build()?;
+                    ("application_name", extra.application_name.as_str()),
+                    ("project", creds.endpoint.as_str()),
+                ]);
+
+            request_builder = if extra.options.is_empty() {
+                request_builder
+            } else {
+                request_builder.query(&extra.options_as_deep_object())
+            };
+            let request = request_builder.build()?;

            info!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();
@@ -161,18 +161,18 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_auth_info(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        self.do_get_auth_info(extra, creds).await
    }

    async fn get_allowed_ips(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
-        let key: &str = creds.project().expect("impossible");
+        let key: &str = &creds.endpoint;
        if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
            ALLOWED_IPS_BY_CACHE_OUTCOME
                .with_label_values(&["hit"])
@@ -192,10 +192,10 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
-        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        extra: &ConsoleReqExtra,
+        creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key: &str = &creds.cache_key;
+        let key: &str = &creds.inner.cache_key;

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,7 +2,7 @@
 mod tests;

 use crate::{
-    auth::{self, backend::AuthSuccess},
+    auth,
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
@@ -24,7 +24,7 @@ use prometheus::{
    IntGaugeVec,
 };
 use regex::Regex;
-use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant};
+use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
    time,
@@ -277,6 +277,21 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

+pub async fn run_until_cancelled<F: std::future::Future>(
+    f: F,
+    cancellation_token: &CancellationToken,
+) -> Option<F::Output> {
+    match futures::future::select(
+        std::pin::pin!(f),
+        std::pin::pin!(cancellation_token.cancelled()),
+    )
+    .await
+    {
+        futures::future::Either::Left((f, _)) => Some(f),
+        futures::future::Either::Right(((), _)) => None,
+    }
+}
+
 pub async fn task_main(
    config: &'static ProxyConfig,
    listener: tokio::net::TcpListener,
@@ -290,71 +305,62 @@ pub async fn task_main(
    // will be inherited by all accepted client sockets.
    socket2::SockRef::from(&listener).set_keepalive(true)?;

-    let mut connections = tokio::task::JoinSet::new();
+    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    let cancel_map = Arc::new(CancelMap::default());

-    loop {
-        tokio::select! {
-            accept_result = listener.accept() => {
-                let (socket, peer_addr) = accept_result?;
+    while let Some(accept_result) =
+        run_until_cancelled(listener.accept(), &cancellation_token).await
+    {
+        let (socket, peer_addr) = accept_result?;

-                let session_id = uuid::Uuid::new_v4();
-                let cancel_map = Arc::clone(&cancel_map);
-                connections.spawn(
-                    async move {
-                        info!("accepted postgres client connection");
+        let session_id = uuid::Uuid::new_v4();
+        let cancel_map = Arc::clone(&cancel_map);
+        connections.spawn(
+            async move {
+                info!("accepted postgres client connection");

-                        let mut socket = WithClientIp::new(socket);
-                        let mut peer_addr = peer_addr;
-                        if let Some(ip) = socket.wait_for_addr().await? {
-                            peer_addr = ip;
-                            tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
-                        } else if config.require_client_ip {
-                            bail!("missing required client IP");
-                        }
-
-                        socket
-                            .inner
-                            .set_nodelay(true)
-                            .context("failed to set socket option")?;
-
-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await
-                    }
-                    .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
-                    .unwrap_or_else(move |e| {
-                        // Acknowledge that the task has finished with an error.
-                        error!(?session_id, "per-client task finished with an error: {e:#}");
-                    }),
-                );
-            }
-            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
-            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
-            // This only counts for this loop and it will be enabled again on next `select!`.
-            //
-            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
-            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
-            // not get called again, even if there are more connections to remove.
-            Some(res) = connections.join_next() => {
-                if let Err(e) = res {
-                    if !e.is_panic() && !e.is_cancelled() {
-                        warn!("unexpected error from joined connection task: {e:?}");
-                    }
+                let mut socket = WithClientIp::new(socket);
+                let mut peer_addr = peer_addr;
+                if let Some(ip) = socket.wait_for_addr().await? {
+                    peer_addr = ip;
+                    tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
+                } else if config.require_client_ip {
+                    bail!("missing required client IP");
                }
+
+                socket
+                    .inner
+                    .set_nodelay(true)
+                    .context("failed to set socket option")?;
+
+                handle_client(
+                    config,
+                    &cancel_map,
+                    session_id,
+                    socket,
+                    ClientMode::Tcp,
+                    peer_addr.ip(),
+                )
+                .await
            }
-            _ = cancellation_token.cancelled() => {
-                drop(listener);
-                break;
-            }
-        }
+            .instrument(info_span!(
+                "handle_client",
+                ?session_id,
+                peer_addr = tracing::field::Empty
+            ))
+            .unwrap_or_else(move |e| {
+                // Acknowledge that the task has finished with an error.
+                error!(?session_id, "per-client task finished with an error: {e:#}");
+            }),
+        );
    }
+
+    connections.close();
+    drop(listener);
+
    // Drain connections
-    while let Some(res) = connections.join_next().await {
-        if let Err(e) = res {
-            if !e.is_panic() && !e.is_cancelled() {
-                warn!("unexpected error from joined connection task: {e:?}");
-            }
-        }
-    }
+    connections.wait().await;
+
    Ok(())
 }

@@ -408,7 +414,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    session_id: uuid::Uuid,
    stream: S,
    mode: ClientMode,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<()> {
    info!(
        protocol = mode.protocol_label(),
@@ -665,8 +671,8 @@ fn report_error(e: &WakeComputeError, retry: bool) {
 pub async fn connect_to_compute<M: ConnectMechanism>(
    mechanism: &M,
    mut node_info: console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
+    extra: &console::ConsoleReqExtra,
+    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
    mut latency_timer: LatencyTimer,
 ) -> Result<M::Connection, M::Error>
 where
@@ -696,10 +702,12 @@ where
    let node_info = loop {
        let wake_res = match creds {
            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            #[cfg(feature = "testing")]
            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
            // nothing to do?
            auth::BackendType::Link(_) => return Err(err.into()),
            // test backend
+            #[cfg(test)]
            auth::BackendType::Test(x) => x.wake_compute(),
        };

@@ -838,7 +846,6 @@ pub fn retry_after(num_retries: u32) -> time::Duration {
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
    node: &compute::PostgresConnection,
-    reported_auth_ok: bool,
    session: cancellation::Session<'_>,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> anyhow::Result<()> {
@@ -846,13 +853,6 @@ async fn prepare_client_connection(
    // The new token (cancel_key_data) will be sent to the client.
    let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());

-    // Report authentication success if we haven't done this already.
-    // Note that we do this only (for the most part) after we've connected
-    // to a compute (see above) which performs its own authentication.
-    if !reported_auth_ok {
-        stream.write_message_noflush(&Be::AuthenticationOk)?;
-    }
-
    // Forward all postgres connection params to the client.
    // Right now the implementation is very hacky and inefficent (ideally,
    // we don't need an intermediate hashmap), but at least it should be correct.
@@ -921,7 +921,7 @@ struct Client<'a, S> {
    /// The underlying libpq protocol stream.
    stream: PqStream<Stream<S>>,
    /// Client credentials that we care about.
-    creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
+    creds: auth::BackendType<'a, auth::ClientCredentials>,
    /// KV-dictionary with PostgreSQL connection params.
    params: &'a StartupMessageParams,
    /// Unique connection ID.
@@ -934,7 +934,7 @@ impl<'a, S> Client<'a, S> {
    /// Construct a new connection context.
    fn new(
        stream: PqStream<Stream<S>>,
-        creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
+        creds: auth::BackendType<'a, auth::ClientCredentials>,
        params: &'a StartupMessageParams,
        session_id: uuid::Uuid,
        allow_self_signed_compute: bool,
@@ -953,7 +953,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    /// Let the client authenticate and connect to the designated compute node.
    // Instrumentation logs endpoint name everywhere. Doesn't work for link
    // auth; strictly speaking we don't know endpoint name in its case.
-    #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)]
+    #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
    async fn connect_to_db(
        self,
        session: cancellation::Session<'_>,
@@ -962,22 +962,25 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
-            mut creds,
+            creds,
            params,
            session_id,
            allow_self_signed_compute,
        } = self;

-        let console_options = neon_options(params);
-
+        let proto = mode.protocol_label();
        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
-            application_name: params.get("application_name"),
-            options: console_options.as_deref(),
+            application_name: format!(
+                "{}/{}",
+                params.get("application_name").unwrap_or_default(),
+                proto
+            ),
+            options: neon_options(params),
        };
+        let mut latency_timer = LatencyTimer::new(proto);

-        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
-
+        let user = creds.get_user().to_owned();
        let auth_result = match creds
            .authenticate(
                &extra,
@@ -990,7 +993,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        {
            Ok(auth_result) => auth_result,
            Err(e) => {
-                let user = creds.get_user();
                let db = params.get("database");
                let app = params.get("application_name");
                let params_span = tracing::info_span!("", ?user, ?db, ?app);
@@ -999,10 +1001,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            }
        };

-        let AuthSuccess {
-            reported_auth_ok,
-            value: mut node_info,
-        } = auth_result;
+        let (mut node_info, creds) = auth_result;

        node_info.allow_self_signed_compute = allow_self_signed_compute;

@@ -1017,7 +1016,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        .or_else(|e| stream.throw_error(e))
        .await?;

-        let proto = mode.protocol_label();
        NUM_DB_CONNECTIONS_OPENED_COUNTER
            .with_label_values(&[proto])
            .inc();
@@ -1025,7 +1023,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
        }

-        prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
+        prepare_client_connection(&node, session, &mut stream).await?;
        // Before proxy passing, forward to compute whatever data is left in the
        // PqStream input buffer. Normally there is none, but our serverless npm
        // driver in pipeline mode sends startup, password and first query
@@ -1036,26 +1034,29 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    }
 }

-pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
+pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> {
    #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
-        .filter(|opt| is_neon_param(opt))
-        .sorted() // we sort it to use as cache key
-        .intersperse(" ") // TODO: use impl from std once it's stabilized
-        .collect();
-
-    // Don't even bother with empty options.
-    if options.is_empty() {
-        return None;
+    match params.options_raw() {
+        Some(options) => options.filter_map(neon_option).collect(),
+        None => vec![],
    }
-
-    Some(options)
 }

-pub fn is_neon_param(bytes: &str) -> bool {
+pub fn neon_options_str(params: &StartupMessageParams) -> String {
+    #[allow(unstable_name_collisions)]
+    neon_options(params)
+        .iter()
+        .map(|(k, v)| format!("{}:{}", k, v))
+        .sorted() // we sort it to use as cache key
+        .intersperse(" ".to_owned())
+        .collect()
+}
+
+pub fn neon_option(bytes: &str) -> Option<(String, String)> {
    static RE: OnceCell<Regex> = OnceCell::new();
-    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
+    let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());

-    RE.get().unwrap().is_match(bytes)
+    let cap = re.captures(bytes)?;
+    let (_, [k, v]) = cap.extract();
+    Some((k.to_owned(), v.to_owned()))
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -3,8 +3,7 @@
 mod mitm;

 use super::*;
-use crate::auth::backend::TestBackend;
-use crate::auth::ClientCredentials;
+use crate::auth::backend::{ComputeUserInfo, TestBackend};
 use crate::config::CertResolver;
 use crate::console::{CachedNodeInfo, NodeInfo};
 use crate::{auth, http, sasl, scram};
@@ -109,8 +108,9 @@ fn generate_tls_config<'a>(
 trait TestAuth: Sized {
    async fn authenticate<S: AsyncRead + AsyncWrite + Unpin + Send>(
        self,
-        _stream: &mut PqStream<Stream<S>>,
+        stream: &mut PqStream<Stream<S>>,
    ) -> anyhow::Result<()> {
+        stream.write_message_noflush(&Be::AuthenticationOk)?;
        Ok(())
    }
 }
@@ -168,7 +168,6 @@ async fn dummy_proxy(
    auth.authenticate(&mut stream).await?;

    stream
-        .write_message_noflush(&Be::AuthenticationOk)?
        .write_message_noflush(&Be::CLIENT_ENCODING)?
        .write_message(&Be::ReadyForQuery)
        .await?;
@@ -485,14 +484,14 @@ fn helper_create_connect_info(
    mechanism: &TestConnectMechanism,
 ) -> (
    CachedNodeInfo,
-    console::ConsoleReqExtra<'static>,
-    auth::BackendType<'_, ClientCredentials<'static>>,
+    console::ConsoleReqExtra,
+    auth::BackendType<'_, ComputeUserInfo>,
 ) {
    let cache = helper_create_cached_node_info();
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
-        application_name: Some("TEST"),
-        options: None,
+        application_name: "TEST".into(),
+        options: vec![],
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -30,6 +30,9 @@ pub enum Error {
    #[error("Bad client message: {0}")]
    BadClientMessage(&'static str),

+    #[error("Internal error: missing digest")]
+    MissingBinding,
+
    #[error(transparent)]
    Io(#[from] io::Error),
 }
@@ -38,8 +41,7 @@ impl UserFacingError for Error {
    fn to_string_client(&self) -> String {
        use Error::*;
        match self {
-            // TODO: add support for channel binding
-            ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(),
+            ChannelBindingFailed(m) => m.to_string(),
            ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
            _ => "authentication protocol violation".to_string(),
        }
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -15,7 +15,7 @@ mod signature;
 #[cfg(any(test, doc))]
 mod password;

-pub use exchange::Exchange;
+pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
 pub use secret::ServerSecret;

--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -1,5 +1,9 @@
 //! Implementation of the SCRAM authentication algorithm.

+use std::convert::Infallible;
+
+use postgres_protocol::authentication::sasl::ScramSha256;
+
 use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
@@ -29,22 +33,27 @@ impl std::str::FromStr for TlsServerEndPoint {
    }
 }

+struct SaslSentInner {
+    cbind_flag: ChannelBinding<TlsServerEndPoint>,
+    client_first_message_bare: String,
+    server_first_message: OwnedServerFirstMessage,
+}
+
+struct SaslInitial {
+    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
+}
+
 enum ExchangeState {
    /// Waiting for [`ClientFirstMessage`].
-    Initial,
+    Initial(SaslInitial),
    /// Waiting for [`ClientFinalMessage`].
-    SaltSent {
-        cbind_flag: ChannelBinding<TlsServerEndPoint>,
-        client_first_message_bare: String,
-        server_first_message: OwnedServerFirstMessage,
-    },
+    SaltSent(SaslSentInner),
 }

 /// Server's side of SCRAM auth algorithm.
 pub struct Exchange<'a> {
    state: ExchangeState,
    secret: &'a ServerSecret,
-    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
    tls_server_end_point: config::TlsServerEndPoint,
 }

@@ -55,90 +64,160 @@ impl<'a> Exchange<'a> {
        tls_server_end_point: config::TlsServerEndPoint,
    ) -> Self {
        Self {
-            state: ExchangeState::Initial,
+            state: ExchangeState::Initial(SaslInitial { nonce }),
            secret,
-            nonce,
            tls_server_end_point,
        }
    }
 }

+pub fn exchange(
+    secret: &ServerSecret,
+    mut client: ScramSha256,
+    tls_server_end_point: config::TlsServerEndPoint,
+) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
+    use sasl::Step::*;
+
+    let init = SaslInitial {
+        nonce: rand::random,
+    };
+
+    let client_first = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
+        Continue(sent, server_first) => {
+            client.update(server_first.as_bytes())?;
+            sent
+        }
+        Success(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    let client_final = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
+        Success(keys, server_final) => {
+            client.finish(server_final.as_bytes())?;
+            keys
+        }
+        Continue(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    Ok(sasl::Outcome::Success(keys))
+}
+
+impl SaslInitial {
+    fn transition(
+        &self,
+        secret: &ServerSecret,
+        tls_server_end_point: &config::TlsServerEndPoint,
+        input: &str,
+    ) -> sasl::Result<sasl::Step<SaslSentInner, Infallible>> {
+        let client_first_message = ClientFirstMessage::parse(input)
+            .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
+
+        // If the flag is set to "y" and the server supports channel
+        // binding, the server MUST fail authentication
+        if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
+            && tls_server_end_point.supported()
+        {
+            return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
+        }
+
+        let server_first_message = client_first_message.build_server_first_message(
+            &(self.nonce)(),
+            &secret.salt_base64,
+            secret.iterations,
+        );
+        let msg = server_first_message.as_str().to_owned();
+
+        let next = SaslSentInner {
+            cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
+            client_first_message_bare: client_first_message.bare.to_owned(),
+            server_first_message,
+        };
+
+        Ok(sasl::Step::Continue(next, msg))
+    }
+}
+
+impl SaslSentInner {
+    fn transition(
+        &self,
+        secret: &ServerSecret,
+        tls_server_end_point: &config::TlsServerEndPoint,
+        input: &str,
+    ) -> sasl::Result<sasl::Step<Infallible, super::ScramKey>> {
+        let Self {
+            cbind_flag,
+            client_first_message_bare,
+            server_first_message,
+        } = self;
+
+        let client_final_message = ClientFinalMessage::parse(input)
+            .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
+
+        let channel_binding = cbind_flag.encode(|_| match tls_server_end_point {
+            config::TlsServerEndPoint::Sha256(x) => Ok(x),
+            config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
+        })?;
+
+        // This might've been caused by a MITM attack
+        if client_final_message.channel_binding != channel_binding {
+            return Err(SaslError::ChannelBindingFailed(
+                "insecure connection: secure channel data mismatch",
+            ));
+        }
+
+        if client_final_message.nonce != server_first_message.nonce() {
+            return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
+        }
+
+        let signature_builder = SignatureBuilder {
+            client_first_message_bare,
+            server_first_message: server_first_message.as_str(),
+            client_final_message_without_proof: client_final_message.without_proof,
+        };
+
+        let client_key = signature_builder
+            .build(&secret.stored_key)
+            .derive_client_key(&client_final_message.proof);
+
+        // Auth fails either if keys don't match or it's pre-determined to fail.
+        if client_key.sha256() != secret.stored_key || secret.doomed {
+            return Ok(sasl::Step::Failure("password doesn't match"));
+        }
+
+        let msg =
+            client_final_message.build_server_final_message(signature_builder, &secret.server_key);
+
+        Ok(sasl::Step::Success(client_key, msg))
+    }
+}
+
 impl sasl::Mechanism for Exchange<'_> {
    type Output = super::ScramKey;

    fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
        use {sasl::Step::*, ExchangeState::*};
        match &self.state {
-            Initial => {
-                let client_first_message = ClientFirstMessage::parse(input)
-                    .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
-
-                // If the flag is set to "y" and the server supports channel
-                // binding, the server MUST fail authentication
-                if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
-                    && self.tls_server_end_point.supported()
-                {
-                    return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
-                }
-
-                let server_first_message = client_first_message.build_server_first_message(
-                    &(self.nonce)(),
-                    &self.secret.salt_base64,
-                    self.secret.iterations,
-                );
-                let msg = server_first_message.as_str().to_owned();
-
-                self.state = SaltSent {
-                    cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
-                    client_first_message_bare: client_first_message.bare.to_owned(),
-                    server_first_message,
-                };
-
-                Ok(Continue(self, msg))
-            }
-            SaltSent {
-                cbind_flag,
-                client_first_message_bare,
-                server_first_message,
-            } => {
-                let client_final_message = ClientFinalMessage::parse(input)
-                    .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
-
-                let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
-                    config::TlsServerEndPoint::Sha256(x) => Ok(x),
-                    config::TlsServerEndPoint::Undefined => {
-                        Err(SaslError::ChannelBindingFailed("no cert digest provided"))
+            Initial(init) => {
+                match init.transition(self.secret, &self.tls_server_end_point, input)? {
+                    Continue(sent, msg) => {
+                        self.state = SaltSent(sent);
+                        Ok(Continue(self, msg))
                    }
-                })?;
-
-                // This might've been caused by a MITM attack
-                if client_final_message.channel_binding != channel_binding {
-                    return Err(SaslError::ChannelBindingFailed("data mismatch"));
+                    Success(x, _) => match x {},
+                    Failure(msg) => Ok(Failure(msg)),
                }
-
-                if client_final_message.nonce != server_first_message.nonce() {
-                    return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
+            }
+            SaltSent(sent) => {
+                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
+                    Success(keys, msg) => Ok(Success(keys, msg)),
+                    Continue(x, _) => match x {},
+                    Failure(msg) => Ok(Failure(msg)),
                }
-
-                let signature_builder = SignatureBuilder {
-                    client_first_message_bare,
-                    server_first_message: server_first_message.as_str(),
-                    client_final_message_without_proof: client_final_message.without_proof,
-                };
-
-                let client_key = signature_builder
-                    .build(&self.secret.stored_key)
-                    .derive_client_key(&client_final_message.proof);
-
-                // Auth fails either if keys don't match or it's pre-determined to fail.
-                if client_key.sha256() != self.secret.stored_key || self.secret.doomed {
-                    return Ok(Failure("password doesn't match"));
-                }
-
-                let msg = client_final_message
-                    .build_server_final_message(signature_builder, &self.secret.server_key);
-
-                Ok(Success(client_key, msg))
            }
        }
    }
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ use anyhow::bail;
 use hyper::StatusCode;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio_util::task::TaskTracker;

 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
@@ -23,7 +24,7 @@ use hyper::{
    Body, Method, Request, Response,
 };

-use std::net::SocketAddr;
+use std::net::IpAddr;
 use std::task::Poll;
 use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
@@ -70,6 +71,9 @@ pub async fn task_main(
        incoming: addr_incoming,
    };

+    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
+    ws_connections.close(); // allows `ws_connections.wait` to complete
+
    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
        if let Err(err) = conn {
            error!("failed to accept TLS connection for websockets: {err:?}");
@@ -86,6 +90,7 @@ pub async fn task_main(
            let remote_addr = io.inner.remote_addr();
            let sni_name = tls.server_name().map(|s| s.to_string());
            let conn_pool = conn_pool.clone();
+            let ws_connections = ws_connections.clone();

            async move {
                let peer_addr = match client_addr {
@@ -97,13 +102,21 @@ pub async fn task_main(
                    move |req: Request<Body>| {
                        let sni_name = sni_name.clone();
                        let conn_pool = conn_pool.clone();
+                        let ws_connections = ws_connections.clone();

                        async move {
                            let cancel_map = Arc::new(CancelMap::default());
                            let session_id = uuid::Uuid::new_v4();

                            request_handler(
-                                req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr,
+                                req,
+                                config,
+                                conn_pool,
+                                ws_connections,
+                                cancel_map,
+                                session_id,
+                                sni_name,
+                                peer_addr.ip(),
                            )
                            .instrument(info_span!(
                                "serverless",
@@ -123,6 +136,9 @@ pub async fn task_main(
        .with_graceful_shutdown(cancellation_token.cancelled())
        .await?;

+    // await websocket connections
+    ws_connections.wait().await;
+
    Ok(())
 }

@@ -164,14 +180,16 @@ where
    }
 }

+#[allow(clippy::too_many_arguments)]
 async fn request_handler(
    mut request: Request<Body>,
    config: &'static ProxyConfig,
    conn_pool: Arc<conn_pool::GlobalConnPool>,
+    ws_connections: TaskTracker,
    cancel_map: Arc<CancelMap>,
    session_id: uuid::Uuid,
    sni_hostname: Option<String>,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> Result<Response<Body>, ApiError> {
    let host = request
        .headers()
@@ -187,7 +205,7 @@ async fn request_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::spawn(
+        ws_connections.spawn(
            async move {
                if let Err(e) = websocket::serve_websocket(
                    websocket,
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,4 +1,4 @@
-use anyhow::Context;
+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use dashmap::DashMap;
 use futures::future::poll_fn;
@@ -9,7 +9,7 @@ use pbkdf2::{
 };
 use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
-use std::{collections::HashMap, net::SocketAddr, sync::Arc};
+use std::{collections::HashMap, net::IpAddr, sync::Arc};
 use std::{
    fmt,
    task::{ready, Poll},
@@ -22,7 +22,7 @@ use tokio::time;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
-    auth::{self, check_peer_addr_is_in_list},
+    auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
    console,
    proxy::{
        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
@@ -37,7 +37,7 @@ use crate::proxy::ConnectMechanism;
 use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};

-pub const APP_NAME: &str = "sql_over_http";
+pub const APP_NAME: &str = "/sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;

 #[derive(Debug, Clone)]
@@ -146,7 +146,7 @@ impl GlobalConnPool {
        conn_info: &ConnInfo,
        force_new: bool,
        session_id: uuid::Uuid,
-        peer_addr: SocketAddr,
+        peer_addr: IpAddr,
    ) -> anyhow::Result<Client> {
        let mut client: Option<ClientInner> = None;
        let mut latency_timer = LatencyTimer::new("http");
@@ -406,7 +406,7 @@ async fn connect_to_compute(
    conn_id: uuid::Uuid,
    session_id: uuid::Uuid,
    latency_timer: LatencyTimer,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<ClientInner> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -423,19 +423,22 @@ async fn connect_to_compute(
        common_names,
        peer_addr,
    )?;
+
+    let creds =
+        ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
    let backend = config.auth_backend.as_ref().map(|_| creds);

    let console_options = neon_options(&params);

    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
-        application_name: Some(APP_NAME),
-        options: console_options.as_deref(),
+        application_name: APP_NAME.to_string(),
+        options: console_options,
    };
    // TODO(anna): this is a bit hacky way, consider using console notification listener.
    if !config.disable_ip_check_for_http {
        let allowed_ips = backend.get_allowed_ips(&extra).await?;
-        if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) {
+        if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
            return Err(auth::AuthError::ip_address_not_allowed().into());
        }
    }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,4 +1,4 @@
-use std::net::SocketAddr;
+use std::net::IpAddr;
 use std::sync::Arc;

 use anyhow::bail;
@@ -202,7 +202,7 @@ pub async fn handle(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
    config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
    let result = tokio::time::timeout(
@@ -301,7 +301,7 @@ async fn handle_inner(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<Response<Body>> {
    NUM_CONNECTIONS_ACCEPTED_COUNTER
        .with_label_values(&["http"])
--- a/Show More
+++ b/Show More