compaction_level0_phase1: change default to streaming-kmerge without validation

After this PR is merged, deployed, and guaranteed not to be rolled back, we can remove the `compact_level0_phase1_value_access` field from all `pageserver.toml` files. Refs: https://github.com/neondatabase/neon/issues/8184
2026-05-28 10:30:40 +00:00 · 2024-09-05 17:09:32 +02:00
113 changed files with 1044 additions and 2304 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,13 +7,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - AZURE_DEV_CLIENT_ID
-  - AZURE_DEV_REGISTRY_NAME
-  - AZURE_DEV_SUBSCRIPTION_ID
-  - AZURE_PROD_CLIENT_ID
-  - AZURE_PROD_REGISTRY_NAME
-  - AZURE_PROD_SUBSCRIPTION_ID
-  - AZURE_TENANT_ID
  - BENCHMARK_PROJECT_ID_PUB
  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -1,56 +0,0 @@
-name: Push images to ACR
-on:
-  workflow_call:
-    inputs:
-      client_id:
-        description: Client ID of Azure managed identity or Entra app
-        required: true
-        type: string
-      image_tag:
-        description: Tag for the container image
-        required: true
-        type: string
-      images:
-        description: Images to push
-        required: true
-        type: string
-      registry_name:
-        description: Name of the container registry
-        required: true
-        type: string
-      subscription_id:
-        description: Azure subscription ID
-        required: true
-        type: string
-      tenant_id:
-        description: Azure tenant ID
-        required: true
-        type: string
-
-jobs:
-  push-to-acr:
-    runs-on: ubuntu-22.04
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
-
-    steps:
-      - name: Azure login
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ inputs.client_id }}
-          subscription-id: ${{ inputs.subscription_id }}
-          tenant-id: ${{ inputs.tenant_id }}
-
-      - name: Login to ACR
-        run: |
-          az acr login --name=${{ inputs.registry_name }}
-
-      - name: Copy docker images to ACR ${{ inputs.registry_name }}
-        run: |
-          images='${{ inputs.images }}'
-          for image in ${images}; do
-            docker buildx imagetools create \
-              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
-          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -794,6 +794,9 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    permissions:
+      contents: read  # This is required for actions/checkout
+      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -820,6 +823,28 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

+      - name: Azure login
+        if: github.ref_name == 'main'
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+      - name: Login to ACR
+        if: github.ref_name == 'main'
+        run: |
+          az acr login --name=neoneastus2
+
+      - name: Copy docker images to ACR-dev
+        if: github.ref_name == 'main'
+        run: |
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+            docker buildx imagetools create \
+              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
+                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
+          done
+
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
@@ -857,30 +882,6 @@ jobs:
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done

-  push-to-acr-dev:
-    if: github.ref_name == 'main'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
-  push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
    runs-on: ubuntu-22.04
@@ -956,8 +957,8 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -7,11 +7,6 @@ on:
  pull_request_target:
    types:
      - opened
-  workflow_dispatch:
-    inputs:
-      github-actor:
-        description: 'GitHub username. If empty, the username of the current user will be used'
-        required: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -31,31 +26,12 @@ jobs:
      id: check-user
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        ACTOR: ${{ inputs.github-actor || github.actor }}
      run: |
-        expected_error="User does not exist or is not a member of the organization"
-        output_file=output.txt
-
-        for i in $(seq 1 10); do
-          if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
-
-            is_member=true
-            break
-          elif grep -q "${expected_error}" ${output_file}; then
-            is_member=false
-            break
-          elif [ $i -eq 10 ]; then
-            title="Failed to get memmbership status for ${ACTOR}"
-            message="The latest GitHub API error message: '$(cat ${output_file})'"
-            echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
-
-            exit 1
-          fi
-
-          sleep 1
-        done
+        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
+          is_member=true
+        else
+          is_member=false
+        fi

        echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -915,22 +915,25 @@ dependencies = [

 [[package]]
 name = "bindgen"
-version = "0.70.1"
+version = "0.65.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
+checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 1.3.2",
 "cexpr",
 "clang-sys",
- "itertools 0.12.1",
+ "lazy_static",
+ "lazycell",
 "log",
- "prettyplease 0.2.17",
+ "peeking_take_while",
+ "prettyplease 0.2.6",
 "proc-macro2",
 "quote",
 "regex",
 "rustc-hash",
 "shlex",
 "syn 2.0.52",
+ "which",
 ]

 [[package]]
@@ -2946,6 +2949,12 @@ dependencies = [
 "spin 0.5.2",
 ]

+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3968,6 +3977,12 @@ dependencies = [
 "sha2",
 ]

+[[package]]
+name = "peeking_take_while"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -4265,9 +4280,9 @@ dependencies = [

 [[package]]
 name = "prettyplease"
-version = "0.2.17"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
+checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
 "syn 2.0.52",
@@ -6079,9 +6094,8 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"

 [[package]]
 name = "svg_fmt"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
+version = "0.4.2"
+source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"

 [[package]]
 name = "syn"
@@ -7613,7 +7627,6 @@ dependencies = [
 "hyper 0.14.26",
 "indexmap 1.9.3",
 "itertools 0.10.5",
- "itertools 0.12.1",
 "lazy_static",
 "libc",
 "log",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,7 +64,7 @@ aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.70"
+bindgen = "0.65"
 bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
@@ -161,7 +161,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.3"
+# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
+svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
--- a/1
+++ b/1
@@ -87,7 +87,6 @@ RUN mkdir -p /data/.neon/ && \
       "pg_distrib_dir='/usr/local/'\n" \
       "listen_pg_addr='0.0.0.0:6400'\n" \
       "listen_http_addr='0.0.0.0:9898'\n" \
-       "availability_zone='local'\n" \
  > /data/.neon/pageserver.toml && \
  chown -R neon:neon /data/.neon

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.81.0
+ENV RUSTC_VERSION=1.80.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    export PATH="$HOME/.cargo/bin:$PATH" && \
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
-    rustup component add llvm-tools rustfmt clippy && \
+    rustup component add llvm-tools-preview rustfmt clippy && \
    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
--- a/README.md
+++ b/README.md
@@ -64,12 +64,6 @@ brew install protobuf openssl flex bison icu4c pkg-config
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```

-If you get errors about missing `m4` you may have to install it manually:
-```
-brew install m4
-brew link --force m4
-```
-
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
 # recommended approach from https://www.rust-lang.org/tools/install
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,10 +22,9 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Escape a string for including it in a SQL literal.
-///
-/// Wrapping the result with `E'{}'` or `'{}'` is not required,
-/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal. Wrapping the result
+/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
+/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -80,10 +80,7 @@ enum Command {
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
-    Tenants {
-        /// If this field is set, it will list the tenants on a specific node
-        node_id: Option<NodeId>,
-    },
+    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
@@ -339,7 +336,7 @@ async fn main() -> anyhow::Result<()> {
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
-                        availability_zone_id,
+                        availability_zone_id: Some(availability_zone_id),
                    }),
                )
                .await?;
@@ -406,41 +403,7 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::Tenants {
-            node_id: Some(node_id),
-        } => {
-            let describe_response = storcon_client
-                .dispatch::<(), NodeShardResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}/shards"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "Shard",
-                "Intended Primary/Secondary",
-                "Observed Primary/Secondary",
-            ]);
-            for shard in shards {
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    match shard.is_intended_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                    match shard.is_observed_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::Tenants { node_id: None } => {
+        Command::Tenants {} => {
            let mut resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -68,7 +68,6 @@ macro_rules! register_uint_gauge {
 static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
-///
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
 pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,9 +104,7 @@ pub struct ConfigToml {
    pub image_compression: ImageCompressionAlgorithm,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    #[serde(skip_serializing)]
-    // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
-    pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
+    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
    pub io_buffer_alignment: usize,
 }
@@ -211,6 +209,40 @@ pub enum GetImpl {
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);

+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum CompactL0Phase1ValueAccess {
+    /// The old way.
+    PageCachedBlobIo,
+    /// The new way.
+    StreamingKmerge {
+        /// If set, we run both the old way and the new way, validate that
+        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
+        /// and if the validation fails,
+        /// - in tests: fail them with a panic or
+        /// - in prod, log a rate-limited warning and use the old way's results.
+        ///
+        /// If not set, we only run the new way and trust its results.
+        validate: Option<CompactL0BypassPageCacheValidation>,
+    },
+}
+
+/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum CompactL0BypassPageCacheValidation {
+    /// Validate that the series of (key, lsn) pairs are the same.
+    KeyLsn,
+    /// Validate that the entire output of old and new way is identical.
+    KeyLsnValue,
+}
+
+impl Default for CompactL0Phase1ValueAccess {
+    fn default() -> Self {
+        CompactL0Phase1ValueAccess::StreamingKmerge { validate: None }
+    }
+}
+
 /// A tenant's calcuated configuration, which is the result of merging a
 /// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
 ///
@@ -417,7 +449,7 @@ impl Default for ConfigToml {
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
-            compact_level0_phase1_value_access: Default::default(),
+            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),

            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::time::{Duration, Instant};

@@ -57,7 +57,7 @@ pub struct NodeRegisterRequest {
    pub listen_http_addr: String,
    pub listen_http_port: u16,

-    pub availability_zone_id: String,
+    pub availability_zone_id: Option<String>,
 }

 #[derive(Serialize, Deserialize)]
@@ -74,17 +74,6 @@ pub struct TenantPolicyRequest {
    pub scheduling: Option<ShardSchedulingPolicy>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsRequest {
-    #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, String>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsResponse {
-    pub updated: Vec<TenantShardId>,
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -112,21 +101,6 @@ pub struct TenantDescribeResponse {
    pub config: TenantConfig,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShardResponse {
-    pub node_id: NodeId,
-    pub shards: Vec<NodeShard>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShard {
-    pub tenant_shard_id: TenantShardId,
-    /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_observed_secondary: Option<bool>,
-    /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_intended_secondary: Option<bool>,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
    pub id: NodeId,
@@ -158,12 +132,8 @@ pub struct TenantDescribeResponseShard {
    pub is_splitting: bool,

    pub scheduling_policy: ShardSchedulingPolicy,
-
-    pub preferred_az_id: Option<String>,
 }

-/// Migration request for a given tenant shard to a given node.
-///
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -305,10 +305,8 @@ pub struct TenantConfig {
    pub lsn_lease_length_for_ts: Option<String>,
 }

-/// The policy for the aux file storage.
-///
-/// It can be switched through `switch_aux_file_policy` tenant config.
-/// When the first aux file written, the policy will be persisted in the
+/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
+/// tenant config. When the first aux file written, the policy will be persisted in the
 /// `index_part.json` file and has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
@@ -898,9 +896,7 @@ pub struct WalRedoManagerStatus {
    pub process: Option<WalRedoManagerProcessStatus>,
 }

-/// The progress of a secondary tenant.
-///
-/// It is mostly useful when doing a long running download: e.g. initiating
+/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -69,10 +69,8 @@ impl QueryError {
 }

 /// Returns true if the given error is a normal consequence of a network issue,
-/// or the client closing the connection.
-///
-/// These errors can happen during normal operations,
-/// and don't indicate a bug in our code.
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -7,7 +7,6 @@ use std::fmt;
 use url::Host;

 /// Parses a string of format either `host:port` or `host` into a corresponding pair.
-///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
    fn include_file(&self, filename: &str) {
        // This does the equivalent of passing bindgen::CargoCallbacks
        // to the builder .parse_callbacks() method.
-        let cargo_callbacks = bindgen::CargoCallbacks::new();
+        let cargo_callbacks = bindgen::CargoCallbacks;
        cargo_callbacks.include_file(filename)
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -45,8 +45,6 @@ pub use azure_core::Etag;

 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};

-/// Default concurrency limit for S3 operations
-///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -302,9 +300,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<(), TimeTravelError>;
 }

-/// Data part of an ongoing [`Download`].
-///
-/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
+/// DownloadStream is sensitive to the timeout and cancellation used with the original
 /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
 /// with `tokio::io::copy_buf`.
 // This has 'static because safekeepers do not use cancellation tokens (yet)
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,16 +60,3 @@ pub struct TimelineCopyRequest {
    pub target_timeline_id: TimelineId,
    pub until_lsn: Lsn,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpRequest {
-    /// bump to
-    pub term: Option<u64>,
-}
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpResponse {
-    // before the request
-    pub previous_term: u64,
-    pub current_term: u64,
-}
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -5,10 +5,9 @@
 mod calculation;
 pub mod svg;

-/// StorageModel is the input to the synthetic size calculation.
-///
-/// It represents a tree of timelines, with just the information that's needed
-/// for the calculation. This doesn't track timeline names or where each timeline
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
 /// begins and ends, for example. Instead, it consists of "points of interest"
 /// on the timelines. A point of interest could be the timeline start or end point,
 /// the oldest point on a timeline that needs to be retained because of PITR
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -5,10 +5,8 @@ use std::{

 use metrics::IntCounter;

-/// Circuit breakers are for operations that are expensive and fallible.
-///
-/// If a circuit breaker fails repeatedly, we will stop attempting it for some
-/// period of time, to avoid denial-of-service from retries, and
+/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
+/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
 /// to mitigate the log spam from repeated failures.
 pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,4 +1,3 @@
-use std::os::fd::AsRawFd;
 use std::{
    borrow::Cow,
    fs::{self, File},
@@ -204,27 +203,6 @@ pub fn overwrite(
    Ok(())
 }

-/// Syncs the filesystem for the given file descriptor.
-#[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
-    // Linux guarantees durability for syncfs.
-    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-    #[cfg(target_os = "linux")]
-    {
-        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
-    }
-    #[cfg(target_os = "macos")]
-    {
-        // macOS is not a production platform for Neon, don't even bother.
-    }
-    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-    {
-        compile_error!("Unsupported OS");
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -249,10 +249,8 @@ macro_rules! id_newtype {
    };
 }

-/// Neon timeline ID.
-///
-/// They are different from PostgreSQL timeline
-/// IDs, but serve a similar purpose: they differentiate
+/// Neon timeline IDs are different from PostgreSQL timeline
+/// IDs. They serve a similar purpose though: they differentiate
 /// between different "histories" of the same cluster.  However,
 /// PostgreSQL timeline IDs are a bit cumbersome, because they are only
 /// 32-bits wide, and they must be in ascending order in any given
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -100,9 +100,7 @@ pub enum LockFileRead {
 }

 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
-/// inspect its content.
-///
-/// It is not an `Err(...)` if the file does not exist or is already locked.
+/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
 }

 /// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
    // following rust 1.66.1 std implementation:
    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
    let location = info.location();
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -8,7 +8,6 @@ use tracing::{trace, warn};
 use crate::lsn::Lsn;

 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
-///
 /// Serialized in custom flexible key/value format. In replication protocol, it
 /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
 /// Standby status update / Hot standby feedback messages.
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -65,8 +65,6 @@ impl<T> Poison<T> {
    }
 }

-/// Armed pointer to a [`Poison`].
-///
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -13,11 +13,10 @@ pub struct ShardNumber(pub u8);
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);

-/// Combination of ShardNumber and ShardCount.
-///
-/// For use within the context of a particular tenant, when we need to know which shard we're
-/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
-/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -49,11 +49,12 @@ use std::sync::{RwLock, RwLockWriteGuard};

 use tokio::sync::watch;

-/// Rcu allows multiple readers to read and hold onto a value without blocking
-/// (for very long).
 ///
-/// Storing to the Rcu updates the value, making new readers immediately see
-/// the new value, but it also waits for all current readers to finish.
+/// Rcu allows multiple readers to read and hold onto a value without blocking
+/// (for very long).  Storing to the Rcu updates the value, making new readers
+/// immediately see the new value, but it also waits for all current readers to
+/// finish.
+///
 pub struct Rcu<V> {
    inner: RwLock<RcuInner<V>>,
 }
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -5,9 +5,7 @@ use std::sync::{
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`.
-///
-/// Allows use of `take` which does not require holding an outer mutex guard
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
 /// for the duration of initialization.
 ///
 /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -7,7 +7,6 @@ pub enum VecMapOrdering {
 }

 /// Ordered map datastructure implemented in a Vec.
-///
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -6,10 +6,9 @@ pub enum YieldingLoopError {
    Cancelled,
 }

-/// Helper for long synchronous loops, e.g. over all tenants in the system.
-///
-/// Periodically yields to avoid blocking the executor, and after resuming
-/// checks the provided cancellation token to drop out promptly on shutdown.
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
 #[inline(always)]
 pub async fn yielding_loop<I, T, F>(
    interval: usize,
@@ -24,7 +23,7 @@ where
    for (i, item) in iter.enumerate() {
        visitor(item);

-        if (i + 1) % interval == 0 {
+        if (i + 1) % interval == 0 {
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err(YieldingLoopError::Cancelled);
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -4,6 +4,7 @@
 use std::{env, path::PathBuf, process::Command};

 use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;

 fn main() -> anyhow::Result<()> {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
@@ -63,25 +64,16 @@ fn main() -> anyhow::Result<()> {
            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
    };

-    let unwind_abi_functions = [
-        "log_internal",
-        "recovery_download",
-        "start_streaming",
-        "finish_sync_safekeepers",
-        "wait_event_set",
-        "WalProposerStart",
-    ];
-
    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
    // the resulting bindings.
-    let mut builder = bindgen::Builder::default()
+    let bindings = bindgen::Builder::default()
        // The input header we would like to generate
        // bindings for.
        .header("bindgen_deps.h")
        // Tell cargo to invalidate the built crate whenever any of the
        // included header files changed.
-        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
+        .parse_callbacks(Box::new(CargoCallbacks))
        .allowlist_type("WalProposer")
        .allowlist_type("WalProposerConfig")
        .allowlist_type("walproposer_api")
@@ -113,12 +105,7 @@ fn main() -> anyhow::Result<()> {
        .allowlist_var("WL_SOCKET_MASK")
        .clang_arg("-DWALPROPOSER_LIB")
        .clang_arg(format!("-I{pgxn_neon}"))
-        .clang_arg(format!("-I{inc_server_path}"));
-
-    for name in unwind_abi_functions {
-        builder = builder.override_abi(bindgen::Abi::CUnwind, name);
-    }
-    let bindings = builder
+        .clang_arg(format!("-I{inc_server_path}"))
        // Finish the builder and generate the bindings.
        .generate()
        // Unwrap the Result and panic on failure.
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
    }
 }

-extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
    }
 }

-extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
    }
 }

-extern "C-unwind" fn wait_event_set(
+extern "C" fn wait_event_set(
    wp: *mut WalProposer,
    timeout: ::std::os::raw::c_long,
    event_sk: *mut *mut Safekeeper,
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
    }
 }

-extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
    }
 }

-extern "C-unwind" fn log_internal(
+extern "C" fn log_internal(
    wp: *mut WalProposer,
    level: ::std::os::raw::c_int,
    line: *const ::std::os::raw::c_char,
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,20 +1,2 @@
 pub mod mgmt_api;
 pub mod page_service;
-
-/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
-// If file structure is per-kind not per-feature then where to put this?
-#[derive(Clone, Copy)]
-pub enum BlockUnblock {
-    Block,
-    Unblock,
-}
-
-impl std::fmt::Display for BlockUnblock {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let s = match self {
-            BlockUnblock::Block => "block",
-            BlockUnblock::Unblock => "unblock",
-        };
-        f.write_str(s)
-    }
-}
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,8 +12,6 @@ use utils::{

 pub use reqwest::Body as ReqwestBody;

-use crate::BlockUnblock;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -456,20 +454,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn timeline_block_unblock_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
-            self.mgmt_api_endpoint,
-        );
-
-        self.request(Method::POST, &uri, ()).await.map(|_| ())
-    }
-
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -142,16 +142,11 @@ impl PagestreamClient {
    ) -> anyhow::Result<PagestreamGetPageResponse> {
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
+        // let mut req = tokio_util::io::ReaderStream::new(&req);
+        let mut req = tokio_stream::once(Ok(req));

-        for i in 0..10 {
-            let mut req = tokio_stream::once(Ok(req.clone()));
-            self.copy_both.send_all(&mut req).await?;
-        }
+        self.copy_both.send_all(&mut req).await?;

-        for i in 0..9 {
-            let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
-            let next: bytes::Bytes = next.unwrap()?;
-        }
        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
        let next: bytes::Bytes = next.unwrap()?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -37,7 +37,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -126,6 +125,7 @@ fn main() -> anyhow::Result<()> {
    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
+    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    // The tenants directory contains all the pageserver local disk state.
@@ -156,7 +156,23 @@ fn main() -> anyhow::Result<()> {
        };

        let started = Instant::now();
-        syncfs(dirfd)?;
+        // Linux guarantees durability for syncfs.
+        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+        #[cfg(target_os = "linux")]
+        {
+            use std::os::fd::AsRawFd;
+            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
+        }
+        #[cfg(target_os = "macos")]
+        {
+            // macOS is not a production platform for Neon, don't even bother.
+            drop(dirfd);
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            compile_error!("Unsupported OS");
+        }
+
        let elapsed = started.elapsed();
        info!(
            elapsed_ms = elapsed.as_millis(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,14 +174,16 @@ pub struct PageServerConf {

    pub l0_flush: crate::l0_flush::L0FlushConfig,

+    /// This flag is temporary and will be removed after gradual rollout.
+    /// See <https://github.com/neondatabase/neon/issues/8184>.
+    pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
+
    /// Direct IO settings
    pub virtual_file_direct_io: virtual_file::DirectIoMode,

    pub io_buffer_alignment: usize,
 }

-/// Token for authentication to safekeepers
-///
 /// We do not want to store this in a PageServerConf because the latter may be logged
 /// and/or serialized at a whim, while the token is secret. Currently this token is the
 /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
@@ -336,7 +338,7 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access: _,
+            compact_level0_phase1_value_access,
            l0_flush,
            virtual_file_direct_io,
            concurrent_tenant_warmup,
@@ -381,6 +383,7 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
+            compact_level0_phase1_value_access,
            virtual_file_direct_io,
            io_buffer_alignment,

@@ -558,16 +561,6 @@ mod tests {
            .expect("parse_and_validate");
    }

-    #[test]
-    fn test_compactl0_phase1_access_mode_is_ignored_silently() {
-        let input = indoc::indoc! {r#"
-            [compact_level0_phase1_value_access]
-            mode = "streaming-kmerge"
-            validate = "key-lsn-value"
-        "#};
-        toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
-    }
-
    /// If there's a typo in the pageserver config, we'd rather catch that typo
    /// and fail pageserver startup than silently ignoring the typo, leaving whoever
    /// made it in the believe that their config change is effective.
@@ -644,5 +637,14 @@ mod tests {
        //         some_invalid_field = 23
        //     "#}
        // );
+
+        test!(
+            compact_level0_phase1_value_access,
+            indoc! {r#"
+                [compact_level0_phase1_value_access]
+                mode = "streaming-kmerge"
+                some_invalid_field = 23
+            "#}
+        );
    }
 }
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,9 +1,7 @@
-//! Defines [`RequestContext`].
-//!
-//! It is a structure that we use throughout the pageserver to propagate
-//! high-level context from places that _originate_ activity down to the
-//! shared code paths at the heart of the pageserver. It's inspired by
-//! Golang's `context.Context`.
+//! This module defines `RequestContext`, a structure that we use throughout
+//! the pageserver to propagate high-level context from places
+//! that _originate_ activity down to the shared code paths at the
+//! heart of the pageserver. It's inspired by Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,24 +141,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                        m.other
                    );

-                    let az_id = {
-                        let az_id_from_metadata = m
-                            .other
-                            .get("availability_zone_id")
-                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
-
-                        match az_id_from_metadata {
-                            Some(az_id) => Some(az_id),
-                            None => {
-                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
-                                conf.availability_zone.clone()
-                            }
-                        }
-                    };
-
-                    if az_id.is_none() {
-                        panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
-                    }
+                    let az_id = m
+                        .other
+                        .get("availability_zone_id")
+                        .and_then(|jv| jv.as_str().map(|str| str.to_owned()));

                    Some(NodeRegisterRequest {
                        node_id: conf.id,
@@ -166,7 +152,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
-                        availability_zone_id: az_id.expect("Checked above"),
+                        availability_zone_id: az_id,
                    })
                }
                Err(e) => {
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1185,7 +1185,6 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    ctx: &'c RequestContext,
    start: std::time::Instant,
    op: SmgrQueryType,
-    count: usize,
 }

 impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
@@ -1213,11 +1212,9 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        for _ in 0..self.count {
-            self.global_metric.observe(ex_throttled.as_secs_f64());
-            if let Some(timeline_metric) = self.timeline_metric {
-                timeline_metric.observe(ex_throttled.as_secs_f64());
-            }
+        self.global_metric.observe(ex_throttled.as_secs_f64());
+        if let Some(timeline_metric) = self.timeline_metric {
+            timeline_metric.observe(ex_throttled.as_secs_f64());
        }
    }
 }
@@ -1346,14 +1343,6 @@ impl SmgrQueryTimePerTimeline {
        &'a self,
        op: SmgrQueryType,
        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + '_> {
-        self.start_timer_many(op, 1, ctx)
-    }
-    pub(crate) fn start_timer_many<'c: 'a, 'a>(
-        &'a self,
-        op: SmgrQueryType,
-        count: usize,
-        ctx: &'c RequestContext,
    ) -> Option<impl Drop + '_> {
        let global_metric = &self.global_metrics[op as usize];
        let start = Instant::now();
@@ -1387,7 +1376,6 @@ impl SmgrQueryTimePerTimeline {
            ctx,
            start,
            op,
-            count,
        })
    }
 }
@@ -3182,16 +3170,6 @@ static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .unwrap()
 });

-pub(crate) static CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM: Lazy<Histogram> =
-    Lazy::new(|| {
-        register_histogram!(
-            "pageserver_consecutive_nonblocking_getpage_requests",
-            "Number of consecutive nonblocking getpage requests",
-            (0..=256).map(|x| x as f64).collect::<Vec<f64>>(),
-        )
-        .unwrap()
-    });
-
 pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
    let _guard = SERIALIZE.lock().unwrap();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -5,14 +5,14 @@ use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
-use once_cell::sync::{Lazy, OnceCell};
-use pageserver_api::models::{self, TenantState};
+use once_cell::sync::OnceCell;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
-    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
-    PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
-    PagestreamProtocolVersion,
+    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
+    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
+    PagestreamNblocksResponse, PagestreamProtocolVersion,
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
@@ -43,7 +43,7 @@ use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{self, CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM};
+use crate::metrics;
 use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -58,7 +58,7 @@ use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use pageserver_api::key::rel_block_to_key;
-use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -577,317 +577,124 @@ impl PageServerHandler {
            }
        }

-        let mut batched = None;
-        'outer: loop {
-            enum DebouncedFeMessage {
-                Exists(models::PagestreamExistsRequest),
-                Nblocks(models::PagestreamNblocksRequest),
-                GetPage {
-                    span: Span,
-                    shard: timeline::handle::Handle<TenantManagerTypes>,
-                    effective_request_lsn: Lsn,
-                    pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
-                },
-                DbSize(models::PagestreamDbSizeRequest),
-                GetSlruSegment(models::PagestreamGetSlruSegmentRequest),
-                RespondError(Span, PageStreamError),
-            }
-            let mut debounce: Option<std::time::Instant> = None;
-            // return or `?` on protocol error
-            // `break EXPR` to stop batching. The EXPR will be the first message in the next batch.
-            let next_batched: Option<DebouncedFeMessage> = loop {
-                static BOUNCE_TIMEOUT: Lazy<Duration> = Lazy::new(|| {
-                    utils::env::var::<humantime::Duration, _>("NEON_PAGESERVER_DEBOUNCE")
-                        .unwrap()
-                        .into()
-                });
-                let sleep_fut = if let Some(started_at) = debounce {
-                    futures::future::Either::Left(tokio::time::sleep_until(
-                        (started_at + *BOUNCE_TIMEOUT).into(),
-                    ))
-                } else {
-                    futures::future::Either::Right(futures::future::pending())
-                };
-                let msg = tokio::select! {
-                    biased;
-                    _ = self.cancel.cancelled() => {
-                        return Err(QueryError::Shutdown)
-                    }
-                    msg = pgb.read_message() => {
-                        msg
-                    }
-                    _ = sleep_fut => {
-                        assert!(batched.is_some());
-                        break None;
-                    }
-                };
-                let copy_data_bytes = match msg? {
-                    Some(FeMessage::CopyData(bytes)) => bytes,
-                    Some(FeMessage::Terminate) => break 'outer,
-                    Some(m) => {
-                        return Err(QueryError::Other(anyhow::anyhow!(
-                            "unexpected message: {m:?} during COPY"
-                        )));
-                    }
-                    None => break 'outer, // client disconnected
-                };
-                trace!("query: {copy_data_bytes:?}");
-                fail::fail_point!("ps::handle-pagerequest-message");
-
-                // parse request
-                let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
-
-                let this_msg = match neon_fe_msg {
-                    PagestreamFeMessage::Exists(msg) => DebouncedFeMessage::Exists(msg),
-                    PagestreamFeMessage::Nblocks(msg) => DebouncedFeMessage::Nblocks(msg),
-                    PagestreamFeMessage::DbSize(msg) => DebouncedFeMessage::DbSize(msg),
-                    PagestreamFeMessage::GetSlruSegment(msg) => {
-                        DebouncedFeMessage::GetSlruSegment(msg)
-                    }
-                    PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                        request_lsn,
-                        not_modified_since,
-                        rel,
-                        blkno,
-                    }) => {
-                        let span = tracing::info_span!("handle_get_page_at_lsn_request_batched", %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, batch_size = tracing::field::Empty);
-                        let key = rel_block_to_key(rel, blkno);
-                        let shard = match self
-                            .timeline_handles
-                            .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                            .instrument(span.clone())
-                            .await
-                        {
-                            Ok(tl) => tl,
-                            Err(GetActiveTimelineError::Tenant(
-                                GetActiveTenantError::NotFound(_),
-                            )) => {
-                                // We already know this tenant exists in general, because we resolved it at
-                                // start of connection.  Getting a NotFound here indicates that the shard containing
-                                // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                                // mapping is out of date.
-                                //
-                                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                                // and talk to a different pageserver.
-                                break Some(DebouncedFeMessage::RespondError(
-                                    span,
-                                    PageStreamError::Reconnect(
-                                        "getpage@lsn request routed to wrong shard".into(),
-                                    ),
-                                ));
-                            }
-                            Err(e) => break Some(DebouncedFeMessage::RespondError(span, e.into())),
-                        };
-                        let effective_request_lsn = match Self::wait_or_get_last_lsn(
-                            &shard,
-                            request_lsn,
-                            not_modified_since,
-                            &shard.get_latest_gc_cutoff_lsn(),
-                            &ctx,
-                        )
-                        // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                        .await
-                        {
-                            Ok(lsn) => lsn,
-                            Err(e) => {
-                                break Some(DebouncedFeMessage::RespondError(span, e));
-                            }
-                        };
-                        DebouncedFeMessage::GetPage {
-                            span,
-                            shard,
-                            effective_request_lsn,
-                            pages: smallvec::smallvec![(rel, blkno)],
-                        }
-                    }
-                };
-
-                // check if we can debounce
-                match (&mut batched, this_msg) {
-                    (None, this_msg) => {
-                        batched = Some(this_msg);
-                    }
-                    (
-                        Some(DebouncedFeMessage::GetPage {
-                            span: _,
-                            shard: accum_shard,
-                            pages: accum_pages,
-                            effective_request_lsn: accum_lsn,
-                        }),
-                        DebouncedFeMessage::GetPage {
-                            span: _,
-                            shard: this_shard,
-                            pages: this_pages,
-                            effective_request_lsn: this_lsn,
-                        },
-                    ) if async {
-                        assert_eq!(this_pages.len(), 1);
-                        if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
-                            assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
-                            return false;
-                        }
-                        if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
-                            != (this_shard.tenant_shard_id, this_shard.timeline_id)
-                        {
-                            // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                            // But the current logig for keeping responses in order does not support that.
-                            return false;
-                        }
-                        // the vectored get currently only supports a single LSN, so, bounce as soon
-                        // as the effective request_lsn changes
-                        return *accum_lsn == this_lsn;
-                    }
-                    .await =>
-                    {
-                        // ok to batch
-                        accum_pages.extend(this_pages);
-                    }
-                    (Some(_), this_msg) => {
-                        // by default, don't continue batching
-                        break Some(this_msg);
-                    }
+        loop {
+            // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData)
+            let msg = tokio::select! {
+                biased;
+                _ = self.cancel.cancelled() => {
+                    return Err(QueryError::Shutdown)
                }
-
-                // debounce impl piece
-                let started_at = debounce.get_or_insert_with(Instant::now);
-                if started_at.elapsed() > *BOUNCE_TIMEOUT {
-                    break None;
+                msg = pgb.read_message() => { msg }
+            };
+            let copy_data_bytes = match msg? {
+                Some(FeMessage::CopyData(bytes)) => bytes,
+                Some(FeMessage::Terminate) => break,
+                Some(m) => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
                }
+                None => break, // client disconnected
            };

+            trace!("query: {copy_data_bytes:?}");
+            fail::fail_point!("ps::handle-pagerequest-message");
+
+            // parse request
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+
            // invoke handler function
-            let (handler_results, span): (
-                smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]>,
-                _,
-            ) = match batched.take().expect("loop above ensures this") {
-                DebouncedFeMessage::Exists(req) => {
+            let (handler_result, span) = match neon_fe_msg {
+                PagestreamFeMessage::Exists(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await
-                        ],
+                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::Nblocks(req) => {
+                PagestreamFeMessage::Nblocks(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
+                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::GetPage {
-                    span,
-                    shard,
-                    effective_request_lsn,
-                    pages,
-                } => {
-                    CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM.observe(pages.len() as f64);
-                    span.record("batch_size", pages.len() as u64);
+                PagestreamFeMessage::GetPage(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
-                        {
-                            let npages = pages.len();
-                            let res = self
-                                .handle_get_page_at_lsn_request_batched(
-                                    &shard,
-                                    effective_request_lsn,
-                                    pages,
-                                    &ctx,
-                                )
-                                .instrument(span.clone())
-                                .await;
-                            assert_eq!(res.len(), npages);
-                            res
-                        },
+                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::DbSize(req) => {
+                PagestreamFeMessage::DbSize(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await
-                        ],
+                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::GetSlruSegment(req) => {
+                PagestreamFeMessage::GetSlruSegment(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_slru_segment_request(
-                                tenant_id,
-                                timeline_id,
-                                &req,
-                                &ctx
-                            )
+                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
-                            .await
-                        ],
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::RespondError(span, e) => {
-                    // We've already decided to respond with an error, so we don't need to
-                    // call the handler.
-                    (smallvec::smallvec![Err(e)], span)
-                }
            };

            // Map handler result to protocol behavior.
            // Some handler errors cause exit from pagestream protocol.
            // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-            for handler_result in handler_results {
-                let response_msg = match handler_result {
-                    Err(e) => match &e {
-                        PageStreamError::Shutdown => {
-                            // If we fail to fulfil a request during shutdown, which may be _because_ of
-                            // shutdown, then do not send the error to the client.  Instead just drop the
-                            // connection.
-                            span.in_scope(|| info!("dropping connection due to shutdown"));
-                            return Err(QueryError::Shutdown);
-                        }
-                        PageStreamError::Reconnect(reason) => {
-                            span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                            return Err(QueryError::Reconnect);
-                        }
-                        PageStreamError::Read(_)
-                        | PageStreamError::LsnTimeout(_)
-                        | PageStreamError::NotFound(_)
-                        | PageStreamError::BadRequest(_) => {
-                            // print the all details to the log with {:#}, but for the client the
-                            // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                            // here includes cancellation which is not an error.
-                            let full = utils::error::report_compact_sources(&e);
-                            span.in_scope(|| {
-                                error!("error reading relation or page version: {full:#}")
-                            });
-                            PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                message: e.to_string(),
-                            })
-                        }
-                    },
-                    Ok(response_msg) => response_msg,
-                };
+            let response_msg = match handler_result {
+                Err(e) => match &e {
+                    PageStreamError::Shutdown => {
+                        // If we fail to fulfil a request during shutdown, which may be _because_ of
+                        // shutdown, then do not send the error to the client.  Instead just drop the
+                        // connection.
+                        span.in_scope(|| info!("dropping connection due to shutdown"));
+                        return Err(QueryError::Shutdown);
+                    }
+                    PageStreamError::Reconnect(reason) => {
+                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                        return Err(QueryError::Reconnect);
+                    }
+                    PageStreamError::Read(_)
+                    | PageStreamError::LsnTimeout(_)
+                    | PageStreamError::NotFound(_)
+                    | PageStreamError::BadRequest(_) => {
+                        // print all the details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    }
+                },
+                Ok(response_msg) => response_msg,
+            };

-                // marshal & transmit response message
-                pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-            }
+            // marshal & transmit response message
+            pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
            tokio::select! {
                biased;
                _ = self.cancel.cancelled() => {
@@ -899,9 +706,6 @@ impl PageServerHandler {
                    res?;
                }
            }
-
-            assert!(batched.is_none(), "we take() earlier");
-            batched = next_batched;
        }
        Ok(())
    }
@@ -1145,30 +949,60 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all)]
-    async fn handle_get_page_at_lsn_request_batched(
+    #[instrument(skip_all, fields(shard_id))]
+    async fn handle_get_page_at_lsn_request(
        &mut self,
-        timeline: &Timeline,
-        effective_lsn: Lsn,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let _timer = timeline.query_metrics.start_timer_many(
-            metrics::SmgrQueryType::GetPageAtLsn,
-            pages.len(),
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = match self
+            .timeline_handles
+            .get(
+                tenant_id,
+                timeline_id,
+                ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)),
+            )
+            .await
+        {
+            Ok(tl) => tl,
+            Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                // We already know this tenant exists in general, because we resolved it at
+                // start of connection.  Getting a NotFound here indicates that the shard containing
+                // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                // mapping is out of date.
+                //
+                // Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting above message, via
+                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                // and talk to a different pageserver.
+                return Err(PageStreamError::Reconnect(
+                    "getpage@lsn request routed to wrong shard".into(),
+                ));
+            }
+            Err(e) => return Err(e.into()),
+        };
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
+
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(
+            &timeline,
+            req.request_lsn,
+            req.not_modified_since,
+            &latest_gc_cutoff_lsn,
            ctx,
-        );
+        )
+        .await?;

-        let pages = timeline
-            .get_rel_page_at_lsn_batched(pages, Version::Lsn(effective_lsn), ctx)
-            .await;
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
+            .await?;

-        smallvec::SmallVec::from_iter(pages.into_iter().map(|page| {
-            page.map(|page| {
-                PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
-            })
-            .map_err(PageStreamError::Read)
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
        }))
    }

@@ -1665,10 +1499,3 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
 }
-
-struct WaitedForLsn(Lsn);
-impl From<WaitedForLsn> for Lsn {
-    fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
-        lsn
-    }
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,17 +9,12 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::span::{
-    debug_assert_current_span_has_tenant_and_timeline_id,
-    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
-};
-use crate::tenant::timeline::GetVectoredError;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -33,7 +28,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
+use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
@@ -196,184 +191,26 @@ impl Timeline {
        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
-        let pages = smallvec::smallvec![(tag, blknum)];
-        let res = self.get_rel_page_at_lsn_batched(pages, version, ctx).await;
-        assert_eq!(res.len(), 1);
-        res.into_iter().next().unwrap()
-    }
-
-    /// Like [`get_rel_page_at_lsn`], but returns a batch of pages.
-    pub(crate) async fn get_rel_page_at_lsn_batched(
-        &self,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
-        version: Version<'_>,
-        ctx: &RequestContext,
-    ) -> smallvec::SmallVec<[Result<Bytes, PageReconstructError>; 1]> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let request_lsn = match version {
-            Version::Lsn(lsn) => lsn,
-            Version::Modified(_) => panic!("unsupported"),
-        };
-        enum KeyState {
-            NeedsVectoredGet,
-            Done(Result<Bytes, PageReconstructError>),
+        if tag.relnode == 0 {
+            return Err(PageReconstructError::Other(
+                RelationError::InvalidRelnode.into(),
+            ));
        }
-        let mut key_states = BTreeMap::new();
-        let mut vectored_gets: smallvec::SmallVec<[_; 1]> =
-            smallvec::SmallVec::with_capacity(pages.len());
-        for (response_order, (tag, blknum)) in pages.into_iter().enumerate() {
-            let key = rel_block_to_key(tag, blknum);
-            use std::collections::btree_map::Entry;
-            let key_state_slot = match key_states.entry((key, response_order)) {
-                Entry::Occupied(_entry) => unreachable!(
-                    "enumerate makes keys unique, even if batch contains same key twice"
-                ),
-                Entry::Vacant(entry) => entry,
-            };

-            if tag.relnode == 0 {
-                key_state_slot.insert(KeyState::Done(Err(PageReconstructError::Other(
-                    RelationError::InvalidRelnode.into(),
-                ))));
-                continue;
-            }
-
-            let nblocks = match self.get_rel_size(tag, version, ctx).await {
-                Ok(nblocks) => nblocks,
-                Err(err) => {
-                    key_state_slot.insert(KeyState::Done(Err(err)));
-                    continue;
-                }
-            };
-            if blknum >= nblocks {
-                debug!(
-                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag,
-                    blknum,
-                    version.get_lsn(),
-                    nblocks
-                );
-                key_state_slot.insert(KeyState::Done(Ok(ZERO_PAGE.clone())));
-                continue;
-            }
-
-            vectored_gets.push(key);
-            key_state_slot.insert(KeyState::NeedsVectoredGet);
+        let nblocks = self.get_rel_size(tag, version, ctx).await?;
+        if blknum >= nblocks {
+            debug!(
+                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
+                tag,
+                blknum,
+                version.get_lsn(),
+                nblocks
+            );
+            return Ok(ZERO_PAGE.clone());
        }
-        // turn vectored_gets into a keyspace
-        let keyspace = {
-            // add_key reuqires monotonicity
-            vectored_gets.sort_unstable();
-            let mut acc = KeySpaceAccum::new();
-            for key in vectored_gets
-                .into_iter()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(key);
-            }
-            acc.to_keyspace()
-        };

-        match self.get_vectored(keyspace, request_lsn, ctx).await {
-            Ok(results) => {
-                for (key, res) in results {
-                    if let Err(err) = &res {
-                        warn!(%key, ?err, "a key inside get_vectored failed with a per-key error");
-                    }
-                    let mut interests = key_states.range_mut((key, 0)..(key.next(), 0)).peekable();
-                    let first_interest = interests.next().unwrap();
-                    let next_interest = interests.peek().is_some();
-                    if !next_interest {
-                        match first_interest.1 {
-                            KeyState::NeedsVectoredGet => {
-                                *first_interest.1 = KeyState::Done(res);
-                            }
-                            KeyState::Done(_) => unreachable!(),
-                        }
-                        continue;
-                    } else {
-                        for ((_, _), state) in [first_interest].into_iter().chain(interests) {
-                            match state {
-                                KeyState::NeedsVectoredGet => {
-                                    *state = KeyState::Done(match &res {
-                                        Ok(buf) => Ok(buf.clone()),
-                                        // this `match` is working around the fact that we cannot Clone the PageReconstructError
-                                        Err(err) => Err(match err {
-                                            PageReconstructError::Cancelled => {
-                                                PageReconstructError::Cancelled
-                                            }
-
-                                            x @ PageReconstructError::Other(_) |
-                                            x @ PageReconstructError::AncestorLsnTimeout(_) |
-                                            x @ PageReconstructError::WalRedo(_) |
-                                            x @ PageReconstructError::MissingKey(_) => {
-                                                PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}"))
-                                            },
-                                        }),
-                                    });
-                                }
-                                KeyState::Done(_) => unreachable!(),
-                            }
-                        }
-                    }
-                }
-            }
-            Err(err) => {
-                warn!(?err, "get_vectored failed with a global error, mapping that error to per-key failure");
-                // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
-                for ((_, _), state) in key_states.iter_mut() {
-                    // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
-                    // but without taking ownership of the GetVectoredError
-                    match &err {
-                        GetVectoredError::Cancelled => {
-                            *state = KeyState::Done(Err(PageReconstructError::Cancelled));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::MissingKey(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::GetReadyAncestorError(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::Other(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(
-                                anyhow::anyhow!("whole vectored get request failed: {err:?}"),
-                            )));
-                        }
-                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::InvalidLsn(e) => {
-                            *state =
-                                KeyState::Done(Err(anyhow::anyhow!("invalid LSN: {e:?}").into()));
-                        }
-                        // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
-                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::Oversized(err) => {
-                            *state = KeyState::Done(Err(anyhow::anyhow!(
-                                "batching oversized: {err:?}"
-                            )
-                            .into()));
-                        }
-                    }
-                }
-            }
-        };
-
-        // get the results into the order in which they were requested
-        let mut return_order: smallvec::SmallVec<[_; Timeline::MAX_GET_VECTORED_KEYS as usize]> =
-            smallvec::SmallVec::with_capacity(key_states.len());
-        return_order.extend(key_states.keys().map(|(key, idx)| (*key, *idx)));
-        return_order.sort_unstable_by_key(|(_, idx)| *idx);
-        let mut res = smallvec::SmallVec::with_capacity(key_states.len());
-        res.extend(return_order.into_iter().map(|key_states_key| {
-            match key_states.remove(&key_states_key).unwrap() {
-                KeyState::Done(res) => res,
-                KeyState::NeedsVectoredGet => unreachable!(),
-            }
-        }));
-        res
+        let key = rel_block_to_key(tag, blknum);
+        version.get(self, key, ctx).await
    }

    // Get size of a database in blocks
@@ -1184,10 +1021,9 @@ impl Timeline {
 }

 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository.
-///
-/// It is created by the 'begin_record' function. It is called for each WAL
-/// record, so that all the modifications by a one WAL record appear atomic.
+/// updates to the repository. It is created by the 'begin_record'
+/// function. It is called for each WAL record, so that all the modifications
+/// by one WAL record appear atomic.
 pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
@@ -2212,7 +2048,6 @@ impl<'a> DatadirModification<'a> {

 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
-///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -73,21 +73,6 @@ impl ValueBytes {

        Ok(raw[8] == 1)
    }
-
-    pub(crate) fn is_image(raw: &[u8]) -> Result<bool, InvalidInput> {
-        if raw.len() < 12 {
-            return Err(InvalidInput::TooShortValue);
-        }
-
-        let value_discriminator = &raw[0..4];
-
-        if value_discriminator == [0, 0, 0, 0] {
-            // Value::Image always initializes
-            return Ok(true);
-        }
-
-        Ok(false)
-    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,9 +1,8 @@
-//! Timeline repository implementation that keeps old data in layer files, and
-//! the recent changes in ephemeral files.
 //!
-//! See tenant/*_layer.rs files. The functions here are responsible for locating
-//! the correct layer for the get/put call, walking back the timeline branching
-//! history as needed.
+//! Timeline repository implementation that keeps old data in files on disk, and
+//! the recent changes in memory. See tenant/*_layer.rs files.
+//! The functions here are responsible for locating the correct layer for the
+//! get/put call, walking back the timeline branching history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,8 +1,7 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata.
-//!
-//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
-//! their timelines, this struct and its original serialization format is still needed because
-//! they were written a long time ago.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
+//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
+//! this struct and its original serialization format is still needed because they were written a
+//! long time ago.
 //!
 //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
 //! versioning.
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -282,10 +282,9 @@ impl BackgroundPurges {
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

-/// Responsible for storing and mutating the collection of all tenants
-/// that this pageserver has state for.
-///
-/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
+/// The TenantManager is responsible for storing and mutating the collection of all tenants
+/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
+/// lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -2347,9 +2346,8 @@ pub enum TenantMapError {
    ShuttingDown,
 }

-/// Guards a particular tenant_id's content in the TenantsMap.
-///
-/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.  While this
+/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2184,8 +2184,6 @@ pub fn remote_timeline_path(
    remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }

-/// Obtains the path of the given Layer in the remote
-///
 /// Note that the shard component of a remote layer path is _not_ always the same
 /// as in the TenantShardId of the caller: tenants may reference layers from a different
 /// ShardIndex.  Use the ShardIndex from the layer's metadata.
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst(
        cancel,
    )
    .await
-    .inspect_err(|_e| {
+    .map_err(|e| {
        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
        // We don't have async here nor do we want to pile on any extra errors.
        if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -556,6 +556,7 @@ pub(crate) async fn download_initdb_tar_zst(
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }
+        e
    })?;

    Ok((temp_path, file))
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -1,5 +1,4 @@
 //! In-memory index to track the tenant files on the remote storage.
-//!
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,17 +8,15 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-use tokio::sync::{self};
-use utils::bin_ser::BeSer;
 pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::{Value, ValueBytes};
+use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
-use pageserver_api::key::{Key, DBDIR_KEY};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
-use std::cmp::Ordering;
+use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
@@ -81,18 +79,12 @@ pub(crate) enum ValueReconstructSituation {
 }

 /// Reconstruct data accumulated for a single key during a vectored get
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct VectoredValueReconstructState {
-    pub(crate) records: Vec<(
-        Lsn,
-        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
-    )>,
-    pub(crate) img: Option<(
-        Lsn,
-        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
-    )>,
+    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
+    pub(crate) img: Option<(Lsn, Bytes)>,

-    pub(crate) situation: ValueReconstructSituation,
+    situation: ValueReconstructSituation,
 }

 impl VectoredValueReconstructState {
@@ -101,57 +93,16 @@ impl VectoredValueReconstructState {
    }
 }

-pub(crate) async fn convert(
-    _key: Key,
-    from: VectoredValueReconstructState,
-) -> Result<ValueReconstructState, PageReconstructError> {
-    let mut to = ValueReconstructState::default();
+impl From<VectoredValueReconstructState> for ValueReconstructState {
+    fn from(mut state: VectoredValueReconstructState) -> Self {
+        // walredo expects the records to be descending in terms of Lsn
+        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));

-    for (lsn, fut) in from.records {
-        match fut.await {
-            Ok(res) => match res {
-                Ok(bytes) => {
-                    let value = Value::des(&bytes)
-                        .map_err(|err| PageReconstructError::Other(err.into()))?;
-
-                    match value {
-                        Value::WalRecord(rec) => {
-                            to.records.push((lsn, rec));
-                        },
-                        Value::Image(img) => {
-                            assert!(to.img.is_none());
-                            to.img = Some((lsn, img));
-                        }
-                    }
-                }
-                Err(err) => {
-                    return Err(PageReconstructError::Other(err.into()));
-                }
-            },
-            Err(err) => {
-                return Err(PageReconstructError::Other(err.into()));
-            }
+        ValueReconstructState {
+            records: state.records,
+            img: state.img,
        }
    }
-
-    if to.img.is_none() {
-        let (lsn, fut) = from.img.expect("Need an image");
-        match fut.await {
-            Ok(res) => match res {
-                Ok(bytes) => {
-                    to.img = Some((lsn, bytes));
-                }
-                Err(err) => {
-                    return Err(PageReconstructError::Other(err.into()));
-                }
-            },
-            Err(err) => {
-                return Err(PageReconstructError::Other(err.into()));
-            }
-        }
-    }
-
-    Ok(to)
 }

 /// Bag of data accumulated during a vectored get..
@@ -249,8 +200,7 @@ impl ValuesReconstructState {
        &mut self,
        key: &Key,
        lsn: Lsn,
-        completes: bool,
-        value: sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
+        value: Value,
    ) -> ValueReconstructSituation {
        let state = self
            .keys
@@ -258,14 +208,31 @@ impl ValuesReconstructState {
            .or_insert(Ok(VectoredValueReconstructState::default()));

        if let Ok(state) = state {
-            match state.situation {
+            let key_done = match state.situation {
                ValueReconstructSituation::Complete => unreachable!(),
-                ValueReconstructSituation::Continue => {
-                    state.records.push((lsn, value));
-                }
-            }
+                ValueReconstructSituation::Continue => match value {
+                    Value::Image(img) => {
+                        state.img = Some((lsn, img));
+                        true
+                    }
+                    Value::WalRecord(rec) => {
+                        debug_assert!(
+                            Some(lsn) > state.get_cached_lsn(),
+                            "Attempt to collect a record below cached LSN for walredo: {} < {}",
+                            lsn,
+                            state
+                                .get_cached_lsn()
+                                .expect("Assertion can only fire if a cached lsn is present")
+                        );

-            if completes && state.situation == ValueReconstructSituation::Continue {
+                        let will_init = rec.will_init();
+                        state.records.push((lsn, rec));
+                        will_init
+                    }
+                },
+            };
+
+            if key_done && state.situation == ValueReconstructSituation::Continue {
                state.situation = ValueReconstructSituation::Complete;
                self.keys_done.add_key(*key);
            }
@@ -467,11 +434,10 @@ impl ReadableLayer {
    }
 }

-/// Layers contain a hint indicating whether they are likely to be used for reads.
-///
-/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
-/// when changing the visibility of layers (for example when creating a branch that makes some previously
-/// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
+/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
+/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
+/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
+/// be used for cache management but not for correctness-critical checks.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -42,12 +42,13 @@ use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadCoalesceMode, VectoredReadPlanner,
 };
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{bail, ensure, Context, Result};
-use bytes::{Bytes, BytesMut};
+use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -57,14 +58,14 @@ use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, VecDeque};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::sync::{self, OnceCell};
+use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
 use tracing::*;

@@ -135,11 +136,10 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

-/// Struct representing reference to BLOB in layers.
-///
-/// Reference contains BLOB offset, and for WAL records it also contains
-/// `will_init` flag. The flag helps to determine the range of records
-/// that needs to be applied, without reading/deserializing records themselves.
+/// Struct representing a reference to a BLOB in layers. The reference contains
+/// the BLOB offset, and for WAL records it also contains the `will_init` flag.
+/// The flag helps to determine the range of records that need to be applied,
+/// without reading/deserializing the records themselves.
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -223,7 +223,7 @@ pub struct DeltaLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    file: Arc<VirtualFile>,
+    file: VirtualFile,
    file_id: FileId,

    layer_key_range: Range<Key>,
@@ -787,11 +787,9 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = Arc::new(
-            VirtualFile::open(path, ctx)
-                .await
-                .context("open layer file")?,
-        );
+        let file = VirtualFile::open(path, ctx)
+            .await
+            .context("open layer file")?;

        let file_id = page_cache::next_file_id();

@@ -981,59 +979,77 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) {
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut ignore_key_with_err = None;
+
        let max_vectored_read_bytes = self
            .max_vectored_read_bytes
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
+        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
        // track when a key is done.
        for read in reads.into_iter().rev() {
-            let mut senders: HashMap<
-                (Key, Lsn),
-                sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
-            > = Default::default();
-            for (_, blob_meta) in read.blobs_at.as_slice() {
-                let (tx, rx) = sync::oneshot::channel();
-                senders.insert((blob_meta.key, blob_meta.lsn), tx);
-                reconstruct_state.update_key(
-                    &blob_meta.key,
-                    blob_meta.lsn,
-                    blob_meta.will_init,
-                    rx,
-                );
+            let res = vectored_blob_reader
+                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
+                .await;
+
+            let blobs_buf = match res {
+                Ok(blobs_buf) => blobs_buf,
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::Other(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+
+                    // We have "lost" the buffer since the lower level IO api
+                    // doesn't return the buffer on error. Allocate a new one.
+                    buf = Some(BytesMut::with_capacity(buf_size));
+
+                    continue;
+                }
+            };
+
+            for meta in blobs_buf.blobs.iter().rev() {
+                if Some(meta.meta.key) == ignore_key_with_err {
+                    continue;
+                }
+
+                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
+                let value = match value {
+                    Ok(v) => v,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::Other(anyhow!(e).context(format!(
+                                "Failed to deserialize blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
+                // state, no further updates shall be made to it. The call below will
+                // panic if the invariant is violated.
+                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
            }

-            let read_from = self.file.clone();
-            let read_ctx = ctx.attached_child();
-            tokio::task::spawn(async move {
-                let vectored_blob_reader = VectoredBlobReader::new(&read_from);
-                let buf = BytesMut::with_capacity(buf_size);
-
-                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
-                match res {
-                    Ok(blobs_buf) => {
-                        for meta in blobs_buf.blobs.iter().rev() {
-                            let buf = &blobs_buf.buf[meta.start..meta.end];
-                            let sender = senders
-                                .remove(&(meta.meta.key, meta.meta.lsn))
-                                .expect("sender must exist");
-                            let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
-                        }
-
-                        assert!(senders.is_empty());
-                    }
-                    Err(err) => {
-                        for (_, sender) in senders {
-                            let _ = sender
-                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
-                        }
-                    }
-                }
-            });
+            buf = Some(blobs_buf.buf);
        }
    }

@@ -1173,14 +1189,7 @@ impl DeltaLayerInner {
            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
                let end_offset = offset;

-                Some((
-                    BlobMeta {
-                        key,
-                        lsn,
-                        will_init: false,
-                    },
-                    start_offset..end_offset,
-                ))
+                Some((BlobMeta { key, lsn }, start_offset..end_offset))
            } else {
                None
            };
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1,9 +1,7 @@
 //! An ImageLayer represents an image or a snapshot of a key-range at
-//! one particular LSN.
-//!
-//! It contains an image of all key-value pairs in its key-range. Any key
-//! that falls into the image layer's range but does not exist in the layer,
-//! does not exist.
+//! one particular LSN. It contains an image of all key-value pairs
+//! in its key-range. Any key that falls into the image layer's range
+//! but does not exist in the layer, does not exist.
 //!
 //! An image layer is stored in a file on disk. The file is stored in
 //! timelines/<timeline_id> directory.  Currently, there are no
@@ -21,7 +19,7 @@
 //!
 //! Every image layer file consists of three parts: "summary",
 //! "index", and "values".  The summary is a fixed size header at the
-//! beginningof the file, and it contains basic information about the
+//! beginning of the file, and it contains basic information about the
 //! layer, and offsets to the other parts. The "index" is a B-tree,
 //! mapping from Key to an offset in the "values" part.  The
 //! actual page images are stored in the "values" part.
@@ -38,11 +36,11 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::Timeline;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
@@ -52,14 +50,13 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, VecDeque};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::sync::oneshot;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -164,7 +161,7 @@ pub struct ImageLayerInner {
    key_range: Range<Key>,
    lsn: Lsn,

-    file: Arc<VirtualFile>,
+    file: VirtualFile,
    file_id: FileId,

    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -391,11 +388,9 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = Arc::new(
-            VirtualFile::open(path, ctx)
-                .await
-                .context("open layer file")?,
-        );
+        let file = VirtualFile::open(path, ctx)
+            .await
+            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
        let summary_blk = block_reader
@@ -582,16 +577,8 @@ impl ImageLayerInner {
            .0
            .into();

+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
-            let mut senders: HashMap<(Key, Lsn), oneshot::Sender<Result<Bytes, std::io::Error>>> =
-                Default::default();
-            for (_, blob_meta) in read.blobs_at.as_slice() {
-                let (tx, rx) = oneshot::channel();
-                senders.insert((blob_meta.key, blob_meta.lsn), tx);
-
-                reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true, rx);
-            }
-
            let buf_size = read.size();

            if buf_size > max_vectored_read_bytes {
@@ -610,33 +597,36 @@ impl ImageLayerInner {
                );
            }

-            let read_from = self.file.clone();
-            let read_ctx = ctx.attached_child();
-            tokio::task::spawn(async move {
-                let buf = BytesMut::with_capacity(buf_size);
-                let vectored_blob_reader = VectoredBlobReader::new(&*read_from);
-                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
+            let buf = BytesMut::with_capacity(buf_size);
+            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

-                match res {
-                    Ok(blobs_buf) => {
-                        for meta in blobs_buf.blobs.iter().rev() {
-                            let buf = &blobs_buf.buf[meta.start..meta.end];
-                            let sender = senders
-                                .remove(&(meta.meta.key, meta.meta.lsn))
-                                .expect("sender must exist");
-                            let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
-                        }
+            match res {
+                Ok(blobs_buf) => {
+                    let frozen_buf = blobs_buf.buf.freeze();

-                        assert!(senders.is_empty());
-                    }
-                    Err(err) => {
-                        for (_, sender) in senders {
-                            let _ = sender
-                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
-                        }
+                    for meta in blobs_buf.blobs.iter() {
+                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        reconstruct_state.update_key(
+                            &meta.meta.key,
+                            self.lsn,
+                            Value::Image(img_buf),
+                        );
                    }
                }
-            });
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+                }
+            };
        }
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,9 +10,10 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -34,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::RwLock;

-use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+};

 pub(crate) mod vectored_dio_read;

@@ -84,7 +87,7 @@ pub struct InMemoryLayerInner {
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
-    file: Arc<tokio::sync::RwLock<EphemeralFile>>,
+    file: EphemeralFile,

    resource_units: GlobalResourceUnits,
 }
@@ -378,11 +381,7 @@ impl InMemoryLayer {
    }

    pub(crate) fn try_len(&self) -> Option<u64> {
-        self.inner
-            .try_read()
-            .map(|i| i.file.try_read().map(|i| i.len()).ok())
-            .ok()
-            .flatten()
+        self.inner.try_read().map(|i| i.file.len()).ok()
    }

    pub(crate) fn assert_writable(&self) {
@@ -433,10 +432,6 @@ impl InMemoryLayer {
            read: vectored_dio_read::LogicalRead<Vec<u8>>,
        }
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
-        let mut senders: HashMap<
-            (Key, Lsn),
-            tokio::sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
-        > = Default::default();

        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
@@ -464,11 +459,6 @@ impl InMemoryLayer {
                            Vec::with_capacity(len as usize),
                        ),
                    });
-
-                    let (tx, rx) = tokio::sync::oneshot::channel();
-                    senders.insert((key, *entry_lsn), tx);
-                    reconstruct_state.update_key(&key, *entry_lsn, will_init, rx);
-
                    if will_init {
                        break;
                    }
@@ -476,42 +466,46 @@ impl InMemoryLayer {
            }
        }

-        let read_from = inner.file.clone();
-        let read_ctx = ctx.attached_child();
-        tokio::task::spawn(async move {
-            let locked = read_from.read().await;
-            let f = vectored_dio_read::execute(
-                &*locked,
-                reads
-                    .iter()
-                    .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
-                &read_ctx,
-            );
-            send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
-                .await;
+        // Execute the reads.

-            for (key, value_reads) in reads {
-                for ValueRead { entry_lsn, read } in value_reads {
-                    let sender = senders
-                        .remove(&(key, entry_lsn))
-                        .expect("sender must exist");
-                    match read.into_result().expect("we run execute() above") {
-                        Err(e) => {
-                            let sender = senders
-                                .remove(&(key, entry_lsn))
-                                .expect("sender must exist");
-                            let _ = sender
-                                .send(Err(std::io::Error::new(e.kind(), "dio vec read failed")));
+        let f = vectored_dio_read::execute(
+            &inner.file,
+            reads
+                .iter()
+                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+            &ctx,
+        );
+        send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
+            .await;
+
+        // Process results into the reconstruct state
+        'next_key: for (key, value_reads) in reads {
+            for ValueRead { entry_lsn, read } in value_reads {
+                match read.into_result().expect("we run execute() above") {
+                    Err(e) => {
+                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                        continue 'next_key;
+                    }
+                    Ok(value_buf) => {
+                        let value = Value::des(&value_buf);
+                        if let Err(e) = value {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            continue 'next_key;
                        }
-                        Ok(value_buf) => {
-                            let _ = sender.send(Ok(value_buf.into()));
+
+                        let key_situation =
+                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
+                        if key_situation == ValueReconstructSituation::Complete {
+                            // TODO: metric to see if we fetched more values than necessary
+                            continue 'next_key;
                        }
+
+                        // process the next value in the next iteration of the loop
                    }
                }
            }
-
-            assert!(senders.is_empty());
-        });
+        }

        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);

@@ -606,8 +600,7 @@ impl InMemoryLayer {
    /// Get layer size.
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        let locked = inner.file.try_read().expect("no contention");
-        Ok(locked.len())
+        Ok(inner.file.len())
    }

    /// Create a new, empty, in-memory layer
@@ -621,10 +614,9 @@ impl InMemoryLayer {
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file = Arc::new(tokio::sync::RwLock::new(
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?,
-        ));
-        let key = InMemoryLayerFileId(file.read().await.page_cache_file_id());
+        let file =
+            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
@@ -656,7 +648,7 @@ impl InMemoryLayer {
        let mut inner = self.inner.write().await;
        self.assert_writable();

-        let base_offset = inner.file.read().await.len();
+        let base_offset = inner.file.len();

        let SerializedBatch {
            raw,
@@ -680,13 +672,8 @@ impl InMemoryLayer {
        }

        // Write the batch to the file
-        // FIXME: can't borrow arc
-        let new_size = {
-            let mut locked = inner.file.write().await;
-            locked.write_raw(&raw, ctx).await?;
-            locked.len()
-        };
-
+        inner.file.write_raw(&raw, ctx).await?;
+        let new_size = inner.file.len();
        let expected_new_len = base_offset
            .checked_add(raw.len().into_u64())
            // write_raw would error if we were to overflow u64.
@@ -726,7 +713,7 @@ impl InMemoryLayer {

    pub(crate) async fn tick(&self) -> Option<u64> {
        let mut inner = self.inner.write().await;
-        let size = inner.file.read().await.len();
+        let size = inner.file.len();
        inner.resource_units.publish_size(size)
    }

@@ -822,7 +809,7 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents: Vec<u8> = inner.file.read().await.load_to_vec(ctx).await?;
+                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;

                let file_contents = Bytes::from(file_contents);

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -107,8 +107,6 @@ async fn smoke_test() {
            .expect("tenant harness writes the control file")
    };

-    let img_before = (img_before.0, img_before.1.await.unwrap().unwrap());
-    let img_after = (img_after.0, img_after.1.await.unwrap().unwrap());
    assert_eq!(img_before, img_after);

    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -12,10 +12,8 @@ use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use utils::id::TenantId;

-/// A unique identifier of a persistent layer.
-///
-/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
-/// This struct contains all necessary information to find the image / delta layer. It also provides
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -217,9 +217,8 @@ impl fmt::Display for ImageLayerName {
    }
 }

-/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
-///
-/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
+/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
 /// over time (e.g. across shard splits or compression). The physical filenames of layers in local
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -226,11 +226,9 @@ impl<'a> IteratorWrapper<'a> {
    }
 }

-/// A merge iterator over delta/image layer iterators.
-///
-/// When duplicated records are found, the iterator will not perform any
-/// deduplication, and the caller should handle these situation. By saying
-/// duplicated records, there are many possibilities:
+/// A merge iterator over delta/image layer iterators. When duplicated records are
+/// found, the iterator will not perform any deduplication, and the caller should handle
+/// this situation. "Duplicated records" can mean any of the following:
 ///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -34,10 +34,9 @@ impl SplitWriterResult {
    }
 }

-/// An image writer that takes images and produces multiple image layers.
-///
-/// The interface does not guarantee atomicity (i.e., if the image layer generation
-/// fails, there might be leftover files to be cleaned up)
+/// An image writer that takes images and produces multiple image layers. The interface does not
+/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
+/// to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
@@ -194,10 +193,9 @@ impl SplitImageLayerWriter {
    }
 }

-/// A delta writer that takes key-lsn-values and produces multiple delta layers.
-///
-/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
-/// there might be leftover files to be cleaned up).
+/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
+/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
+/// to be cleaned up).
 ///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,7 +18,6 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use futures::{stream::FuturesUnordered, StreamExt};
 use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
@@ -69,9 +68,7 @@ use crate::{
    tenant::{
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
-        storage_layer::{
-            convert, inmemory_layer::IndexEntry, PersistentLayerDesc, ValueReconstructSituation,
-        },
+        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
    },
    walredo,
 };
@@ -1132,38 +1129,22 @@ impl Timeline {
        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
            .for_get_kind(get_kind)
            .start_timer();
+        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
        let layers_visited = reconstruct_state.get_layers_visited();

-        let futs = FuturesUnordered::new();
        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
-            futs.push({
-                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
-                async move {
-                    let state = res.expect("Read path is infallible");
-                    assert!(matches!(
-                        state.situation,
-                        ValueReconstructSituation::Complete
-                    ));
-
-                    let converted = match convert(key, state).await {
-                        Ok(ok) => ok,
-                        Err(err) => {
-                            return (key, Err(err));
-                        }
-                    };
-
-                    (
-                        key,
-                        walredo_self.reconstruct_value(key, lsn, converted).await,
-                    )
+            match res {
+                Err(err) => {
+                    results.insert(key, Err(err));
                }
-            });
+                Ok(state) => {
+                    let state = ValueReconstructState::from(state);
+
+                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
+                    results.insert(key, reconstruct_res);
+                }
+            }
        }
-
-        let results = futs
-            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
-            .await;
-
        reconstruct_timer.stop_and_record();

        // For aux file keys (v1 or v2) the vectored read path does not return an error
@@ -5515,30 +5496,30 @@ impl Timeline {
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
        self: &Arc<Timeline>,
-        _lsn: Lsn,
-        _ctx: &RequestContext,
+        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
-        // let mut all_data = Vec::new();
-        // let guard = self.layers.read().await;
-        // for layer in guard.layer_map()?.iter_historic_layers() {
-        //     if !layer.is_delta() && layer.image_layer_lsn() == lsn {
-        //         let layer = guard.get_from_desc(&layer);
-        //         let mut reconstruct_data = ValuesReconstructState::default();
-        //         layer
-        //             .get_values_reconstruct_data(
-        //                 KeySpace::single(Key::MIN..Key::MAX),
-        //                 lsn..Lsn(lsn.0 + 1),
-        //                 &mut reconstruct_data,
-        //                 ctx,
-        //             )
-        //             .await?;
-        //         for (k, v) in reconstruct_data.keys {
-        //             all_data.push((k, v?.img.unwrap().1));
-        //         }
-        //     }
-        // }
-        // all_data.sort();
-        Ok(Vec::new())
+        let mut all_data = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map()?.iter_historic_layers() {
+            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
+                let layer = guard.get_from_desc(&layer);
+                let mut reconstruct_data = ValuesReconstructState::default();
+                layer
+                    .get_values_reconstruct_data(
+                        KeySpace::single(Key::MIN..Key::MAX),
+                        lsn..Lsn(lsn.0 + 1),
+                        &mut reconstruct_data,
+                        ctx,
+                    )
+                    .await?;
+                for (k, v) in reconstruct_data.keys {
+                    all_data.push((k, v?.img.unwrap().1));
+                }
+            }
+        }
+        all_data.sort();
+        Ok(all_data)
    }

    /// Get all historic layer descriptors in the layer map
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,6 +19,7 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
+use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
@@ -911,13 +912,137 @@ impl Timeline {
        // we're compacting, in key, LSN order.
        // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
        // then the Value::Image is ordered before Value::WalRecord.
-        let mut all_values_iter = {
-            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
-            for l in deltas_to_compact.iter() {
-                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                deltas.push(l);
+        //
+        // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
+        // option and validation code once we've reached confidence.
+        enum AllValuesIter<'a> {
+            PageCachedBlobIo {
+                all_keys_iter: VecIter<'a>,
+            },
+            StreamingKmergeBypassingPageCache {
+                merge_iter: MergeIterator<'a>,
+            },
+            ValidatingStreamingKmergeBypassingPageCache {
+                mode: CompactL0BypassPageCacheValidation,
+                merge_iter: MergeIterator<'a>,
+                all_keys_iter: VecIter<'a>,
+            },
+        }
+        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
+        impl AllValuesIter<'_> {
+            async fn next_all_keys_iter(
+                iter: &mut VecIter<'_>,
+                ctx: &RequestContext,
+            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+                let Some(DeltaEntry {
+                    key,
+                    lsn,
+                    val: value_ref,
+                    ..
+                }) = iter.next()
+                else {
+                    return Ok(None);
+                };
+                let value = value_ref.load(ctx).await?;
+                Ok(Some((*key, *lsn, value)))
+            }
+            async fn next(
+                &mut self,
+                ctx: &RequestContext,
+            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+                match self {
+                    AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
+                      Self::next_all_keys_iter(iter, ctx).await
+                    }
+                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
+                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
+                        // advance both iterators
+                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
+                        let merge_iter_item = merge_iter.next().await;
+                        // compare results & log warnings as needed
+                        macro_rules! rate_limited_warn {
+                            ($($arg:tt)*) => {{
+                                if cfg!(debug_assertions) || cfg!(feature = "testing") {
+                                    warn!($($arg)*);
+                                    panic!("CompactL0BypassPageCacheValidation failure, check logs");
+                                }
+                                use once_cell::sync::Lazy;
+                                use utils::rate_limit::RateLimit;
+                                use std::sync::Mutex;
+                                use std::time::Duration;
+                                static LOGGED: Lazy<Mutex<RateLimit>> =
+                                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                                let mut rate_limit = LOGGED.lock().unwrap();
+                                rate_limit.call(|| {
+                                    warn!($($arg)*);
+                                });
+                            }}
+                        }
+                        match (&all_keys_iter_item, &merge_iter_item) {
+                            (Err(_), Err(_)) => {
+                                // don't bother asserting equivalence of the errors
+                            }
+                            (Err(all_keys), Ok(merge)) => {
+                                rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
+                            },
+                            (Ok(all_keys), Err(merge)) => {
+                                rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
+                            },
+                            (Ok(None), Ok(None)) => { }
+                            (Ok(Some(all_keys)), Ok(None)) => {
+                                rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
+                            }
+                            (Ok(None), Ok(Some(merge))) => {
+                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
+                            }
+                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
+                                match mode {
+                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
+                                    CompactL0BypassPageCacheValidation::KeyLsn => {
+                                        let all_keys = (all_keys_key, all_keys_lsn);
+                                        let merge = (merge_key, merge_lsn);
+                                        if all_keys != merge {
+                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
+                                        }
+                                    }
+                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
+                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
+                                        let merge = (merge_key, merge_lsn, merge_value);
+                                        if all_keys != merge {
+                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        // in case of mismatch, trust the legacy all_keys_iter_item
+                        all_keys_iter_item
+                    }.instrument(info_span!("next")).await
+                }
+            }
+        }
+        let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
+            CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
+                all_keys_iter: all_keys.iter(),
+            },
+            CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
+                let merge_iter = {
+                    let mut deltas = Vec::with_capacity(deltas_to_compact.len());
+                    for l in deltas_to_compact.iter() {
+                        let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                        deltas.push(l);
+                    }
+                    MergeIterator::create(&deltas, &[], ctx)
+                };
+                match validate {
+                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
+                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
+                        mode: validate.clone(),
+                        merge_iter,
+                        all_keys_iter: all_keys.iter(),
+                    },
+                }
            }
-            MergeIterator::create(&deltas, &[], ctx)
        };

        // This iterator walks through all keys and is needed to calculate size used by each key
@@ -994,7 +1119,7 @@ impl Timeline {
        let mut keys = 0;

        while let Some((key, lsn, value)) = all_values_iter
-            .next()
+            .next(ctx)
            .await
            .map_err(CompactionError::Other)?
        {
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -33,7 +33,6 @@ use crate::virtual_file::{self, VirtualFile};
 pub struct BlobMeta {
    pub key: Key,
    pub lsn: Lsn,
-    pub will_init: bool,
 }

 /// Blob offsets into [`VectoredBlobsBuf::buf`]
@@ -356,8 +355,7 @@ pub enum BlobFlag {
 /// * Iterate over the collected blobs and coalesce them into reads at the end
 pub struct VectoredReadPlanner {
    // Track all the blob offsets. Start offsets must be ordered.
-    // Note: last bool is will_init
-    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
    prev: Option<(Key, Lsn, u64, BlobFlag)>,

@@ -422,12 +420,12 @@ impl VectoredReadPlanner {
        match flag {
            BlobFlag::None => {
                let blobs_for_key = self.blobs.entry(key).or_default();
-                blobs_for_key.push((lsn, start_offset, end_offset, false));
+                blobs_for_key.push((lsn, start_offset, end_offset));
            }
            BlobFlag::ReplaceAll => {
                let blobs_for_key = self.blobs.entry(key).or_default();
                blobs_for_key.clear();
-                blobs_for_key.push((lsn, start_offset, end_offset, true));
+                blobs_for_key.push((lsn, start_offset, end_offset));
            }
            BlobFlag::Ignore => {}
        }
@@ -438,17 +436,11 @@ impl VectoredReadPlanner {
        let mut reads = Vec::new();

        for (key, blobs_for_key) in self.blobs {
-            for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
+            for (lsn, start_offset, end_offset) in blobs_for_key {
                let extended = match &mut current_read_builder {
-                    Some(read_builder) => read_builder.extend(
-                        start_offset,
-                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init,
-                        },
-                    ),
+                    Some(read_builder) => {
+                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
+                    }
                    None => VectoredReadExtended::No,
                };

@@ -456,11 +448,7 @@ impl VectoredReadPlanner {
                    let next_read_builder = VectoredReadBuilder::new(
                        start_offset,
                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init,
-                        },
+                        BlobMeta { key, lsn },
                        self.max_read_size,
                        self.mode,
                    );
@@ -605,10 +593,8 @@ impl<'a> VectoredBlobReader<'a> {
    }
 }

-/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
-///
-/// It provides a streaming API for getting read blobs. It returns a batch when
-/// `handle` gets called and when the current key would just exceed the read_size and
+/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
+/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
    read_builder: Option<VectoredReadBuilder>,
@@ -677,19 +663,10 @@ impl StreamingVectoredReadPlanner {
        start_offset: u64,
        end_offset: u64,
        is_last_blob_in_read: bool,
-        // destination: oneshot::Sender<Result<Bytes, std::io::Error>>,
    ) -> Option<VectoredRead> {
        match &mut self.read_builder {
            Some(read_builder) => {
-                let extended = read_builder.extend(
-                    start_offset,
-                    end_offset,
-                    BlobMeta {
-                        key,
-                        lsn,
-                        will_init: false,
-                    },
-                );
+                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
                assert_eq!(extended, VectoredReadExtended::Yes);
            }
            None => {
@@ -697,11 +674,7 @@ impl StreamingVectoredReadPlanner {
                    Some(VectoredReadBuilder::new_streaming(
                        start_offset,
                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init: false,
-                        },
+                        BlobMeta { key, lsn },
                        self.mode,
                    ))
                };
@@ -1033,7 +1006,6 @@ mod tests {
        let meta = BlobMeta {
            key: Key::MIN,
            lsn: Lsn(0),
-            will_init: false,
        };

        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,7 +1,6 @@
-//! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor.
 //!
-//! Instead, the file is opened when it's read from,
+//! VirtualFile is like a normal File, but it's not bound directly to
+//! a file descriptor. Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,12 +43,13 @@ use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;

-/// The real implementation that uses a Postgres process to
-/// perform WAL replay.
 ///
-/// Only one thread can use the process at a time, that is controlled by the
-/// Mutex. In the future, we might want to launch a pool of processes to allow
-/// concurrent replay of multiple records.
+/// This is the real implementation that uses a Postgres process to
+/// perform WAL replay. Only one thread can use the process at a time,
+/// that is controlled by the Mutex. In the future, we might want to
+/// launch a pool of processes to allow concurrent replay of multiple
+/// records.
+///
 pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1038,12 +1038,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if last_log_term on the node which gave
-			 * the highest vote (i.e. point where we are going to start writing)
-			 * actually had been won by me; plain restart of walproposer not
-			 * intervened by concurrent compute which wrote WAL is ok.
-			 *
-			 * This avoids compute crash after manual term_bump.
+			 * However, allow proceeding if the previously elected leader was me:
+			 * a plain restart of walproposer, with no concurrent compute
+			 * (which could have generated WAL) intervening, is ok.
 			 */
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
@@ -1445,17 +1442,12 @@ RecvAppendResponses(Safekeeper *sk)
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
-			 *
-			 * Term has changed to higher one, probably another compute is
-			 * running. If this is the case we could PANIC as well because
-			 * likely it inserted some data and our basebackup is unsuitable
-			 * anymore. However, we also bump term manually (term_bump endpoint)
-			 * on safekeepers for migration purposes, in this case we do want
-			 * compute to stay alive. So restart walproposer with FATAL instead
-			 * of panicking; if basebackup is spoiled next election will notice
-			 * this.
+			 * Another compute with a higher term is running. Panic to restart
+			 * PG, as we likely need to retake the basebackup. However, don't dump
+			 * core, as this is a somewhat expected scenario.
 			 */
-			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
+			disable_core_dump();
+			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
 				   sk->appendResponse.term, wp->propTerm);
 		}
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -16,7 +16,7 @@ use tracing::debug;
 // On the other hand, `hashlink` has good download stats and appears to be maintained.
 use hashlink::{linked_hash_map::RawEntryMut, LruCache};

-use super::{common::Cached, timed_lru, Cache};
+use super::{common::Cached, *};

 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -38,7 +38,10 @@ impl Api {
        locks: &'static ApiLocks<EndpointCacheKey>,
        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
    ) -> Self {
-        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
+        let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
+            Ok(v) => v,
+            Err(_) => String::new(),
+        };
        Self {
            endpoint,
            caches,
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -44,14 +44,16 @@
    clippy::items_after_statements,
 )]
 // List of temporarily allowed lints.
+// TODO: Switch to expect() once stable with 1.81.
 // TODO: fix code and reduce list or move to permanent list above.
-#![expect(
+#![allow(
    clippy::cargo_common_metadata,
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::doc_markdown,
+    clippy::implicit_hasher,
    clippy::inline_always,
    clippy::match_same_arms,
    clippy::match_wild_err_arm,
@@ -59,28 +61,21 @@
    clippy::missing_panics_doc,
    clippy::module_name_repetitions,
    clippy::needless_pass_by_value,
+    clippy::needless_raw_string_hashes,
    clippy::redundant_closure_for_method_calls,
+    clippy::return_self_not_must_use,
    clippy::similar_names,
    clippy::single_match_else,
    clippy::struct_excessive_bools,
    clippy::struct_field_names,
    clippy::too_many_lines,
-    clippy::unused_self
-)]
-#![cfg_attr(
-    any(test, feature = "testing"),
-    allow(
-        clippy::needless_raw_string_hashes,
-        clippy::unreadable_literal,
-        clippy::unused_async,
-    )
+    clippy::unreadable_literal,
+    clippy::unused_async,
+    clippy::unused_self,
+    clippy::wildcard_imports
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
-#![allow(
-    unknown_lints,
-    // TODO: 1.82: Add `use<T>` where necessary and remove from this list.
-    impl_trait_overcaptures,
-)]
+#![allow(unknown_lints, clippy::manual_inspect)]

 use std::{convert::Infallible, future::Future};

--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -217,7 +217,6 @@ impl sasl::Mechanism for Exchange<'_> {
                        self.state = ExchangeState::SaltSent(sent);
                        Ok(Step::Continue(self, msg))
                    }
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                    Step::Success(x, _) => match x {},
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
@@ -225,7 +224,6 @@ impl sasl::Mechanism for Exchange<'_> {
            ExchangeState::SaltSent(sent) => {
                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                    Step::Continue(x, _) => match x {},
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -745,20 +745,22 @@ impl BatchQueryData {
            builder = builder.deferrable(true);
        }

-        let transaction = builder.start().await.inspect_err(|_| {
+        let transaction = builder.start().await.map_err(|e| {
            // if we cannot start a transaction, we should return immediately
            // and not return to the pool. connection is clearly broken
            discard.discard();
+            e
        })?;

        let json_output =
            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                Ok(json_output) => {
                    info!("commit");
-                    let status = transaction.commit().await.inspect_err(|_| {
+                    let status = transaction.commit().await.map_err(|e| {
                        // if we cannot commit - for now don't return connection to pool
                        // TODO: get a query status from the error
                        discard.discard();
+                        e
                    })?;
                    discard.check_idle(status);
                    json_output
@@ -774,10 +776,11 @@ impl BatchQueryData {
                }
                Err(err) => {
                    info!("rollback");
-                    let status = transaction.rollback().await.inspect_err(|_| {
+                    let status = transaction.rollback().await.map_err(|e| {
                        // if we cannot rollback - for now don't return connection to pool
                        // TODO: get a query status from the error
                        discard.discard();
+                        e
                    })?;
                    discard.check_idle(status);
                    return Err(err);
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -14,7 +14,6 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;

 /// Stream wrapper which implements libpq's protocol.
-///
 /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,7 +1,7 @@
 [toolchain]
-channel = "1.81.0"
+channel = "1.80.1"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
-# but we also need `llvm-tools` for coverage data merges on CI
-components = ["llvm-tools", "rustfmt", "clippy"]
+# but we also need `llvm-tools-preview` for coverage data merges on CI
+components = ["llvm-tools-preview", "rustfmt", "clippy"]
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,9 +1,6 @@
 use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-/// If tenant_id is provided, allow if token (claims) is for this tenant or
-/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
-/// SafekeeperData.
 pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
        (Scope::Tenant, None) => Err(AuthError(
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -19,7 +19,7 @@ use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use storage_broker::Uri;

 use tracing::*;
@@ -261,15 +261,6 @@ async fn main() -> anyhow::Result<()> {
    // Change into the data directory.
    std::env::set_current_dir(&workdir)?;

-    // Prevent running multiple safekeepers on the same directory
-    let lock_file_path = workdir.join(PID_FILE_NAME);
-    let lock_file =
-        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
-    info!("claimed pid file at {lock_file_path:?}");
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
-    std::mem::forget(lock_file);
-
    // Set or read our ID.
    let id = set_id(&workdir, args.id.map(NodeId))?;
    if args.init {
@@ -373,15 +364,15 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;

 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
-    // fsync the datadir to make sure we have a consistent state on disk.
-    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
-    let started = Instant::now();
-    utils::crashsafe::syncfs(dfd)?;
-    let elapsed = started.elapsed();
-    info!(
-        elapsed_ms = elapsed.as_millis(),
-        "syncfs data directory done"
-    );
+    // Prevent running multiple safekeepers on the same directory
+    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    let lock_file =
+        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("claimed pid file at {lock_file_path:?}");
+
+    // ensure that the lock file is held even if the main thread of the process panics;
+    // we need to release the lock file only when the current process is gone
+    std::mem::forget(lock_file);

    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
 use utils::http::request::parse_query_param;

 use postgres_ffi::WAL_SEGMENT_SIZE;
+use safekeeper_api::models::TimelineCreateRequest;
 use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
-use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
 use utils::{
    auth::SwappableJwtAuth,
    http::{
@@ -408,28 +408,6 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
    json_response(StatusCode::OK, response)
 }

-/// Make term at least as high as one in request. If one in request is None,
-/// increment current one.
-async fn timeline_term_bump_handler(
-    mut request: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let ttid = TenantTimelineId::new(
-        parse_request_param(&request, "tenant_id")?,
-        parse_request_param(&request, "timeline_id")?,
-    );
-    check_permission(&request, Some(ttid.tenant_id))?;
-
-    let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
-
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-    let response = tli
-        .term_bump(request_data.term)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
-}
-
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -652,10 +630,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
            |r| request_span(r, timeline_backup_partial_reset),
        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
-            |r| request_span(r, timeline_term_bump_handler),
-        )
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
        })
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -484,7 +484,6 @@ pub async fn validate_temp_timeline(
 }

 /// Move timeline from a temp directory to the main storage, and load it to the global map.
-///
 /// This operation is done under a lock to prevent bugs if several concurrent requests are
 /// trying to load the same timeline. Note that it doesn't guard against creating the
 /// timeline with the same ttid, but no one should be doing this anyway.
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -448,10 +448,8 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
 const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);

 /// Encapsulates a task which takes messages from msg_rx, processes and pushes
-/// replies to reply_tx.
-///
-/// Reading from socket and writing to disk in parallel is beneficial for
-/// performance, this struct provides the writing to disk part.
+/// replies to reply_tx; reading from socket and writing to disk in parallel is
+/// beneficial for performance; this struct provides the writing-to-disk part.
 pub struct WalAcceptor {
    tli: WalResidentTimeline,
    msg_rx: Receiver<ProposerAcceptorMessage>,
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -938,9 +938,8 @@ where
        }

        trace!(
-            "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
+            "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
            msg.wal_data.len(),
-            msg.h.begin_lsn,
            msg.h.end_lsn,
            msg.h.commit_lsn,
            msg.h.truncate_lsn,
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -758,8 +758,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                // pq_sendint32(&reply_message, xmin);
                // pq_sendint32(&reply_message, xmin_epoch);
                // So it is two big endian 32-bit words in low endian order!
-                hs_feedback.xmin = hs_feedback.xmin.rotate_left(32);
-                hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32);
+                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
+                hs_feedback.catalog_xmin =
+                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                self.ws_guard
                    .walsenders
                    .record_hs_feedback(self.ws_guard.id, &hs_feedback);
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -1,10 +1,9 @@
 //! Defines per timeline data stored persistently (SafeKeeperPersistentState)
 //! and its wrapper with in memory layer (SafekeeperState).

-use std::{cmp::max, ops::Deref};
+use std::ops::Deref;

 use anyhow::Result;
-use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -13,7 +12,7 @@ use utils::{

 use crate::{
    control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
    wal_backup_partial::{self},
 };

@@ -148,11 +147,9 @@ pub struct TimelineMemState {
    pub proposer_uuid: PgUuid,
 }

-/// Safekeeper persistent state plus in memory layer.
-///
-/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
-/// which don't need immediate persistence. Provides transactional like API
-/// to atomically update the state.
+/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
+/// when we update fields like commit_lsn which don't need immediate
+/// persistence. Provides transactional like API to atomically update the state.
 ///
 /// Implements Deref into *persistent* part.
 pub struct TimelineState<CTRL: control_file::Storage> {
@@ -212,27 +209,6 @@ where
        let s = self.start_change();
        self.finish_change(&s).await
    }
-
-    /// Make term at least as `to`. If `to` is None, increment current one. This
-    /// is not in safekeeper.rs because we want to be able to do it even if
-    /// timeline is offloaded.
-    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        let before = self.acceptor_state.term;
-        let mut state = self.start_change();
-        let new = match to {
-            Some(to) => max(state.acceptor_state.term, to),
-            None => state.acceptor_state.term + 1,
-        };
-        if new > state.acceptor_state.term {
-            state.acceptor_state.term = new;
-            self.finish_change(&state).await?;
-        }
-        let after = self.acceptor_state.term;
-        Ok(TimelineTermBumpResponse {
-            previous_term: before,
-            current_term: after,
-        })
-    }
 }

 impl<CTRL> Deref for TimelineState<CTRL>
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,7 +4,6 @@
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
-use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -170,7 +169,6 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
 }

 /// This structure is stored in shared state and represents the state of the timeline.
-///
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
@@ -216,10 +214,6 @@ impl StateSK {
            .get_last_log_term(self.flush_lsn())
    }

-    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        self.state_mut().term_bump(to).await
-    }
-
    /// Close open WAL files to release FDs.
    fn close_wal_store(&mut self) {
        if let StateSK::Loaded(sk) = self {
@@ -859,11 +853,6 @@ impl Timeline {
        Ok(res)
    }

-    pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        let mut state = self.write_shared_state().await;
-        state.sk.term_bump(to).await
-    }
-
    /// Get the timeline guard for reading/writing WAL files.
    /// If WAL files are not present on disk (evicted), they will be automatically
    /// downloaded from remote storage. This is done in the manager task, which is
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -1,8 +1,6 @@
-//! Code related to evicting WAL files to remote storage.
-//!
-//! The actual upload is done by the partial WAL backup code. This file has
-//! code to delete and re-download WAL files, cross-validate with partial WAL
-//! backup if local file is still present.
+//! Code related to evicting WAL files to remote storage. The actual upload is done by the
+//! partial WAL backup code. This file has code to delete and re-download WAL files,
+//! cross-validate with partial WAL backup if local file is still present.

 use anyhow::Context;
 use camino::Utf8PathBuf;
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -1,6 +1,4 @@
-//! Timeline residence guard
-//!
-//! It is needed to ensure that WAL segments are present on disk,
+//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
 //! as long as the code is holding the guard. This file implements guard logic, to issue
 //! and drop guards, and to notify the manager when the guard is dropped.

--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -1,5 +1,4 @@
 //! The timeline manager task is responsible for managing the timeline's background tasks.
-//!
 //! It is spawned alongside each timeline and exits when the timeline is deleted.
 //! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
 //! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
--- a/safekeeper/src/timelines_set.rs
+++ b/safekeeper/src/timelines_set.rs
@@ -60,8 +60,7 @@ impl TimelinesSet {
    }
 }

-/// Guard is used to add or remove timelines from the set.
-///
+/// Guard is used to add or remove a timeline from the set.
 /// If the timeline present in set, it will be removed from it on drop.
 /// Note: do not use more than one guard for the same timeline, it caches the presence state.
 /// It is designed to be used in the manager task only.
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -1,8 +1,6 @@
 //! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
-//! and `flush_lsn` updates.
-//!
-//! After the partial segment was updated (`flush_lsn` was changed), the segment
-//! will be uploaded to S3 within the configured `partial_backup_timeout`.
+//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
+//! was changed), the segment will be uploaded to S3 in about 15 minutes.
 //!
 //! The filename format for partial segments is
 //! `Segment_Term_Flush_Commit_skNN.partial`, where:
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -17,7 +17,6 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
-///
 /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -98,19 +98,7 @@ pub struct PhysicalStorage {
    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
    write_lsn: Lsn,

-    /// The LSN of the last WAL record written to disk. Still can be not fully
-    /// flushed.
-    ///
-    /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
-    /// switch ingest the reverse is true because we don't bump write_lsn up to
-    /// the next segment: WAL stream from the compute doesn't have the gap and
-    /// for simplicity / as a sanity check we disallow any non-sequential
-    /// writes, so write zeros as is.
-    ///
-    /// Similar effect is in theory possible due to LSN alignment: if record
-    /// ends at *2, decoder will report end lsn as *8 even though we haven't
-    /// written these zeros yet. In practice compute likely never sends
-    /// non-aligned chunks of data.
+    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
    write_record_lsn: Lsn,

    /// The LSN of the last WAL record flushed to disk.
@@ -179,7 +167,8 @@ impl PhysicalStorage {
            )
        };

-        // note: this assumes we fsync'ed whole datadir on start.
+        // TODO: do we really know that write_lsn is fully flushed to disk?
+        //      If not, maybe it's better to call fsync() here to be sure?
        let flush_lsn = write_lsn;

        debug!(
@@ -451,12 +440,11 @@ impl Storage for PhysicalStorage {
            .with_label_values(&["truncate_wal"])
            .start_timer();

-        // Streaming must not create a hole, so truncate cannot be called on
-        // non-written lsn.
-        if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
+        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
+        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            bail!(
-                "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
-                self.write_record_lsn,
+                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
+                self.write_lsn,
                end_pos
            );
        }
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -134,7 +134,7 @@ class LLVM:
            # Show a user-friendly warning
            raise Exception(' '.join([
                f"It appears that you don't have `{name}` installed.",
-                "Please execute `rustup component add llvm-tools`,",
+                "Please execute `rustup component add llvm-tools-preview`,",
                "or install it via your package manager of choice.",
                "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
            ]))
@@ -518,7 +518,7 @@ def main() -> None:
    example = f"""
 prerequisites:
    # alternatively, install a system package for `llvm-tools`
-    rustup component add llvm-tools
+    rustup component add llvm-tools-preview

 self-contained example:
    {app} run make
--- a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
@@ -1 +0,0 @@
-ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;
--- a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
@@ -1 +0,0 @@
-ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL;
--- a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
@@ -1 +0,0 @@
-ALTER TABLE tenant_shards DROP preferred_az_id;
--- a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
@@ -1 +0,0 @@
-ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -14,14 +14,14 @@ use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::controller_api::{
    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    ShardsPreferredAzsRequest, TenantCreateRequest,
+    TenantCreateRequest,
 };
 use pageserver_api::models::{
    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
-use pageserver_client::{mgmt_api, BlockUnblock};
+use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -369,23 +369,6 @@ async fn handle_tenant_timeline_detach_ancestor(
    json_response(StatusCode::OK, res)
 }

-async fn handle_tenant_timeline_block_unblock_gc(
-    service: Arc<Service>,
-    req: Request<Body>,
-    dir: BlockUnblock,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    check_permissions(&req, Scope::PageServerApi)?;
-
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
-    service
-        .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
-        .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handle_tenant_timeline_passthrough(
    service: Arc<Service>,
    req: Request<Body>,
@@ -556,17 +539,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, node_status)
 }

-async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    let node_status = state.service.get_node_shards(node_id).await?;
-
-    json_response(StatusCode::OK, node_status)
-}
-
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -716,18 +688,6 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
    )
 }

-async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state.service.update_shards_preferred_azs(azs_req).await?,
-    )
-}
-
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -1137,13 +1097,6 @@ pub fn make_router(
        .get("/control/v1/node/:node_id", |r| {
            named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
        })
-        .get("/control/v1/node/:node_id/shards", |r| {
-            named_request_span(
-                r,
-                handle_node_shards,
-                RequestName("control_v1_node_describe"),
-            )
-        })
        .get("/control/v1/leader", |r| {
            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
        })
@@ -1221,13 +1174,6 @@ pub fn make_router(
                RequestName("control_v1_tenant_policy"),
            )
        })
-        .put("/control/v1/preferred_azs", |r| {
-            named_request_span(
-                r,
-                handle_update_preferred_azs,
-                RequestName("control_v1_preferred_azs"),
-            )
-        })
        .put("/control/v1/step_down", |r| {
            named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
        })
@@ -1309,26 +1255,6 @@ pub fn make_router(
                )
            },
        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
-                    RequestName("v1_tenant_timeline_block_unblock_gc"),
-                )
-            },
-        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
-                    RequestName("v1_tenant_timeline_block_unblock_gc"),
-                )
-            },
-        )
        // Tenant detail GET passthrough to shard zero:
        .get("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -36,7 +36,7 @@ pub(crate) struct Node {
    listen_pg_addr: String,
    listen_pg_port: u16,

-    availability_zone_id: String,
+    availability_zone_id: Option<String>,

    // This cancellation token means "stop any RPCs in flight to this node, and don't start
    // any more". It is not related to process shutdown.
@@ -63,9 +63,8 @@ impl Node {
        self.id
    }

-    #[allow(unused)]
-    pub(crate) fn get_availability_zone_id(&self) -> &str {
-        self.availability_zone_id.as_str()
+    pub(crate) fn get_availability_zone_id(&self) -> Option<&str> {
+        self.availability_zone_id.as_deref()
    }

    pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
@@ -79,12 +78,22 @@ impl Node {
    /// Does this registration request match `self`?  This is used when deciding whether a registration
    /// request should be allowed to update an existing record with the same node ID.
    pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
-        self.id == register_req.node_id
+        let az_ids_match = {
+            match (
+                self.availability_zone_id.as_deref(),
+                register_req.availability_zone_id.as_deref(),
+            ) {
+                (Some(current_az), Some(register_req_az)) => current_az == register_req_az,
+                _ => true,
+            }
+        };
+
+        az_ids_match
+            && self.id == register_req.node_id
            && self.listen_http_addr == register_req.listen_http_addr
            && self.listen_http_port == register_req.listen_http_port
            && self.listen_pg_addr == register_req.listen_pg_addr
            && self.listen_pg_port == register_req.listen_pg_port
-            && self.availability_zone_id == register_req.availability_zone_id
    }

    /// For a shard located on this node, populate a response object
@@ -181,7 +190,7 @@ impl Node {
        listen_http_port: u16,
        listen_pg_addr: String,
        listen_pg_port: u16,
-        availability_zone_id: String,
+        availability_zone_id: Option<String>,
    ) -> Self {
        Self {
            id,
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -7,10 +7,7 @@ use pageserver_api::{
    },
    shard::TenantShardId,
 };
-use pageserver_client::{
-    mgmt_api::{Client, Result},
-    BlockUnblock,
-};
+use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -261,24 +258,6 @@ impl PageserverClient {
        )
    }

-    pub(crate) async fn timeline_block_unblock_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<()> {
-        // measuring these makes no sense because we synchronize with the gc loop and remote
-        // storage on block_gc so there should be huge outliers
-        measured_request!(
-            "timeline_block_unblock_gc",
-            crate::metrics::Method::Post,
-            &self.node_id_label,
-            self.inner
-                .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
-                .await
-        )
-    }
-
    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
        measured_request!(
            "utilization",
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,7 +105,7 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealthOutdated,
    GetLeader,
    UpdateLeader,
-    SetPreferredAzs,
+    SetNodeAzId,
 }

 #[must_use]
@@ -325,6 +325,31 @@ impl Persistence {
        }
    }

+    /// Persist `input_az_id` as the availability zone id of node `input_node_id`.
+    /// Returns `DatabaseError::Logical` unless exactly one row was updated.
+    pub(crate) async fn set_node_availability_zone_id(
+        &self,
+        input_node_id: NodeId,
+        input_az_id: String,
+    ) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        let updated = self
+            .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| {
+                let updated = diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .set((availability_zone_id.eq(input_az_id.clone()),))
+                    .execute(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        if updated == 1 {
+            return Ok(());
+        }
+        // Bare `node_id` would name the diesel column imported above, not the node.
+        Err(DatabaseError::Logical(format!("Node {input_node_id:?} not found for setting az id")))
+    }
+
    /// At startup, load the high level state for shards, such as their config + policy.  This will
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -665,33 +690,6 @@ impl Persistence {
        Ok(())
    }

-    pub(crate) async fn set_tenant_shard_preferred_azs(
-        &self,
-        preferred_azs: Vec<(TenantShardId, String)>,
-    ) -> DatabaseResult<Vec<(TenantShardId, String)>> {
-        use crate::schema::tenant_shards::dsl::*;
-
-        self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
-            let mut shards_updated = Vec::default();
-
-            for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .set(preferred_az_id.eq(preferred_az))
-                    .execute(conn)?;
-
-                if updated == 1 {
-                    shards_updated.push((*tenant_shard_id, preferred_az.clone()));
-                }
-            }
-
-            Ok(shards_updated)
-        })
-        .await
-    }
-
    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
@@ -1078,11 +1076,6 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) config: String,
    #[serde(default)]
    pub(crate) scheduling_policy: String,
-
-    // Hint that we should attempt to schedule this tenant shard the given
-    // availability zone in order to minimise the chances of cross-AZ communication
-    // with compute.
-    pub(crate) preferred_az_id: Option<String>,
 }

 impl TenantShardPersistence {
@@ -1117,7 +1110,7 @@ pub(crate) struct NodePersistence {
    pub(crate) listen_http_port: i32,
    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: i32,
-    pub(crate) availability_zone_id: String,
+    pub(crate) availability_zone_id: Option<String>,
 }

 /// Tenant metadata health status that are stored durably.
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -528,7 +528,7 @@ pub(crate) mod test_utils {
                        80 + i as u16,
                        format!("pghost-{i}"),
                        5432 + i as u16,
-                        "test-az".to_string(),
+                        None,
                    );
                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                    assert!(node.is_available());
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;`
				`@@ -1 +0,0 @@`
				`ALTER TABLE tenant_shards DROP preferred_az_id;`
				`@@ -1 +0,0 @@`
				`ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;`