kick ci with label

bench: sync before benchies
the first benchmark repeatedly fails: <https://neon-github-public-dev.s3.amazonaws.com/reports/main/10743838300/index.html#behaviors/b1a8273437954620fa374b796ffaacdd/6e88523163820fdd/>
2026-06-01 20:40:37 +00:00 · 2024-09-06 21:05:43 +00:00 · 2024-09-06 21:04:29 +00:00
96 changed files with 721 additions and 2188 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,13 +7,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - AZURE_DEV_CLIENT_ID
-  - AZURE_DEV_REGISTRY_NAME
-  - AZURE_DEV_SUBSCRIPTION_ID
-  - AZURE_PROD_CLIENT_ID
-  - AZURE_PROD_REGISTRY_NAME
-  - AZURE_PROD_SUBSCRIPTION_ID
-  - AZURE_TENANT_ID
  - BENCHMARK_PROJECT_ID_PUB
  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -1,56 +0,0 @@
-name: Push images to ACR
-on:
-  workflow_call:
-    inputs:
-      client_id:
-        description: Client ID of Azure managed identity or Entra app
-        required: true
-        type: string
-      image_tag:
-        description: Tag for the container image
-        required: true
-        type: string
-      images:
-        description: Images to push
-        required: true
-        type: string
-      registry_name:
-        description: Name of the container registry
-        required: true
-        type: string
-      subscription_id:
-        description: Azure subscription ID
-        required: true
-        type: string
-      tenant_id:
-        description: Azure tenant ID
-        required: true
-        type: string
-
-jobs:
-  push-to-acr:
-    runs-on: ubuntu-22.04
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
-
-    steps:
-      - name: Azure login
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ inputs.client_id }}
-          subscription-id: ${{ inputs.subscription_id }}
-          tenant-id: ${{ inputs.tenant_id }}
-
-      - name: Login to ACR
-        run: |
-          az acr login --name=${{ inputs.registry_name }}
-
-      - name: Copy docker images to ACR ${{ inputs.registry_name }}
-        run: |
-          images='${{ inputs.images }}'
-          for image in ${images}; do
-            docker buildx imagetools create \
-              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
-          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -271,6 +271,10 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v4

+      - name: sync(1)
+        shell: bash -euxo pipefail {0}
+        run: sync
+
      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
        with:
@@ -794,6 +798,9 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    permissions:
+      contents: read  # This is required for actions/checkout
+      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -820,6 +827,28 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

+      - name: Azure login
+        if: github.ref_name == 'main'
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+      - name: Login to ACR
+        if: github.ref_name == 'main'
+        run: |
+          az acr login --name=neoneastus2
+
+      - name: Copy docker images to ACR-dev
+        if: github.ref_name == 'main'
+        run: |
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+            docker buildx imagetools create \
+              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
+                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
+          done
+
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
@@ -857,30 +886,6 @@ jobs:
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done

-  push-to-acr-dev:
-    if: github.ref_name == 'main'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
-  push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
    runs-on: ubuntu-22.04
@@ -956,8 +961,8 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -7,11 +7,6 @@ on:
  pull_request_target:
    types:
      - opened
-  workflow_dispatch:
-    inputs:
-      github-actor:
-        description: 'GitHub username. If empty, the username of the current user will be used'
-        required: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -31,31 +26,12 @@ jobs:
      id: check-user
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        ACTOR: ${{ inputs.github-actor || github.actor }}
      run: |
-        expected_error="User does not exist or is not a member of the organization"
-        output_file=output.txt
-
-        for i in $(seq 1 10); do
-          if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
-
-            is_member=true
-            break
-          elif grep -q "${expected_error}" ${output_file}; then
-            is_member=false
-            break
-          elif [ $i -eq 10 ]; then
-            title="Failed to get memmbership status for ${ACTOR}"
-            message="The latest GitHub API error message: '$(cat ${output_file})'"
-            echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
-
-            exit 1
-          fi
-
-          sleep 1
-        done
+        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
+          is_member=true
+        else
+          is_member=false
+        fi

        echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    export PATH="$HOME/.cargo/bin:$PATH" && \
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
-    rustup component add llvm-tools rustfmt clippy && \
+    rustup component add llvm-tools-preview rustfmt clippy && \
    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,10 +22,9 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Escape a string for including it in a SQL literal.
-///
-/// Wrapping the result with `E'{}'` or `'{}'` is not required,
-/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal. Wrapping the result
+/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
+/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -80,10 +80,7 @@ enum Command {
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
-    Tenants {
-        /// If this field is set, it will list the tenants on a specific node
-        node_id: Option<NodeId>,
-    },
+    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
@@ -406,41 +403,7 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::Tenants {
-            node_id: Some(node_id),
-        } => {
-            let describe_response = storcon_client
-                .dispatch::<(), NodeShardResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}/shards"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "Shard",
-                "Intended Primary/Secondary",
-                "Observed Primary/Secondary",
-            ]);
-            for shard in shards {
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    match shard.is_intended_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                    match shard.is_observed_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::Tenants { node_id: None } => {
+        Command::Tenants {} => {
            let mut resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -68,7 +68,6 @@ macro_rules! register_uint_gauge {
 static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
-///
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
 pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::time::{Duration, Instant};

@@ -74,17 +74,6 @@ pub struct TenantPolicyRequest {
    pub scheduling: Option<ShardSchedulingPolicy>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsRequest {
-    #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, String>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsResponse {
-    pub updated: Vec<TenantShardId>,
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -112,21 +101,6 @@ pub struct TenantDescribeResponse {
    pub config: TenantConfig,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShardResponse {
-    pub node_id: NodeId,
-    pub shards: Vec<NodeShard>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShard {
-    pub tenant_shard_id: TenantShardId,
-    /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_observed_secondary: Option<bool>,
-    /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_intended_secondary: Option<bool>,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
    pub id: NodeId,
@@ -158,12 +132,8 @@ pub struct TenantDescribeResponseShard {
    pub is_splitting: bool,

    pub scheduling_policy: ShardSchedulingPolicy,
-
-    pub preferred_az_id: Option<String>,
 }

-/// Migration request for a given tenant shard to a given node.
-///
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -305,10 +305,8 @@ pub struct TenantConfig {
    pub lsn_lease_length_for_ts: Option<String>,
 }

-/// The policy for the aux file storage.
-///
-/// It can be switched through `switch_aux_file_policy` tenant config.
-/// When the first aux file written, the policy will be persisted in the
+/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
+/// tenant config. When the first aux file written, the policy will be persisted in the
 /// `index_part.json` file and has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
@@ -898,9 +896,7 @@ pub struct WalRedoManagerStatus {
    pub process: Option<WalRedoManagerProcessStatus>,
 }

-/// The progress of a secondary tenant.
-///
-/// It is mostly useful when doing a long running download: e.g. initiating
+/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -69,10 +69,8 @@ impl QueryError {
 }

 /// Returns true if the given error is a normal consequence of a network issue,
-/// or the client closing the connection.
-///
-/// These errors can happen during normal operations,
-/// and don't indicate a bug in our code.
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -7,7 +7,6 @@ use std::fmt;
 use url::Host;

 /// Parses a string of format either `host:port` or `host` into a corresponding pair.
-///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -45,8 +45,6 @@ pub use azure_core::Etag;

 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};

-/// Default concurrency limit for S3 operations
-///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -302,9 +300,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<(), TimeTravelError>;
 }

-/// Data part of an ongoing [`Download`].
-///
-/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
+/// DownloadStream is sensitive to the timeout and cancellation used with the original
 /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
 /// with `tokio::io::copy_buf`.
 // This has 'static because safekeepers do not use cancellation tokens (yet)
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,16 +60,3 @@ pub struct TimelineCopyRequest {
    pub target_timeline_id: TimelineId,
    pub until_lsn: Lsn,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpRequest {
-    /// bump to
-    pub term: Option<u64>,
-}
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpResponse {
-    // before the request
-    pub previous_term: u64,
-    pub current_term: u64,
-}
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -5,10 +5,9 @@
 mod calculation;
 pub mod svg;

-/// StorageModel is the input to the synthetic size calculation.
-///
-/// It represents a tree of timelines, with just the information that's needed
-/// for the calculation. This doesn't track timeline names or where each timeline
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
 /// begins and ends, for example. Instead, it consists of "points of interest"
 /// on the timelines. A point of interest could be the timeline start or end point,
 /// the oldest point on a timeline that needs to be retained because of PITR
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -5,10 +5,8 @@ use std::{

 use metrics::IntCounter;

-/// Circuit breakers are for operations that are expensive and fallible.
-///
-/// If a circuit breaker fails repeatedly, we will stop attempting it for some
-/// period of time, to avoid denial-of-service from retries, and
+/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
+/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
 /// to mitigate the log spam from repeated failures.
 pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,4 +1,3 @@
-use std::os::fd::AsRawFd;
 use std::{
    borrow::Cow,
    fs::{self, File},
@@ -204,27 +203,6 @@ pub fn overwrite(
    Ok(())
 }

-/// Syncs the filesystem for the given file descriptor.
-#[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
-    // Linux guarantees durability for syncfs.
-    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-    #[cfg(target_os = "linux")]
-    {
-        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
-    }
-    #[cfg(target_os = "macos")]
-    {
-        // macOS is not a production platform for Neon, don't even bother.
-    }
-    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-    {
-        compile_error!("Unsupported OS");
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -249,10 +249,8 @@ macro_rules! id_newtype {
    };
 }

-/// Neon timeline ID.
-///
-/// They are different from PostgreSQL timeline
-/// IDs, but serve a similar purpose: they differentiate
+/// Neon timeline IDs are different from PostgreSQL timeline
+/// IDs. They serve a similar purpose though: they differentiate
 /// between different "histories" of the same cluster.  However,
 /// PostgreSQL timeline IDs are a bit cumbersome, because they are only
 /// 32-bits wide, and they must be in ascending order in any given
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -100,9 +100,7 @@ pub enum LockFileRead {
 }

 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
-/// inspect its content.
-///
-/// It is not an `Err(...)` if the file does not exist or is already locked.
+/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
 }

 /// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
    // following rust 1.66.1 std implementation:
    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
    let location = info.location();
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -8,7 +8,6 @@ use tracing::{trace, warn};
 use crate::lsn::Lsn;

 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
-///
 /// Serialized in custom flexible key/value format. In replication protocol, it
 /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
 /// Standby status update / Hot standby feedback messages.
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -65,8 +65,6 @@ impl<T> Poison<T> {
    }
 }

-/// Armed pointer to a [`Poison`].
-///
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -13,11 +13,10 @@ pub struct ShardNumber(pub u8);
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);

-/// Combination of ShardNumber and ShardCount.
-///
-/// For use within the context of a particular tenant, when we need to know which shard we're
-/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
-/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -49,11 +49,12 @@ use std::sync::{RwLock, RwLockWriteGuard};

 use tokio::sync::watch;

-/// Rcu allows multiple readers to read and hold onto a value without blocking
-/// (for very long).
 ///
-/// Storing to the Rcu updates the value, making new readers immediately see
-/// the new value, but it also waits for all current readers to finish.
+/// Rcu allows multiple readers to read and hold onto a value without blocking
+/// (for very long).  Storing to the Rcu updates the value, making new readers
+/// immediately see the new value, but it also waits for all current readers to
+/// finish.
+///
 pub struct Rcu<V> {
    inner: RwLock<RcuInner<V>>,
 }
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -5,9 +5,7 @@ use std::sync::{
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`.
-///
-/// Allows use of `take` which does not require holding an outer mutex guard
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
 /// for the duration of initialization.
 ///
 /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -7,7 +7,6 @@ pub enum VecMapOrdering {
 }

 /// Ordered map datastructure implemented in a Vec.
-///
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -6,10 +6,9 @@ pub enum YieldingLoopError {
    Cancelled,
 }

-/// Helper for long synchronous loops, e.g. over all tenants in the system.
-///
-/// Periodically yields to avoid blocking the executor, and after resuming
-/// checks the provided cancellation token to drop out promptly on shutdown.
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
 #[inline(always)]
 pub async fn yielding_loop<I, T, F>(
    interval: usize,
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,20 +1,2 @@
 pub mod mgmt_api;
 pub mod page_service;
-
-/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
-// If file structure is per-kind not per-feature then where to put this?
-#[derive(Clone, Copy)]
-pub enum BlockUnblock {
-    Block,
-    Unblock,
-}
-
-impl std::fmt::Display for BlockUnblock {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let s = match self {
-            BlockUnblock::Block => "block",
-            BlockUnblock::Unblock => "unblock",
-        };
-        f.write_str(s)
-    }
-}
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,8 +12,6 @@ use utils::{

 pub use reqwest::Body as ReqwestBody;

-use crate::BlockUnblock;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -456,20 +454,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn timeline_block_unblock_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
-            self.mgmt_api_endpoint,
-        );
-
-        self.request(Method::POST, &uri, ()).await.map(|_| ())
-    }
-
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -142,16 +142,11 @@ impl PagestreamClient {
    ) -> anyhow::Result<PagestreamGetPageResponse> {
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
+        // let mut req = tokio_util::io::ReaderStream::new(&req);
+        let mut req = tokio_stream::once(Ok(req));

-        for i in 0..10 {
-            let mut req = tokio_stream::once(Ok(req.clone()));
-            self.copy_both.send_all(&mut req).await?;
-        }
+        self.copy_both.send_all(&mut req).await?;

-        for i in 0..9 {
-            let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
-            let next: bytes::Bytes = next.unwrap()?;
-        }
        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
        let next: bytes::Bytes = next.unwrap()?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -37,7 +37,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -156,7 +155,23 @@ fn main() -> anyhow::Result<()> {
        };

        let started = Instant::now();
-        syncfs(dirfd)?;
+        // Linux guarantees durability for syncfs.
+        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+        #[cfg(target_os = "linux")]
+        {
+            use std::os::fd::AsRawFd;
+            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
+        }
+        #[cfg(target_os = "macos")]
+        {
+            // macOS is not a production platform for Neon, don't even bother.
+            drop(dirfd);
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            compile_error!("Unsupported OS");
+        }
+
        let elapsed = started.elapsed();
        info!(
            elapsed_ms = elapsed.as_millis(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -180,8 +180,6 @@ pub struct PageServerConf {
    pub io_buffer_alignment: usize,
 }

-/// Token for authentication to safekeepers
-///
 /// We do not want to store this in a PageServerConf because the latter may be logged
 /// and/or serialized at a whim, while the token is secret. Currently this token is the
 /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,9 +1,7 @@
-//! Defines [`RequestContext`].
-//!
-//! It is a structure that we use throughout the pageserver to propagate
-//! high-level context from places that _originate_ activity down to the
-//! shared code paths at the heart of the pageserver. It's inspired by
-//! Golang's `context.Context`.
+//! This module defines `RequestContext`, a structure that we use throughout
+//! the pageserver to propagate high-level context from places
+//! that _originate_ activity down to the shared code paths at the
+//! heart of the pageserver. It's inspired by Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1185,7 +1185,6 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    ctx: &'c RequestContext,
    start: std::time::Instant,
    op: SmgrQueryType,
-    count: usize,
 }

 impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
@@ -1213,11 +1212,9 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        for _ in 0..self.count {
-            self.global_metric.observe(ex_throttled.as_secs_f64());
-            if let Some(timeline_metric) = self.timeline_metric {
-                timeline_metric.observe(ex_throttled.as_secs_f64());
-            }
+        self.global_metric.observe(ex_throttled.as_secs_f64());
+        if let Some(timeline_metric) = self.timeline_metric {
+            timeline_metric.observe(ex_throttled.as_secs_f64());
        }
    }
 }
@@ -1346,14 +1343,6 @@ impl SmgrQueryTimePerTimeline {
        &'a self,
        op: SmgrQueryType,
        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + '_> {
-        self.start_timer_many(op, 1, ctx)
-    }
-    pub(crate) fn start_timer_many<'c: 'a, 'a>(
-        &'a self,
-        op: SmgrQueryType,
-        count: usize,
-        ctx: &'c RequestContext,
    ) -> Option<impl Drop + '_> {
        let global_metric = &self.global_metrics[op as usize];
        let start = Instant::now();
@@ -1387,7 +1376,6 @@ impl SmgrQueryTimePerTimeline {
            ctx,
            start,
            op,
-            count,
        })
    }
 }
@@ -3182,16 +3170,6 @@ static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .unwrap()
 });

-pub(crate) static CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM: Lazy<Histogram> =
-    Lazy::new(|| {
-        register_histogram!(
-            "pageserver_consecutive_nonblocking_getpage_requests",
-            "Number of consecutive nonblocking getpage requests",
-            (0..=256).map(|x| x as f64).collect::<Vec<f64>>(),
-        )
-        .unwrap()
-    });
-
 pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
    let _guard = SERIALIZE.lock().unwrap();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -5,14 +5,14 @@ use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
-use once_cell::sync::{Lazy, OnceCell};
-use pageserver_api::models::{self, TenantState};
+use once_cell::sync::OnceCell;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
-    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
-    PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
-    PagestreamProtocolVersion,
+    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
+    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
+    PagestreamNblocksResponse, PagestreamProtocolVersion,
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
@@ -43,7 +43,7 @@ use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{self, CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM};
+use crate::metrics;
 use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -58,7 +58,7 @@ use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use pageserver_api::key::rel_block_to_key;
-use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -577,317 +577,124 @@ impl PageServerHandler {
            }
        }

-        let mut batched = None;
-        'outer: loop {
-            enum DebouncedFeMessage {
-                Exists(models::PagestreamExistsRequest),
-                Nblocks(models::PagestreamNblocksRequest),
-                GetPage {
-                    span: Span,
-                    shard: timeline::handle::Handle<TenantManagerTypes>,
-                    effective_request_lsn: Lsn,
-                    pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
-                },
-                DbSize(models::PagestreamDbSizeRequest),
-                GetSlruSegment(models::PagestreamGetSlruSegmentRequest),
-                RespondError(Span, PageStreamError),
-            }
-            let mut debounce: Option<std::time::Instant> = None;
-            // return or `?` on protocol error
-            // `break EXPR` to stop batching. The EXPR will be the first message in the next batch.
-            let next_batched: Option<DebouncedFeMessage> = loop {
-                static BOUNCE_TIMEOUT: Lazy<Duration> = Lazy::new(|| {
-                    utils::env::var::<humantime::Duration, _>("NEON_PAGESERVER_DEBOUNCE")
-                        .unwrap()
-                        .into()
-                });
-                let sleep_fut = if let Some(started_at) = debounce {
-                    futures::future::Either::Left(tokio::time::sleep_until(
-                        (started_at + *BOUNCE_TIMEOUT).into(),
-                    ))
-                } else {
-                    futures::future::Either::Right(futures::future::pending())
-                };
-                let msg = tokio::select! {
-                    biased;
-                    _ = self.cancel.cancelled() => {
-                        return Err(QueryError::Shutdown)
-                    }
-                    msg = pgb.read_message() => {
-                        msg
-                    }
-                    _ = sleep_fut => {
-                        assert!(batched.is_some());
-                        break None;
-                    }
-                };
-                let copy_data_bytes = match msg? {
-                    Some(FeMessage::CopyData(bytes)) => bytes,
-                    Some(FeMessage::Terminate) => break 'outer,
-                    Some(m) => {
-                        return Err(QueryError::Other(anyhow::anyhow!(
-                            "unexpected message: {m:?} during COPY"
-                        )));
-                    }
-                    None => break 'outer, // client disconnected
-                };
-                trace!("query: {copy_data_bytes:?}");
-                fail::fail_point!("ps::handle-pagerequest-message");
-
-                // parse request
-                let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
-
-                let this_msg = match neon_fe_msg {
-                    PagestreamFeMessage::Exists(msg) => DebouncedFeMessage::Exists(msg),
-                    PagestreamFeMessage::Nblocks(msg) => DebouncedFeMessage::Nblocks(msg),
-                    PagestreamFeMessage::DbSize(msg) => DebouncedFeMessage::DbSize(msg),
-                    PagestreamFeMessage::GetSlruSegment(msg) => {
-                        DebouncedFeMessage::GetSlruSegment(msg)
-                    }
-                    PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                        request_lsn,
-                        not_modified_since,
-                        rel,
-                        blkno,
-                    }) => {
-                        let span = tracing::info_span!("handle_get_page_at_lsn_request_batched", %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, batch_size = tracing::field::Empty);
-                        let key = rel_block_to_key(rel, blkno);
-                        let shard = match self
-                            .timeline_handles
-                            .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                            .instrument(span.clone())
-                            .await
-                        {
-                            Ok(tl) => tl,
-                            Err(GetActiveTimelineError::Tenant(
-                                GetActiveTenantError::NotFound(_),
-                            )) => {
-                                // We already know this tenant exists in general, because we resolved it at
-                                // start of connection.  Getting a NotFound here indicates that the shard containing
-                                // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                                // mapping is out of date.
-                                //
-                                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                                // and talk to a different pageserver.
-                                break Some(DebouncedFeMessage::RespondError(
-                                    span,
-                                    PageStreamError::Reconnect(
-                                        "getpage@lsn request routed to wrong shard".into(),
-                                    ),
-                                ));
-                            }
-                            Err(e) => break Some(DebouncedFeMessage::RespondError(span, e.into())),
-                        };
-                        let effective_request_lsn = match Self::wait_or_get_last_lsn(
-                            &shard,
-                            request_lsn,
-                            not_modified_since,
-                            &shard.get_latest_gc_cutoff_lsn(),
-                            &ctx,
-                        )
-                        // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                        .await
-                        {
-                            Ok(lsn) => lsn,
-                            Err(e) => {
-                                break Some(DebouncedFeMessage::RespondError(span, e));
-                            }
-                        };
-                        DebouncedFeMessage::GetPage {
-                            span,
-                            shard,
-                            effective_request_lsn,
-                            pages: smallvec::smallvec![(rel, blkno)],
-                        }
-                    }
-                };
-
-                // check if we can debounce
-                match (&mut batched, this_msg) {
-                    (None, this_msg) => {
-                        batched = Some(this_msg);
-                    }
-                    (
-                        Some(DebouncedFeMessage::GetPage {
-                            span: _,
-                            shard: accum_shard,
-                            pages: accum_pages,
-                            effective_request_lsn: accum_lsn,
-                        }),
-                        DebouncedFeMessage::GetPage {
-                            span: _,
-                            shard: this_shard,
-                            pages: this_pages,
-                            effective_request_lsn: this_lsn,
-                        },
-                    ) if async {
-                        assert_eq!(this_pages.len(), 1);
-                        if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
-                            assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
-                            return false;
-                        }
-                        if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
-                            != (this_shard.tenant_shard_id, this_shard.timeline_id)
-                        {
-                            // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                            // But the current logig for keeping responses in order does not support that.
-                            return false;
-                        }
-                        // the vectored get currently only supports a single LSN, so, bounce as soon
-                        // as the effective request_lsn changes
-                        return *accum_lsn == this_lsn;
-                    }
-                    .await =>
-                    {
-                        // ok to batch
-                        accum_pages.extend(this_pages);
-                    }
-                    (Some(_), this_msg) => {
-                        // by default, don't continue batching
-                        break Some(this_msg);
-                    }
+        loop {
+            // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData)
+            let msg = tokio::select! {
+                biased;
+                _ = self.cancel.cancelled() => {
+                    return Err(QueryError::Shutdown)
                }
-
-                // debounce impl piece
-                let started_at = debounce.get_or_insert_with(Instant::now);
-                if started_at.elapsed() > *BOUNCE_TIMEOUT {
-                    break None;
+                msg = pgb.read_message() => { msg }
+            };
+            let copy_data_bytes = match msg? {
+                Some(FeMessage::CopyData(bytes)) => bytes,
+                Some(FeMessage::Terminate) => break,
+                Some(m) => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
                }
+                None => break, // client disconnected
            };

+            trace!("query: {copy_data_bytes:?}");
+            fail::fail_point!("ps::handle-pagerequest-message");
+
+            // parse request
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+
            // invoke handler function
-            let (handler_results, span): (
-                smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]>,
-                _,
-            ) = match batched.take().expect("loop above ensures this") {
-                DebouncedFeMessage::Exists(req) => {
+            let (handler_result, span) = match neon_fe_msg {
+                PagestreamFeMessage::Exists(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await
-                        ],
+                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::Nblocks(req) => {
+                PagestreamFeMessage::Nblocks(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
+                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::GetPage {
-                    span,
-                    shard,
-                    effective_request_lsn,
-                    pages,
-                } => {
-                    CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM.observe(pages.len() as f64);
-                    span.record("batch_size", pages.len() as u64);
+                PagestreamFeMessage::GetPage(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
-                        {
-                            let npages = pages.len();
-                            let res = self
-                                .handle_get_page_at_lsn_request_batched(
-                                    &shard,
-                                    effective_request_lsn,
-                                    pages,
-                                    &ctx,
-                                )
-                                .instrument(span.clone())
-                                .await;
-                            assert_eq!(res.len(), npages);
-                            res
-                        },
+                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::DbSize(req) => {
+                PagestreamFeMessage::DbSize(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
-                                .instrument(span.clone())
-                                .await
-                        ],
+                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::GetSlruSegment(req) => {
+                PagestreamFeMessage::GetSlruSegment(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
-                        smallvec::smallvec![
-                            self.handle_get_slru_segment_request(
-                                tenant_id,
-                                timeline_id,
-                                &req,
-                                &ctx
-                            )
+                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
-                            .await
-                        ],
+                            .await,
                        span,
                    )
                }
-                DebouncedFeMessage::RespondError(span, e) => {
-                    // We've already decided to respond with an error, so we don't need to
-                    // call the handler.
-                    (smallvec::smallvec![Err(e)], span)
-                }
            };

            // Map handler result to protocol behavior.
            // Some handler errors cause exit from pagestream protocol.
            // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-            for handler_result in handler_results {
-                let response_msg = match handler_result {
-                    Err(e) => match &e {
-                        PageStreamError::Shutdown => {
-                            // If we fail to fulfil a request during shutdown, which may be _because_ of
-                            // shutdown, then do not send the error to the client.  Instead just drop the
-                            // connection.
-                            span.in_scope(|| info!("dropping connection due to shutdown"));
-                            return Err(QueryError::Shutdown);
-                        }
-                        PageStreamError::Reconnect(reason) => {
-                            span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                            return Err(QueryError::Reconnect);
-                        }
-                        PageStreamError::Read(_)
-                        | PageStreamError::LsnTimeout(_)
-                        | PageStreamError::NotFound(_)
-                        | PageStreamError::BadRequest(_) => {
-                            // print the all details to the log with {:#}, but for the client the
-                            // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                            // here includes cancellation which is not an error.
-                            let full = utils::error::report_compact_sources(&e);
-                            span.in_scope(|| {
-                                error!("error reading relation or page version: {full:#}")
-                            });
-                            PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                message: e.to_string(),
-                            })
-                        }
-                    },
-                    Ok(response_msg) => response_msg,
-                };
+            let response_msg = match handler_result {
+                Err(e) => match &e {
+                    PageStreamError::Shutdown => {
+                        // If we fail to fulfil a request during shutdown, which may be _because_ of
+                        // shutdown, then do not send the error to the client.  Instead just drop the
+                        // connection.
+                        span.in_scope(|| info!("dropping connection due to shutdown"));
+                        return Err(QueryError::Shutdown);
+                    }
+                    PageStreamError::Reconnect(reason) => {
+                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                        return Err(QueryError::Reconnect);
+                    }
+                    PageStreamError::Read(_)
+                    | PageStreamError::LsnTimeout(_)
+                    | PageStreamError::NotFound(_)
+                    | PageStreamError::BadRequest(_) => {
+                        // print the all details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    }
+                },
+                Ok(response_msg) => response_msg,
+            };

-                // marshal & transmit response message
-                pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-            }
+            // marshal & transmit response message
+            pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
            tokio::select! {
                biased;
                _ = self.cancel.cancelled() => {
@@ -899,9 +706,6 @@ impl PageServerHandler {
                    res?;
                }
            }
-
-            assert!(batched.is_none(), "we take() earlier");
-            batched = next_batched;
        }
        Ok(())
    }
@@ -1145,30 +949,60 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all)]
-    async fn handle_get_page_at_lsn_request_batched(
+    #[instrument(skip_all, fields(shard_id))]
+    async fn handle_get_page_at_lsn_request(
        &mut self,
-        timeline: &Timeline,
-        effective_lsn: Lsn,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let _timer = timeline.query_metrics.start_timer_many(
-            metrics::SmgrQueryType::GetPageAtLsn,
-            pages.len(),
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = match self
+            .timeline_handles
+            .get(
+                tenant_id,
+                timeline_id,
+                ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)),
+            )
+            .await
+        {
+            Ok(tl) => tl,
+            Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                // We already know this tenant exists in general, because we resolved it at
+                // start of connection.  Getting a NotFound here indicates that the shard containing
+                // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                // mapping is out of date.
+                //
+                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                // and talk to a different pageserver.
+                return Err(PageStreamError::Reconnect(
+                    "getpage@lsn request routed to wrong shard".into(),
+                ));
+            }
+            Err(e) => return Err(e.into()),
+        };
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
+
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(
+            &timeline,
+            req.request_lsn,
+            req.not_modified_since,
+            &latest_gc_cutoff_lsn,
            ctx,
-        );
+        )
+        .await?;

-        let pages = timeline
-            .get_rel_page_at_lsn_batched(pages, Version::Lsn(effective_lsn), ctx)
-            .await;
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
+            .await?;

-        smallvec::SmallVec::from_iter(pages.into_iter().map(|page| {
-            page.map(|page| {
-                PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
-            })
-            .map_err(PageStreamError::Read)
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
        }))
    }

@@ -1665,10 +1499,3 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
 }
-
-struct WaitedForLsn(Lsn);
-impl From<WaitedForLsn> for Lsn {
-    fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
-        lsn
-    }
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,17 +9,12 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::span::{
-    debug_assert_current_span_has_tenant_and_timeline_id,
-    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
-};
-use crate::tenant::timeline::GetVectoredError;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -33,7 +28,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
+use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
@@ -196,184 +191,26 @@ impl Timeline {
        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
-        let pages = smallvec::smallvec![(tag, blknum)];
-        let res = self.get_rel_page_at_lsn_batched(pages, version, ctx).await;
-        assert_eq!(res.len(), 1);
-        res.into_iter().next().unwrap()
-    }
-
-    /// Like [`get_rel_page_at_lsn`], but returns a batch of pages.
-    pub(crate) async fn get_rel_page_at_lsn_batched(
-        &self,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
-        version: Version<'_>,
-        ctx: &RequestContext,
-    ) -> smallvec::SmallVec<[Result<Bytes, PageReconstructError>; 1]> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let request_lsn = match version {
-            Version::Lsn(lsn) => lsn,
-            Version::Modified(_) => panic!("unsupported"),
-        };
-        enum KeyState {
-            NeedsVectoredGet,
-            Done(Result<Bytes, PageReconstructError>),
+        if tag.relnode == 0 {
+            return Err(PageReconstructError::Other(
+                RelationError::InvalidRelnode.into(),
+            ));
        }
-        let mut key_states = BTreeMap::new();
-        let mut vectored_gets: smallvec::SmallVec<[_; 1]> =
-            smallvec::SmallVec::with_capacity(pages.len());
-        for (response_order, (tag, blknum)) in pages.into_iter().enumerate() {
-            let key = rel_block_to_key(tag, blknum);
-            use std::collections::btree_map::Entry;
-            let key_state_slot = match key_states.entry((key, response_order)) {
-                Entry::Occupied(_entry) => unreachable!(
-                    "enumerate makes keys unique, even if batch contains same key twice"
-                ),
-                Entry::Vacant(entry) => entry,
-            };

-            if tag.relnode == 0 {
-                key_state_slot.insert(KeyState::Done(Err(PageReconstructError::Other(
-                    RelationError::InvalidRelnode.into(),
-                ))));
-                continue;
-            }
-
-            let nblocks = match self.get_rel_size(tag, version, ctx).await {
-                Ok(nblocks) => nblocks,
-                Err(err) => {
-                    key_state_slot.insert(KeyState::Done(Err(err)));
-                    continue;
-                }
-            };
-            if blknum >= nblocks {
-                debug!(
-                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag,
-                    blknum,
-                    version.get_lsn(),
-                    nblocks
-                );
-                key_state_slot.insert(KeyState::Done(Ok(ZERO_PAGE.clone())));
-                continue;
-            }
-
-            vectored_gets.push(key);
-            key_state_slot.insert(KeyState::NeedsVectoredGet);
+        let nblocks = self.get_rel_size(tag, version, ctx).await?;
+        if blknum >= nblocks {
+            debug!(
+                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
+                tag,
+                blknum,
+                version.get_lsn(),
+                nblocks
+            );
+            return Ok(ZERO_PAGE.clone());
        }
-        // turn vectored_gets into a keyspace
-        let keyspace = {
-            // add_key reuqires monotonicity
-            vectored_gets.sort_unstable();
-            let mut acc = KeySpaceAccum::new();
-            for key in vectored_gets
-                .into_iter()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(key);
-            }
-            acc.to_keyspace()
-        };

-        match self.get_vectored(keyspace, request_lsn, ctx).await {
-            Ok(results) => {
-                for (key, res) in results {
-                    if let Err(err) = &res {
-                        warn!(%key, ?err, "a key inside get_vectored failed with a per-key error");
-                    }
-                    let mut interests = key_states.range_mut((key, 0)..(key.next(), 0)).peekable();
-                    let first_interest = interests.next().unwrap();
-                    let next_interest = interests.peek().is_some();
-                    if !next_interest {
-                        match first_interest.1 {
-                            KeyState::NeedsVectoredGet => {
-                                *first_interest.1 = KeyState::Done(res);
-                            }
-                            KeyState::Done(_) => unreachable!(),
-                        }
-                        continue;
-                    } else {
-                        for ((_, _), state) in [first_interest].into_iter().chain(interests) {
-                            match state {
-                                KeyState::NeedsVectoredGet => {
-                                    *state = KeyState::Done(match &res {
-                                        Ok(buf) => Ok(buf.clone()),
-                                        // this `match` is working around the fact that we cannot Clone the PageReconstructError
-                                        Err(err) => Err(match err {
-                                            PageReconstructError::Cancelled => {
-                                                PageReconstructError::Cancelled
-                                            }
-
-                                            x @ PageReconstructError::Other(_) |
-                                            x @ PageReconstructError::AncestorLsnTimeout(_) |
-                                            x @ PageReconstructError::WalRedo(_) |
-                                            x @ PageReconstructError::MissingKey(_) => {
-                                                PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}"))
-                                            },
-                                        }),
-                                    });
-                                }
-                                KeyState::Done(_) => unreachable!(),
-                            }
-                        }
-                    }
-                }
-            }
-            Err(err) => {
-                warn!(?err, "get_vectored failed with a global error, mapping that error to per-key failure");
-                // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
-                for ((_, _), state) in key_states.iter_mut() {
-                    // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
-                    // but without taking ownership of the GetVectoredError
-                    match &err {
-                        GetVectoredError::Cancelled => {
-                            *state = KeyState::Done(Err(PageReconstructError::Cancelled));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::MissingKey(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::GetReadyAncestorError(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))));
-                        }
-                        // TODO: restructure get_vectored API to make this error per-key
-                        GetVectoredError::Other(err) => {
-                            *state = KeyState::Done(Err(PageReconstructError::Other(
-                                anyhow::anyhow!("whole vectored get request failed: {err:?}"),
-                            )));
-                        }
-                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::InvalidLsn(e) => {
-                            *state =
-                                KeyState::Done(Err(anyhow::anyhow!("invalid LSN: {e:?}").into()));
-                        }
-                        // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
-                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::Oversized(err) => {
-                            *state = KeyState::Done(Err(anyhow::anyhow!(
-                                "batching oversized: {err:?}"
-                            )
-                            .into()));
-                        }
-                    }
-                }
-            }
-        };
-
-        // get the results into the order in which they were requested
-        let mut return_order: smallvec::SmallVec<[_; Timeline::MAX_GET_VECTORED_KEYS as usize]> =
-            smallvec::SmallVec::with_capacity(key_states.len());
-        return_order.extend(key_states.keys().map(|(key, idx)| (*key, *idx)));
-        return_order.sort_unstable_by_key(|(_, idx)| *idx);
-        let mut res = smallvec::SmallVec::with_capacity(key_states.len());
-        res.extend(return_order.into_iter().map(|key_states_key| {
-            match key_states.remove(&key_states_key).unwrap() {
-                KeyState::Done(res) => res,
-                KeyState::NeedsVectoredGet => unreachable!(),
-            }
-        }));
-        res
+        let key = rel_block_to_key(tag, blknum);
+        version.get(self, key, ctx).await
    }

    // Get size of a database in blocks
@@ -1184,10 +1021,9 @@ impl Timeline {
 }

 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository.
-///
-/// It is created by the 'begin_record' function. It is called for each WAL
-/// record, so that all the modifications by a one WAL record appear atomic.
+/// updates to the repository. It is created by the 'begin_record'
+/// function. It is called for each WAL record, so that all the modifications
+/// by a one WAL record appear atomic.
 pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
@@ -2212,7 +2048,6 @@ impl<'a> DatadirModification<'a> {

 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
-///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -73,21 +73,6 @@ impl ValueBytes {

        Ok(raw[8] == 1)
    }
-
-    pub(crate) fn is_image(raw: &[u8]) -> Result<bool, InvalidInput> {
-        if raw.len() < 12 {
-            return Err(InvalidInput::TooShortValue);
-        }
-
-        let value_discriminator = &raw[0..4];
-
-        if value_discriminator == [0, 0, 0, 0] {
-            // Value::Image always initializes
-            return Ok(true);
-        }
-
-        Ok(false)
-    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,9 +1,8 @@
-//! Timeline repository implementation that keeps old data in layer files, and
-//! the recent changes in ephemeral files.
 //!
-//! See tenant/*_layer.rs files. The functions here are responsible for locating
-//! the correct layer for the get/put call, walking back the timeline branching
-//! history as needed.
+//! Timeline repository implementation that keeps old data in files on disk, and
+//! the recent changes in memory. See tenant/*_layer.rs files.
+//! The functions here are responsible for locating the correct layer for the
+//! get/put call, walking back the timeline branching history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,8 +1,7 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata.
-//!
-//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
-//! their timelines, this struct and its original serialization format is still needed because
-//! they were written a long time ago.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
+//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
+//! this struct and it's original serialization format is still needed because they were written a
+//! long time ago.
 //!
 //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
 //! versioning.
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -282,10 +282,9 @@ impl BackgroundPurges {
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

-/// Responsible for storing and mutating the collection of all tenants
-/// that this pageserver has state for.
-///
-/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
+/// The TenantManager is responsible for storing and mutating the collection of all tenants
+/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
+/// lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -2347,9 +2346,8 @@ pub enum TenantMapError {
    ShuttingDown,
 }

-/// Guards a particular tenant_id's content in the TenantsMap.
-///
-/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.  While this
+/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2184,8 +2184,6 @@ pub fn remote_timeline_path(
    remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }

-/// Obtains the path of the given Layer in the remote
-///
 /// Note that the shard component of a remote layer path is _not_ always the same
 /// as in the TenantShardId of the caller: tenants may reference layers from a different
 /// ShardIndex.  Use the ShardIndex from the layer's metadata.
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -1,5 +1,4 @@
 //! In-memory index to track the tenant files on the remote storage.
-//!
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,17 +8,15 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-use tokio::sync::{self};
-use utils::bin_ser::BeSer;
 pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::{Value, ValueBytes};
+use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
-use pageserver_api::key::{Key, DBDIR_KEY};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
-use std::cmp::Ordering;
+use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
@@ -81,18 +79,12 @@ pub(crate) enum ValueReconstructSituation {
 }

 /// Reconstruct data accumulated for a single key during a vectored get
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct VectoredValueReconstructState {
-    pub(crate) records: Vec<(
-        Lsn,
-        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
-    )>,
-    pub(crate) img: Option<(
-        Lsn,
-        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
-    )>,
+    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
+    pub(crate) img: Option<(Lsn, Bytes)>,

-    pub(crate) situation: ValueReconstructSituation,
+    situation: ValueReconstructSituation,
 }

 impl VectoredValueReconstructState {
@@ -101,57 +93,16 @@ impl VectoredValueReconstructState {
    }
 }

-pub(crate) async fn convert(
-    _key: Key,
-    from: VectoredValueReconstructState,
-) -> Result<ValueReconstructState, PageReconstructError> {
-    let mut to = ValueReconstructState::default();
+impl From<VectoredValueReconstructState> for ValueReconstructState {
+    fn from(mut state: VectoredValueReconstructState) -> Self {
+        // walredo expects the records to be descending in terms of Lsn
+        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));

-    for (lsn, fut) in from.records {
-        match fut.await {
-            Ok(res) => match res {
-                Ok(bytes) => {
-                    let value = Value::des(&bytes)
-                        .map_err(|err| PageReconstructError::Other(err.into()))?;
-
-                    match value {
-                        Value::WalRecord(rec) => {
-                            to.records.push((lsn, rec));
-                        },
-                        Value::Image(img) => {
-                            assert!(to.img.is_none());
-                            to.img = Some((lsn, img));
-                        }
-                    }
-                }
-                Err(err) => {
-                    return Err(PageReconstructError::Other(err.into()));
-                }
-            },
-            Err(err) => {
-                return Err(PageReconstructError::Other(err.into()));
-            }
+        ValueReconstructState {
+            records: state.records,
+            img: state.img,
        }
    }
-
-    if to.img.is_none() {
-        let (lsn, fut) = from.img.expect("Need an image");
-        match fut.await {
-            Ok(res) => match res {
-                Ok(bytes) => {
-                    to.img = Some((lsn, bytes));
-                }
-                Err(err) => {
-                    return Err(PageReconstructError::Other(err.into()));
-                }
-            },
-            Err(err) => {
-                return Err(PageReconstructError::Other(err.into()));
-            }
-        }
-    }
-
-    Ok(to)
 }

 /// Bag of data accumulated during a vectored get..
@@ -249,8 +200,7 @@ impl ValuesReconstructState {
        &mut self,
        key: &Key,
        lsn: Lsn,
-        completes: bool,
-        value: sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
+        value: Value,
    ) -> ValueReconstructSituation {
        let state = self
            .keys
@@ -258,14 +208,31 @@ impl ValuesReconstructState {
            .or_insert(Ok(VectoredValueReconstructState::default()));

        if let Ok(state) = state {
-            match state.situation {
+            let key_done = match state.situation {
                ValueReconstructSituation::Complete => unreachable!(),
-                ValueReconstructSituation::Continue => {
-                    state.records.push((lsn, value));
-                }
-            }
+                ValueReconstructSituation::Continue => match value {
+                    Value::Image(img) => {
+                        state.img = Some((lsn, img));
+                        true
+                    }
+                    Value::WalRecord(rec) => {
+                        debug_assert!(
+                            Some(lsn) > state.get_cached_lsn(),
+                            "Attempt to collect a record below cached LSN for walredo: {} < {}",
+                            lsn,
+                            state
+                                .get_cached_lsn()
+                                .expect("Assertion can only fire if a cached lsn is present")
+                        );

-            if completes && state.situation == ValueReconstructSituation::Continue {
+                        let will_init = rec.will_init();
+                        state.records.push((lsn, rec));
+                        will_init
+                    }
+                },
+            };
+
+            if key_done && state.situation == ValueReconstructSituation::Continue {
                state.situation = ValueReconstructSituation::Complete;
                self.keys_done.add_key(*key);
            }
@@ -467,11 +434,10 @@ impl ReadableLayer {
    }
 }

-/// Layers contain a hint indicating whether they are likely to be used for reads.
-///
-/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
-/// when changing the visibility of layers (for example when creating a branch that makes some previously
-/// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
+/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
+/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
+/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
+/// be used for cache management but not for correctness-critical checks.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -42,12 +42,13 @@ use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadCoalesceMode, VectoredReadPlanner,
 };
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{bail, ensure, Context, Result};
-use bytes::{Bytes, BytesMut};
+use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -57,14 +58,14 @@ use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, VecDeque};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::sync::{self, OnceCell};
+use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
 use tracing::*;

@@ -135,11 +136,10 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

-/// Struct representing reference to BLOB in layers.
-///
-/// Reference contains BLOB offset, and for WAL records it also contains
-/// `will_init` flag. The flag helps to determine the range of records
-/// that needs to be applied, without reading/deserializing records themselves.
+/// Struct representing reference to BLOB in layers. Reference contains BLOB
+/// offset, and for WAL records it also contains `will_init` flag. The flag
+/// helps to determine the range of records that needs to be applied, without
+/// reading/deserializing records themselves.
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -223,7 +223,7 @@ pub struct DeltaLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    file: Arc<VirtualFile>,
+    file: VirtualFile,
    file_id: FileId,

    layer_key_range: Range<Key>,
@@ -787,11 +787,9 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = Arc::new(
-            VirtualFile::open(path, ctx)
-                .await
-                .context("open layer file")?,
-        );
+        let file = VirtualFile::open(path, ctx)
+            .await
+            .context("open layer file")?;

        let file_id = page_cache::next_file_id();

@@ -981,59 +979,77 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) {
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut ignore_key_with_err = None;
+
        let max_vectored_read_bytes = self
            .max_vectored_read_bytes
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
+        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
        // track when a key is done.
        for read in reads.into_iter().rev() {
-            let mut senders: HashMap<
-                (Key, Lsn),
-                sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
-            > = Default::default();
-            for (_, blob_meta) in read.blobs_at.as_slice() {
-                let (tx, rx) = sync::oneshot::channel();
-                senders.insert((blob_meta.key, blob_meta.lsn), tx);
-                reconstruct_state.update_key(
-                    &blob_meta.key,
-                    blob_meta.lsn,
-                    blob_meta.will_init,
-                    rx,
-                );
+            let res = vectored_blob_reader
+                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
+                .await;
+
+            let blobs_buf = match res {
+                Ok(blobs_buf) => blobs_buf,
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::Other(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+
+                    // We have "lost" the buffer since the lower level IO api
+                    // doesn't return the buffer on error. Allocate a new one.
+                    buf = Some(BytesMut::with_capacity(buf_size));
+
+                    continue;
+                }
+            };
+
+            for meta in blobs_buf.blobs.iter().rev() {
+                if Some(meta.meta.key) == ignore_key_with_err {
+                    continue;
+                }
+
+                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
+                let value = match value {
+                    Ok(v) => v,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::Other(anyhow!(e).context(format!(
+                                "Failed to deserialize blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
+                // state, no further updates shall be made to it. The call below will
+                // panic if the invariant is violated.
+                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
            }

-            let read_from = self.file.clone();
-            let read_ctx = ctx.attached_child();
-            tokio::task::spawn(async move {
-                let vectored_blob_reader = VectoredBlobReader::new(&read_from);
-                let buf = BytesMut::with_capacity(buf_size);
-
-                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
-                match res {
-                    Ok(blobs_buf) => {
-                        for meta in blobs_buf.blobs.iter().rev() {
-                            let buf = &blobs_buf.buf[meta.start..meta.end];
-                            let sender = senders
-                                .remove(&(meta.meta.key, meta.meta.lsn))
-                                .expect("sender must exist");
-                            let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
-                        }
-
-                        assert!(senders.is_empty());
-                    }
-                    Err(err) => {
-                        for (_, sender) in senders {
-                            let _ = sender
-                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
-                        }
-                    }
-                }
-            });
+            buf = Some(blobs_buf.buf);
        }
    }

@@ -1173,14 +1189,7 @@ impl DeltaLayerInner {
            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
                let end_offset = offset;

-                Some((
-                    BlobMeta {
-                        key,
-                        lsn,
-                        will_init: false,
-                    },
-                    start_offset..end_offset,
-                ))
+                Some((BlobMeta { key, lsn }, start_offset..end_offset))
            } else {
                None
            };
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1,9 +1,7 @@
 //! An ImageLayer represents an image or a snapshot of a key-range at
-//! one particular LSN.
-//!
-//! It contains an image of all key-value pairs in its key-range. Any key
-//! that falls into the image layer's range but does not exist in the layer,
-//! does not exist.
+//! one particular LSN. It contains an image of all key-value pairs
+//! in its key-range. Any key that falls into the image layer's range
+//! but does not exist in the layer, does not exist.
 //!
 //! An image layer is stored in a file on disk. The file is stored in
 //! timelines/<timeline_id> directory.  Currently, there are no
@@ -21,7 +19,7 @@
 //!
 //! Every image layer file consists of three parts: "summary",
 //! "index", and "values".  The summary is a fixed size header at the
-//! beginningof the file, and it contains basic information about the
+//! beginning of the file, and it contains basic information about the
 //! layer, and offsets to the other parts. The "index" is a B-tree,
 //! mapping from Key to an offset in the "values" part.  The
 //! actual page images are stored in the "values" part.
@@ -38,11 +36,11 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::Timeline;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
@@ -52,14 +50,13 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, VecDeque};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::sync::oneshot;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -164,7 +161,7 @@ pub struct ImageLayerInner {
    key_range: Range<Key>,
    lsn: Lsn,

-    file: Arc<VirtualFile>,
+    file: VirtualFile,
    file_id: FileId,

    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -391,11 +388,9 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = Arc::new(
-            VirtualFile::open(path, ctx)
-                .await
-                .context("open layer file")?,
-        );
+        let file = VirtualFile::open(path, ctx)
+            .await
+            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
        let summary_blk = block_reader
@@ -582,16 +577,8 @@ impl ImageLayerInner {
            .0
            .into();

+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
-            let mut senders: HashMap<(Key, Lsn), oneshot::Sender<Result<Bytes, std::io::Error>>> =
-                Default::default();
-            for (_, blob_meta) in read.blobs_at.as_slice() {
-                let (tx, rx) = oneshot::channel();
-                senders.insert((blob_meta.key, blob_meta.lsn), tx);
-
-                reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true, rx);
-            }
-
            let buf_size = read.size();

            if buf_size > max_vectored_read_bytes {
@@ -610,33 +597,36 @@ impl ImageLayerInner {
                );
            }

-            let read_from = self.file.clone();
-            let read_ctx = ctx.attached_child();
-            tokio::task::spawn(async move {
-                let buf = BytesMut::with_capacity(buf_size);
-                let vectored_blob_reader = VectoredBlobReader::new(&*read_from);
-                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
+            let buf = BytesMut::with_capacity(buf_size);
+            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

-                match res {
-                    Ok(blobs_buf) => {
-                        for meta in blobs_buf.blobs.iter().rev() {
-                            let buf = &blobs_buf.buf[meta.start..meta.end];
-                            let sender = senders
-                                .remove(&(meta.meta.key, meta.meta.lsn))
-                                .expect("sender must exist");
-                            let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
-                        }
+            match res {
+                Ok(blobs_buf) => {
+                    let frozen_buf = blobs_buf.buf.freeze();

-                        assert!(senders.is_empty());
-                    }
-                    Err(err) => {
-                        for (_, sender) in senders {
-                            let _ = sender
-                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
-                        }
+                    for meta in blobs_buf.blobs.iter() {
+                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        reconstruct_state.update_key(
+                            &meta.meta.key,
+                            self.lsn,
+                            Value::Image(img_buf),
+                        );
                    }
                }
-            });
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+                }
+            };
        }
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,9 +10,10 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -34,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::RwLock;

-use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+};

 pub(crate) mod vectored_dio_read;

@@ -84,7 +87,7 @@ pub struct InMemoryLayerInner {
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
-    file: Arc<tokio::sync::RwLock<EphemeralFile>>,
+    file: EphemeralFile,

    resource_units: GlobalResourceUnits,
 }
@@ -378,11 +381,7 @@ impl InMemoryLayer {
    }

    pub(crate) fn try_len(&self) -> Option<u64> {
-        self.inner
-            .try_read()
-            .map(|i| i.file.try_read().map(|i| i.len()).ok())
-            .ok()
-            .flatten()
+        self.inner.try_read().map(|i| i.file.len()).ok()
    }

    pub(crate) fn assert_writable(&self) {
@@ -433,10 +432,6 @@ impl InMemoryLayer {
            read: vectored_dio_read::LogicalRead<Vec<u8>>,
        }
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
-        let mut senders: HashMap<
-            (Key, Lsn),
-            tokio::sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
-        > = Default::default();

        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
@@ -464,11 +459,6 @@ impl InMemoryLayer {
                            Vec::with_capacity(len as usize),
                        ),
                    });
-
-                    let (tx, rx) = tokio::sync::oneshot::channel();
-                    senders.insert((key, *entry_lsn), tx);
-                    reconstruct_state.update_key(&key, *entry_lsn, will_init, rx);
-
                    if will_init {
                        break;
                    }
@@ -476,42 +466,46 @@ impl InMemoryLayer {
            }
        }

-        let read_from = inner.file.clone();
-        let read_ctx = ctx.attached_child();
-        tokio::task::spawn(async move {
-            let locked = read_from.read().await;
-            let f = vectored_dio_read::execute(
-                &*locked,
-                reads
-                    .iter()
-                    .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
-                &read_ctx,
-            );
-            send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
-                .await;
+        // Execute the reads.

-            for (key, value_reads) in reads {
-                for ValueRead { entry_lsn, read } in value_reads {
-                    let sender = senders
-                        .remove(&(key, entry_lsn))
-                        .expect("sender must exist");
-                    match read.into_result().expect("we run execute() above") {
-                        Err(e) => {
-                            let sender = senders
-                                .remove(&(key, entry_lsn))
-                                .expect("sender must exist");
-                            let _ = sender
-                                .send(Err(std::io::Error::new(e.kind(), "dio vec read failed")));
+        let f = vectored_dio_read::execute(
+            &inner.file,
+            reads
+                .iter()
+                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+            &ctx,
+        );
+        send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
+            .await;
+
+        // Process results into the reconstruct state
+        'next_key: for (key, value_reads) in reads {
+            for ValueRead { entry_lsn, read } in value_reads {
+                match read.into_result().expect("we run execute() above") {
+                    Err(e) => {
+                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                        continue 'next_key;
+                    }
+                    Ok(value_buf) => {
+                        let value = Value::des(&value_buf);
+                        if let Err(e) = value {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            continue 'next_key;
                        }
-                        Ok(value_buf) => {
-                            let _ = sender.send(Ok(value_buf.into()));
+
+                        let key_situation =
+                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
+                        if key_situation == ValueReconstructSituation::Complete {
+                            // TODO: metric to see if we fetched more values than necessary
+                            continue 'next_key;
                        }
+
+                        // process the next value in the next iteration of the loop
                    }
                }
            }
-
-            assert!(senders.is_empty());
-        });
+        }

        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);

@@ -606,8 +600,7 @@ impl InMemoryLayer {
    /// Get layer size.
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        let locked = inner.file.try_read().expect("no contention");
-        Ok(locked.len())
+        Ok(inner.file.len())
    }

    /// Create a new, empty, in-memory layer
@@ -621,10 +614,9 @@ impl InMemoryLayer {
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file = Arc::new(tokio::sync::RwLock::new(
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?,
-        ));
-        let key = InMemoryLayerFileId(file.read().await.page_cache_file_id());
+        let file =
+            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
@@ -656,7 +648,7 @@ impl InMemoryLayer {
        let mut inner = self.inner.write().await;
        self.assert_writable();

-        let base_offset = inner.file.read().await.len();
+        let base_offset = inner.file.len();

        let SerializedBatch {
            raw,
@@ -680,13 +672,8 @@ impl InMemoryLayer {
        }

        // Write the batch to the file
-        // FIXME: can't borrow arc
-        let new_size = {
-            let mut locked = inner.file.write().await;
-            locked.write_raw(&raw, ctx).await?;
-            locked.len()
-        };
-
+        inner.file.write_raw(&raw, ctx).await?;
+        let new_size = inner.file.len();
        let expected_new_len = base_offset
            .checked_add(raw.len().into_u64())
            // write_raw would error if we were to overflow u64.
@@ -726,7 +713,7 @@ impl InMemoryLayer {

    pub(crate) async fn tick(&self) -> Option<u64> {
        let mut inner = self.inner.write().await;
-        let size = inner.file.read().await.len();
+        let size = inner.file.len();
        inner.resource_units.publish_size(size)
    }

@@ -822,7 +809,7 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents: Vec<u8> = inner.file.read().await.load_to_vec(ctx).await?;
+                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;

                let file_contents = Bytes::from(file_contents);

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -107,8 +107,6 @@ async fn smoke_test() {
            .expect("tenant harness writes the control file")
    };

-    let img_before = (img_before.0, img_before.1.await.unwrap().unwrap());
-    let img_after = (img_after.0, img_after.1.await.unwrap().unwrap());
    assert_eq!(img_before, img_after);

    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -12,10 +12,8 @@ use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use utils::id::TenantId;

-/// A unique identifier of a persistent layer.
-///
-/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
-/// This struct contains all necessary information to find the image / delta layer. It also provides
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -217,9 +217,8 @@ impl fmt::Display for ImageLayerName {
    }
 }

-/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
-///
-/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
+/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
 /// over time (e.g. across shard splits or compression). The physical filenames of layers in local
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -226,11 +226,9 @@ impl<'a> IteratorWrapper<'a> {
    }
 }

-/// A merge iterator over delta/image layer iterators.
-///
-/// When duplicated records are found, the iterator will not perform any
-/// deduplication, and the caller should handle these situation. By saying
-/// duplicated records, there are many possibilities:
+/// A merge iterator over delta/image layer iterators. When duplicated records are
+/// found, the iterator will not perform any deduplication, and the caller should handle
+/// these situation. By saying duplicated records, there are many possibilities:
 ///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -34,10 +34,9 @@ impl SplitWriterResult {
    }
 }

-/// An image writer that takes images and produces multiple image layers.
-///
-/// The interface does not guarantee atomicity (i.e., if the image layer generation
-/// fails, there might be leftover files to be cleaned up)
+/// An image writer that takes images and produces multiple image layers. The interface does not
+/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
+/// to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
@@ -194,10 +193,9 @@ impl SplitImageLayerWriter {
    }
 }

-/// A delta writer that takes key-lsn-values and produces multiple delta layers.
-///
-/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
-/// there might be leftover files to be cleaned up).
+/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
+/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
+/// to be cleaned up).
 ///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,7 +18,6 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use futures::{stream::FuturesUnordered, StreamExt};
 use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
@@ -69,9 +68,7 @@ use crate::{
    tenant::{
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
-        storage_layer::{
-            convert, inmemory_layer::IndexEntry, PersistentLayerDesc, ValueReconstructSituation,
-        },
+        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
    },
    walredo,
 };
@@ -1132,38 +1129,22 @@ impl Timeline {
        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
            .for_get_kind(get_kind)
            .start_timer();
+        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
        let layers_visited = reconstruct_state.get_layers_visited();

-        let futs = FuturesUnordered::new();
        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
-            futs.push({
-                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
-                async move {
-                    let state = res.expect("Read path is infallible");
-                    assert!(matches!(
-                        state.situation,
-                        ValueReconstructSituation::Complete
-                    ));
-
-                    let converted = match convert(key, state).await {
-                        Ok(ok) => ok,
-                        Err(err) => {
-                            return (key, Err(err));
-                        }
-                    };
-
-                    (
-                        key,
-                        walredo_self.reconstruct_value(key, lsn, converted).await,
-                    )
+            match res {
+                Err(err) => {
+                    results.insert(key, Err(err));
                }
-            });
+                Ok(state) => {
+                    let state = ValueReconstructState::from(state);
+
+                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
+                    results.insert(key, reconstruct_res);
+                }
+            }
        }
-
-        let results = futs
-            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
-            .await;
-
        reconstruct_timer.stop_and_record();

        // For aux file keys (v1 or v2) the vectored read path does not return an error
@@ -5515,30 +5496,30 @@ impl Timeline {
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
        self: &Arc<Timeline>,
-        _lsn: Lsn,
-        _ctx: &RequestContext,
+        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
-        // let mut all_data = Vec::new();
-        // let guard = self.layers.read().await;
-        // for layer in guard.layer_map()?.iter_historic_layers() {
-        //     if !layer.is_delta() && layer.image_layer_lsn() == lsn {
-        //         let layer = guard.get_from_desc(&layer);
-        //         let mut reconstruct_data = ValuesReconstructState::default();
-        //         layer
-        //             .get_values_reconstruct_data(
-        //                 KeySpace::single(Key::MIN..Key::MAX),
-        //                 lsn..Lsn(lsn.0 + 1),
-        //                 &mut reconstruct_data,
-        //                 ctx,
-        //             )
-        //             .await?;
-        //         for (k, v) in reconstruct_data.keys {
-        //             all_data.push((k, v?.img.unwrap().1));
-        //         }
-        //     }
-        // }
-        // all_data.sort();
-        Ok(Vec::new())
+        let mut all_data = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map()?.iter_historic_layers() {
+            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
+                let layer = guard.get_from_desc(&layer);
+                let mut reconstruct_data = ValuesReconstructState::default();
+                layer
+                    .get_values_reconstruct_data(
+                        KeySpace::single(Key::MIN..Key::MAX),
+                        lsn..Lsn(lsn.0 + 1),
+                        &mut reconstruct_data,
+                        ctx,
+                    )
+                    .await?;
+                for (k, v) in reconstruct_data.keys {
+                    all_data.push((k, v?.img.unwrap().1));
+                }
+            }
+        }
+        all_data.sort();
+        Ok(all_data)
    }

    /// Get all historic layer descriptors in the layer map
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -33,7 +33,6 @@ use crate::virtual_file::{self, VirtualFile};
 pub struct BlobMeta {
    pub key: Key,
    pub lsn: Lsn,
-    pub will_init: bool,
 }

 /// Blob offsets into [`VectoredBlobsBuf::buf`]
@@ -356,8 +355,7 @@ pub enum BlobFlag {
 /// * Iterate over the collected blobs and coalesce them into reads at the end
 pub struct VectoredReadPlanner {
    // Track all the blob offsets. Start offsets must be ordered.
-    // Note: last bool is will_init
-    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
    prev: Option<(Key, Lsn, u64, BlobFlag)>,

@@ -422,12 +420,12 @@ impl VectoredReadPlanner {
        match flag {
            BlobFlag::None => {
                let blobs_for_key = self.blobs.entry(key).or_default();
-                blobs_for_key.push((lsn, start_offset, end_offset, false));
+                blobs_for_key.push((lsn, start_offset, end_offset));
            }
            BlobFlag::ReplaceAll => {
                let blobs_for_key = self.blobs.entry(key).or_default();
                blobs_for_key.clear();
-                blobs_for_key.push((lsn, start_offset, end_offset, true));
+                blobs_for_key.push((lsn, start_offset, end_offset));
            }
            BlobFlag::Ignore => {}
        }
@@ -438,17 +436,11 @@ impl VectoredReadPlanner {
        let mut reads = Vec::new();

        for (key, blobs_for_key) in self.blobs {
-            for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
+            for (lsn, start_offset, end_offset) in blobs_for_key {
                let extended = match &mut current_read_builder {
-                    Some(read_builder) => read_builder.extend(
-                        start_offset,
-                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init,
-                        },
-                    ),
+                    Some(read_builder) => {
+                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
+                    }
                    None => VectoredReadExtended::No,
                };

@@ -456,11 +448,7 @@ impl VectoredReadPlanner {
                    let next_read_builder = VectoredReadBuilder::new(
                        start_offset,
                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init,
-                        },
+                        BlobMeta { key, lsn },
                        self.max_read_size,
                        self.mode,
                    );
@@ -605,10 +593,8 @@ impl<'a> VectoredBlobReader<'a> {
    }
 }

-/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
-///
-/// It provides a streaming API for getting read blobs. It returns a batch when
-/// `handle` gets called and when the current key would just exceed the read_size and
+/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
+/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
    read_builder: Option<VectoredReadBuilder>,
@@ -677,19 +663,10 @@ impl StreamingVectoredReadPlanner {
        start_offset: u64,
        end_offset: u64,
        is_last_blob_in_read: bool,
-        // destination: oneshot::Sender<Result<Bytes, std::io::Error>>,
    ) -> Option<VectoredRead> {
        match &mut self.read_builder {
            Some(read_builder) => {
-                let extended = read_builder.extend(
-                    start_offset,
-                    end_offset,
-                    BlobMeta {
-                        key,
-                        lsn,
-                        will_init: false,
-                    },
-                );
+                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
                assert_eq!(extended, VectoredReadExtended::Yes);
            }
            None => {
@@ -697,11 +674,7 @@ impl StreamingVectoredReadPlanner {
                    Some(VectoredReadBuilder::new_streaming(
                        start_offset,
                        end_offset,
-                        BlobMeta {
-                            key,
-                            lsn,
-                            will_init: false,
-                        },
+                        BlobMeta { key, lsn },
                        self.mode,
                    ))
                };
@@ -1033,7 +1006,6 @@ mod tests {
        let meta = BlobMeta {
            key: Key::MIN,
            lsn: Lsn(0),
-            will_init: false,
        };

        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,7 +1,6 @@
-//! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor.
 //!
-//! Instead, the file is opened when it's read from,
+//! VirtualFile is like a normal File, but it's not bound directly to
+//! a file descriptor. Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,12 +43,13 @@ use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;

-/// The real implementation that uses a Postgres process to
-/// perform WAL replay.
 ///
-/// Only one thread can use the process at a time, that is controlled by the
-/// Mutex. In the future, we might want to launch a pool of processes to allow
-/// concurrent replay of multiple records.
+/// This is the real implementation that uses a Postgres process to
+/// perform WAL replay. Only one thread can use the process at a time,
+/// that is controlled by the Mutex. In the future, we might want to
+/// launch a pool of processes to allow concurrent replay of multiple
+/// records.
+///
 pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1038,12 +1038,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if last_log_term on the node which gave
-			 * the highest vote (i.e. point where we are going to start writing)
-			 * actually had been won by me; plain restart of walproposer not
-			 * intervened by concurrent compute which wrote WAL is ok.
-			 *
-			 * This avoids compute crash after manual term_bump.
+			 * However, allow to proceed if previously elected leader was me;
+			 * plain restart of walproposer not intervened by concurrent
+			 * compute (who could generate WAL) is ok.
 			 */
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
@@ -1445,17 +1442,12 @@ RecvAppendResponses(Safekeeper *sk)
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
-			 *
-			 * Term has changed to higher one, probably another compute is
-			 * running. If this is the case we could PANIC as well because
-			 * likely it inserted some data and our basebackup is unsuitable
-			 * anymore. However, we also bump term manually (term_bump endpoint)
-			 * on safekeepers for migration purposes, in this case we do want
-			 * compute to stay alive. So restart walproposer with FATAL instead
-			 * of panicking; if basebackup is spoiled next election will notice
-			 * this.
+			 * Another compute with higher term is running. Panic to restart
+			 * PG as we likely need to retake basebackup. However, don't dump
+			 * core as this is kinda expected scenario.
 			 */
-			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
+			disable_core_dump();
+			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
 				   sk->appendResponse.term, wp->propTerm);
 		}
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -16,7 +16,7 @@ use tracing::debug;
 // On the other hand, `hashlink` has good download stats and appears to be maintained.
 use hashlink::{linked_hash_map::RawEntryMut, LruCache};

-use super::{common::Cached, timed_lru, Cache};
+use super::{common::Cached, *};

 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -44,14 +44,16 @@
    clippy::items_after_statements,
 )]
 // List of temporarily allowed lints.
+// TODO: Switch to except() once stable with 1.81.
 // TODO: fix code and reduce list or move to permanent list above.
-#![expect(
+#![allow(
    clippy::cargo_common_metadata,
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::doc_markdown,
+    clippy::implicit_hasher,
    clippy::inline_always,
    clippy::match_same_arms,
    clippy::match_wild_err_arm,
@@ -59,28 +61,21 @@
    clippy::missing_panics_doc,
    clippy::module_name_repetitions,
    clippy::needless_pass_by_value,
+    clippy::needless_raw_string_hashes,
    clippy::redundant_closure_for_method_calls,
+    clippy::return_self_not_must_use,
    clippy::similar_names,
    clippy::single_match_else,
    clippy::struct_excessive_bools,
    clippy::struct_field_names,
    clippy::too_many_lines,
-    clippy::unused_self
-)]
-#![cfg_attr(
-    any(test, feature = "testing"),
-    allow(
-        clippy::needless_raw_string_hashes,
-        clippy::unreadable_literal,
-        clippy::unused_async,
-    )
+    clippy::unreadable_literal,
+    clippy::unused_async,
+    clippy::unused_self,
+    clippy::wildcard_imports
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
-#![allow(
-    unknown_lints,
-    // TODO: 1.82: Add `use<T>` where necessary and remove from this list.
-    impl_trait_overcaptures,
-)]
+#![allow(unknown_lints, clippy::manual_inspect)]

 use std::{convert::Infallible, future::Future};

--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -217,7 +217,6 @@ impl sasl::Mechanism for Exchange<'_> {
                        self.state = ExchangeState::SaltSent(sent);
                        Ok(Step::Continue(self, msg))
                    }
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                    Step::Success(x, _) => match x {},
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
@@ -225,7 +224,6 @@ impl sasl::Mechanism for Exchange<'_> {
            ExchangeState::SaltSent(sent) => {
                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                    Step::Continue(x, _) => match x {},
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -745,20 +745,22 @@ impl BatchQueryData {
            builder = builder.deferrable(true);
        }

-        let transaction = builder.start().await.inspect_err(|_| {
+        let transaction = builder.start().await.map_err(|e| {
            // if we cannot start a transaction, we should return immediately
            // and not return to the pool. connection is clearly broken
            discard.discard();
+            e
        })?;

        let json_output =
            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                Ok(json_output) => {
                    info!("commit");
-                    let status = transaction.commit().await.inspect_err(|_| {
+                    let status = transaction.commit().await.map_err(|e| {
                        // if we cannot commit - for now don't return connection to pool
                        // TODO: get a query status from the error
                        discard.discard();
+                        e
                    })?;
                    discard.check_idle(status);
                    json_output
@@ -774,10 +776,11 @@ impl BatchQueryData {
                }
                Err(err) => {
                    info!("rollback");
-                    let status = transaction.rollback().await.inspect_err(|_| {
+                    let status = transaction.rollback().await.map_err(|e| {
                        // if we cannot rollback - for now don't return connection to pool
                        // TODO: get a query status from the error
                        discard.discard();
+                        e
                    })?;
                    discard.check_idle(status);
                    return Err(err);
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -14,7 +14,6 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;

 /// Stream wrapper which implements libpq's protocol.
-///
 /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -3,5 +3,5 @@ channel = "1.81.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
-# but we also need `llvm-tools` for coverage data merges on CI
-components = ["llvm-tools", "rustfmt", "clippy"]
+# but we also need `llvm-tools-preview` for coverage data merges on CI
+components = ["llvm-tools-preview", "rustfmt", "clippy"]
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,9 +1,6 @@
 use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-/// If tenant_id is provided, allow if token (claims) is for this tenant or
-/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
-/// SafekeeperData.
 pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
        (Scope::Tenant, None) => Err(AuthError(
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -19,7 +19,7 @@ use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use storage_broker::Uri;

 use tracing::*;
@@ -261,15 +261,6 @@ async fn main() -> anyhow::Result<()> {
    // Change into the data directory.
    std::env::set_current_dir(&workdir)?;

-    // Prevent running multiple safekeepers on the same directory
-    let lock_file_path = workdir.join(PID_FILE_NAME);
-    let lock_file =
-        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
-    info!("claimed pid file at {lock_file_path:?}");
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
-    std::mem::forget(lock_file);
-
    // Set or read our ID.
    let id = set_id(&workdir, args.id.map(NodeId))?;
    if args.init {
@@ -373,15 +364,15 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;

 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
-    // fsync the datadir to make sure we have a consistent state on disk.
-    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
-    let started = Instant::now();
-    utils::crashsafe::syncfs(dfd)?;
-    let elapsed = started.elapsed();
-    info!(
-        elapsed_ms = elapsed.as_millis(),
-        "syncfs data directory done"
-    );
+    // Prevent running multiple safekeepers on the same directory
+    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    let lock_file =
+        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("claimed pid file at {lock_file_path:?}");
+
+    // ensure that the lock file is held even if the main thread of the process is panics
+    // we need to release the lock file only when the current process is gone
+    std::mem::forget(lock_file);

    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
 use utils::http::request::parse_query_param;

 use postgres_ffi::WAL_SEGMENT_SIZE;
+use safekeeper_api::models::TimelineCreateRequest;
 use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
-use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
 use utils::{
    auth::SwappableJwtAuth,
    http::{
@@ -408,28 +408,6 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
    json_response(StatusCode::OK, response)
 }

-/// Make term at least as high as one in request. If one in request is None,
-/// increment current one.
-async fn timeline_term_bump_handler(
-    mut request: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let ttid = TenantTimelineId::new(
-        parse_request_param(&request, "tenant_id")?,
-        parse_request_param(&request, "timeline_id")?,
-    );
-    check_permission(&request, Some(ttid.tenant_id))?;
-
-    let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
-
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-    let response = tli
-        .term_bump(request_data.term)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
-}
-
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -652,10 +630,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
            |r| request_span(r, timeline_backup_partial_reset),
        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
-            |r| request_span(r, timeline_term_bump_handler),
-        )
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
        })
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -484,7 +484,6 @@ pub async fn validate_temp_timeline(
 }

 /// Move timeline from a temp directory to the main storage, and load it to the global map.
-///
 /// This operation is done under a lock to prevent bugs if several concurrent requests are
 /// trying to load the same timeline. Note that it doesn't guard against creating the
 /// timeline with the same ttid, but no one should be doing this anyway.
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -448,10 +448,8 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
 const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);

 /// Encapsulates a task which takes messages from msg_rx, processes and pushes
-/// replies to reply_tx.
-///
-/// Reading from socket and writing to disk in parallel is beneficial for
-/// performance, this struct provides the writing to disk part.
+/// replies to reply_tx; reading from socket and writing to disk in parallel is
+/// beneficial for performance, this struct provides writing to disk part.
 pub struct WalAcceptor {
    tli: WalResidentTimeline,
    msg_rx: Receiver<ProposerAcceptorMessage>,
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -938,9 +938,8 @@ where
        }

        trace!(
-            "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
+            "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
            msg.wal_data.len(),
-            msg.h.begin_lsn,
            msg.h.end_lsn,
            msg.h.commit_lsn,
            msg.h.truncate_lsn,
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -1,10 +1,9 @@
 //! Defines per timeline data stored persistently (SafeKeeperPersistentState)
 //! and its wrapper with in memory layer (SafekeeperState).

-use std::{cmp::max, ops::Deref};
+use std::ops::Deref;

 use anyhow::Result;
-use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -13,7 +12,7 @@ use utils::{

 use crate::{
    control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
    wal_backup_partial::{self},
 };

@@ -148,11 +147,9 @@ pub struct TimelineMemState {
    pub proposer_uuid: PgUuid,
 }

-/// Safekeeper persistent state plus in memory layer.
-///
-/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
-/// which don't need immediate persistence. Provides transactional like API
-/// to atomically update the state.
+/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
+/// when we update fields like commit_lsn which don't need immediate
+/// persistence. Provides transactional like API to atomically update the state.
 ///
 /// Implements Deref into *persistent* part.
 pub struct TimelineState<CTRL: control_file::Storage> {
@@ -212,27 +209,6 @@ where
        let s = self.start_change();
        self.finish_change(&s).await
    }
-
-    /// Make term at least as `to`. If `to` is None, increment current one. This
-    /// is not in safekeeper.rs because we want to be able to do it even if
-    /// timeline is offloaded.
-    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        let before = self.acceptor_state.term;
-        let mut state = self.start_change();
-        let new = match to {
-            Some(to) => max(state.acceptor_state.term, to),
-            None => state.acceptor_state.term + 1,
-        };
-        if new > state.acceptor_state.term {
-            state.acceptor_state.term = new;
-            self.finish_change(&state).await?;
-        }
-        let after = self.acceptor_state.term;
-        Ok(TimelineTermBumpResponse {
-            previous_term: before,
-            current_term: after,
-        })
-    }
 }

 impl<CTRL> Deref for TimelineState<CTRL>
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,7 +4,6 @@
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
-use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -170,7 +169,6 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
 }

 /// This structure is stored in shared state and represents the state of the timeline.
-///
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
@@ -216,10 +214,6 @@ impl StateSK {
            .get_last_log_term(self.flush_lsn())
    }

-    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        self.state_mut().term_bump(to).await
-    }
-
    /// Close open WAL files to release FDs.
    fn close_wal_store(&mut self) {
        if let StateSK::Loaded(sk) = self {
@@ -859,11 +853,6 @@ impl Timeline {
        Ok(res)
    }

-    pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
-        let mut state = self.write_shared_state().await;
-        state.sk.term_bump(to).await
-    }
-
    /// Get the timeline guard for reading/writing WAL files.
    /// If WAL files are not present on disk (evicted), they will be automatically
    /// downloaded from remote storage. This is done in the manager task, which is
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -1,8 +1,6 @@
-//! Code related to evicting WAL files to remote storage.
-//!
-//! The actual upload is done by the partial WAL backup code. This file has
-//! code to delete and re-download WAL files, cross-validate with partial WAL
-//! backup if local file is still present.
+//! Code related to evicting WAL files to remote storage. The actual upload is done by the
+//! partial WAL backup code. This file has code to delete and re-download WAL files,
+//! cross-validate with partial WAL backup if local file is still present.

 use anyhow::Context;
 use camino::Utf8PathBuf;
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -1,6 +1,4 @@
-//! Timeline residence guard
-//!
-//! It is needed to ensure that WAL segments are present on disk,
+//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
 //! as long as the code is holding the guard. This file implements guard logic, to issue
 //! and drop guards, and to notify the manager when the guard is dropped.

--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -1,5 +1,4 @@
 //! The timeline manager task is responsible for managing the timeline's background tasks.
-//!
 //! It is spawned alongside each timeline and exits when the timeline is deleted.
 //! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
 //! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
--- a/safekeeper/src/timelines_set.rs
+++ b/safekeeper/src/timelines_set.rs
@@ -60,8 +60,7 @@ impl TimelinesSet {
    }
 }

-/// Guard is used to add or remove timelines from the set.
-///
+/// Guard is used to add or remove timeline from the set.
 /// If the timeline present in set, it will be removed from it on drop.
 /// Note: do not use more than one guard for the same timeline, it caches the presence state.
 /// It is designed to be used in the manager task only.
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -1,8 +1,6 @@
 //! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
-//! and `flush_lsn` updates.
-//!
-//! After the partial segment was updated (`flush_lsn` was changed), the segment
-//! will be uploaded to S3 within the configured `partial_backup_timeout`.
+//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
+//! was changed), the segment will be uploaded to S3 in about 15 minutes.
 //!
 //! The filename format for partial segments is
 //! `Segment_Term_Flush_Commit_skNN.partial`, where:
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -17,7 +17,6 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
-///
 /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -98,19 +98,7 @@ pub struct PhysicalStorage {
    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
    write_lsn: Lsn,

-    /// The LSN of the last WAL record written to disk. Still can be not fully
-    /// flushed.
-    ///
-    /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
-    /// switch ingest the reverse is true because we don't bump write_lsn up to
-    /// the next segment: WAL stream from the compute doesn't have the gap and
-    /// for simplicity / as a sanity check we disallow any non-sequential
-    /// writes, so write zeros as is.
-    ///
-    /// Similar effect is in theory possible due to LSN alignment: if record
-    /// ends at *2, decoder will report end lsn as *8 even though we haven't
-    /// written these zeros yet. In practice compute likely never sends
-    /// non-aligned chunks of data.
+    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
    write_record_lsn: Lsn,

    /// The LSN of the last WAL record flushed to disk.
@@ -179,7 +167,8 @@ impl PhysicalStorage {
            )
        };

-        // note: this assumes we fsync'ed whole datadir on start.
+        // TODO: do we really know that write_lsn is fully flushed to disk?
+        //      If not, maybe it's better to call fsync() here to be sure?
        let flush_lsn = write_lsn;

        debug!(
@@ -451,12 +440,11 @@ impl Storage for PhysicalStorage {
            .with_label_values(&["truncate_wal"])
            .start_timer();

-        // Streaming must not create a hole, so truncate cannot be called on
-        // non-written lsn.
-        if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
+        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
+        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            bail!(
-                "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
-                self.write_record_lsn,
+                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
+                self.write_lsn,
                end_pos
            );
        }
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -134,7 +134,7 @@ class LLVM:
            # Show a user-friendly warning
            raise Exception(' '.join([
                f"It appears that you don't have `{name}` installed.",
-                "Please execute `rustup component add llvm-tools`,",
+                "Please execute `rustup component add llvm-tools-preview`,",
                "or install it via your package manager of choice.",
                "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
            ]))
@@ -518,7 +518,7 @@ def main() -> None:
    example = f"""
 prerequisites:
    # alternatively, install a system package for `llvm-tools`
-    rustup component add llvm-tools
+    rustup component add llvm-tools-preview

 self-contained example:
    {app} run make
--- a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
@@ -1 +0,0 @@
-ALTER TABLE tenant_shards DROP preferred_az_id;
--- a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
@@ -1 +0,0 @@
-ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -14,14 +14,14 @@ use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::controller_api::{
    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    ShardsPreferredAzsRequest, TenantCreateRequest,
+    TenantCreateRequest,
 };
 use pageserver_api::models::{
    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
-use pageserver_client::{mgmt_api, BlockUnblock};
+use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -369,23 +369,6 @@ async fn handle_tenant_timeline_detach_ancestor(
    json_response(StatusCode::OK, res)
 }

-async fn handle_tenant_timeline_block_unblock_gc(
-    service: Arc<Service>,
-    req: Request<Body>,
-    dir: BlockUnblock,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    check_permissions(&req, Scope::PageServerApi)?;
-
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
-    service
-        .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
-        .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handle_tenant_timeline_passthrough(
    service: Arc<Service>,
    req: Request<Body>,
@@ -556,17 +539,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, node_status)
 }

-async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    let node_status = state.service.get_node_shards(node_id).await?;
-
-    json_response(StatusCode::OK, node_status)
-}
-
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -716,18 +688,6 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
    )
 }

-async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state.service.update_shards_preferred_azs(azs_req).await?,
-    )
-}
-
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -1137,13 +1097,6 @@ pub fn make_router(
        .get("/control/v1/node/:node_id", |r| {
            named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
        })
-        .get("/control/v1/node/:node_id/shards", |r| {
-            named_request_span(
-                r,
-                handle_node_shards,
-                RequestName("control_v1_node_describe"),
-            )
-        })
        .get("/control/v1/leader", |r| {
            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
        })
@@ -1221,13 +1174,6 @@ pub fn make_router(
                RequestName("control_v1_tenant_policy"),
            )
        })
-        .put("/control/v1/preferred_azs", |r| {
-            named_request_span(
-                r,
-                handle_update_preferred_azs,
-                RequestName("control_v1_preferred_azs"),
-            )
-        })
        .put("/control/v1/step_down", |r| {
            named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
        })
@@ -1309,26 +1255,6 @@ pub fn make_router(
                )
            },
        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
-                    RequestName("v1_tenant_timeline_block_unblock_gc"),
-                )
-            },
-        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
-                    RequestName("v1_tenant_timeline_block_unblock_gc"),
-                )
-            },
-        )
        // Tenant detail GET passthrough to shard zero:
        .get("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -7,10 +7,7 @@ use pageserver_api::{
    },
    shard::TenantShardId,
 };
-use pageserver_client::{
-    mgmt_api::{Client, Result},
-    BlockUnblock,
-};
+use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -261,24 +258,6 @@ impl PageserverClient {
        )
    }

-    pub(crate) async fn timeline_block_unblock_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<()> {
-        // measuring these makes no sense because we synchronize with the gc loop and remote
-        // storage on block_gc so there should be huge outliers
-        measured_request!(
-            "timeline_block_unblock_gc",
-            crate::metrics::Method::Post,
-            &self.node_id_label,
-            self.inner
-                .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
-                .await
-        )
-    }
-
    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
        measured_request!(
            "utilization",
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealthOutdated,
    GetLeader,
    UpdateLeader,
-    SetPreferredAzs,
 }

 #[must_use]
@@ -665,33 +664,6 @@ impl Persistence {
        Ok(())
    }

-    pub(crate) async fn set_tenant_shard_preferred_azs(
-        &self,
-        preferred_azs: Vec<(TenantShardId, String)>,
-    ) -> DatabaseResult<Vec<(TenantShardId, String)>> {
-        use crate::schema::tenant_shards::dsl::*;
-
-        self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
-            let mut shards_updated = Vec::default();
-
-            for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .set(preferred_az_id.eq(preferred_az))
-                    .execute(conn)?;
-
-                if updated == 1 {
-                    shards_updated.push((*tenant_shard_id, preferred_az.clone()));
-                }
-            }
-
-            Ok(shards_updated)
-        })
-        .await
-    }
-
    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
@@ -1078,11 +1050,6 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) config: String,
    #[serde(default)]
    pub(crate) scheduling_policy: String,
-
-    // Hint that we should attempt to schedule this tenant shard the given
-    // availability zone in order to minimise the chances of cross-AZ communication
-    // with compute.
-    pub(crate) preferred_az_id: Option<String>,
 }

 impl TenantShardPersistence {
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -41,7 +41,6 @@ diesel::table! {
        splitting -> Int2,
        config -> Text,
        scheduling_policy -> Varchar,
-        preferred_az_id -> Nullable<Varchar>,
    }
 }

--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -25,7 +25,7 @@ use crate::{
        ShardGenerationState, TenantFilter,
    },
    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
-    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
+    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
        MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
        ScheduleOptimizationAction,
@@ -41,8 +41,7 @@ use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
+        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
        TenantShardMigrateRequest, TenantShardMigrateResponse,
@@ -69,7 +68,7 @@ use pageserver_api::{
        ValidateResponse, ValidateResponseTenant,
    },
 };
-use pageserver_client::{mgmt_api, BlockUnblock};
+use pageserver_client::mgmt_api;
 use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use utils::{
@@ -117,9 +116,7 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);

 /// How long a node may be unresponsive to heartbeats during start up before we declare it
-/// offline.
-///
-/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
+/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
 /// handling of the re-attach response may take a long time and blocks heartbeats from
 /// being handled on the pageserver side.
 pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
@@ -142,7 +139,6 @@ enum TenantOperations {
    AttachHook,
    TimelineArchivalConfig,
    TimelineDetachAncestor,
-    TimelineGcBlockUnblock,
 }

 #[derive(Clone, strum_macros::Display)]
@@ -357,12 +353,6 @@ impl From<DatabaseError> for ApiError {
    }
 }

-enum InitialShardScheduleOutcome {
-    Scheduled(TenantCreateResponseShard),
-    NotScheduled,
-    ShardScheduleError(ScheduleError),
-}
-
 pub struct Service {
    inner: Arc<std::sync::RwLock<ServiceState>>,
    config: Config,
@@ -452,7 +442,7 @@ struct ShardSplitParams {
 // When preparing for a shard split, we may either choose to proceed with the split,
 // or find that the work is already done and return NoOp.
 enum ShardSplitAction {
-    Split(Box<ShardSplitParams>),
+    Split(ShardSplitParams),
    NoOp(TenantShardSplitResponse),
 }

@@ -1462,7 +1452,6 @@ impl Service {
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
-                preferred_az_id: None,
            };

            match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -2034,7 +2023,6 @@ impl Service {
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
-                preferred_az_id: None,
            })
            .collect();

@@ -2058,87 +2046,99 @@ impl Service {
        };

        let mut schedule_context = ScheduleContext::default();
-        let mut schedule_error = None;
-        let mut response_shards = Vec::new();
-        for tenant_shard_id in create_ids {
-            tracing::info!("Creating shard {tenant_shard_id}...");

-            let outcome = self
-                .do_initial_shard_scheduling(
-                    tenant_shard_id,
-                    initial_generation,
-                    &create_req.shard_parameters,
-                    create_req.config.clone(),
-                    placement_policy.clone(),
-                    &mut schedule_context,
-                )
-                .await;
-
-            match outcome {
-                InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp),
-                InitialShardScheduleOutcome::NotScheduled => {}
-                InitialShardScheduleOutcome::ShardScheduleError(err) => {
-                    schedule_error = Some(err);
-                }
-            }
-        }
-
-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            response_shards
-                .iter()
-                .filter_map(|resp| {
-                    let az_id = locked
-                        .nodes
-                        .get(&resp.node_id)
-                        .map(|n| n.get_availability_zone_id().to_string())?;
-
-                    Some((resp.shard_id, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        // Note that we persist the preferred AZ for the new shards separately.
-        // In theory, we could "peek" the scheduler to determine where the shard will
-        // land, but the subsequent "real" call into the scheduler might select a different
-        // node. Hence, we do this awkward update to keep things consistent.
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            })?;
-
-        {
+        let (waiters, response_shards) = {
            let mut locked = self.inner.write().unwrap();
-            for (tid, az_id) in updated {
-                if let Some(shard) = locked.tenants.get_mut(&tid) {
-                    shard.set_preferred_az(az_id);
-                }
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+
+            let mut response_shards = Vec::new();
+            let mut schcedule_error = None;
+
+            for tenant_shard_id in create_ids {
+                tracing::info!("Creating shard {tenant_shard_id}...");
+
+                use std::collections::btree_map::Entry;
+                match tenants.entry(tenant_shard_id) {
+                    Entry::Occupied(mut entry) => {
+                        tracing::info!(
+                            "Tenant shard {tenant_shard_id} already exists while creating"
+                        );
+
+                        // TODO: schedule() should take an anti-affinity expression that pushes
+                        // attached and secondary locations (independently) away frorm those
+                        // pageservers also holding a shard for this tenant.
+
+                        entry
+                            .get_mut()
+                            .schedule(scheduler, &mut schedule_context)
+                            .map_err(|e| {
+                                ApiError::Conflict(format!(
+                                    "Failed to schedule shard {tenant_shard_id}: {e}"
+                                ))
+                            })?;
+
+                        if let Some(node_id) = entry.get().intent.get_attached() {
+                            let generation = entry
+                                .get()
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
+
+                        continue;
+                    }
+                    Entry::Vacant(entry) => {
+                        let state = entry.insert(TenantShard::new(
+                            tenant_shard_id,
+                            ShardIdentity::from_params(
+                                tenant_shard_id.shard_number,
+                                &create_req.shard_parameters,
+                            ),
+                            placement_policy.clone(),
+                        ));
+
+                        state.generation = initial_generation;
+                        state.config = create_req.config.clone();
+                        if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
+                            schcedule_error = Some(e);
+                        }
+
+                        // Only include shards in result if we are attaching: the purpose
+                        // of the response is to tell the caller where the shards are attached.
+                        if let Some(node_id) = state.intent.get_attached() {
+                            let generation = state
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
+                    }
+                };
            }
-        }

-        // If we failed to schedule shards, then they are still created in the controller,
-        // but we return an error to the requester to avoid a silent failure when someone
-        // tries to e.g. create a tenant whose placement policy requires more nodes than
-        // are present in the system.  We do this here rather than in the above loop, to
-        // avoid situations where we only create a subset of shards in the tenant.
-        if let Some(e) = schedule_error {
-            return Err(ApiError::Conflict(format!(
-                "Failed to schedule shard(s): {e}"
-            )));
-        }
+            // If we failed to schedule shards, then they are still created in the controller,
+            // but we return an error to the requester to avoid a silent failure when someone
+            // tries to e.g. create a tenant whose placement policy requires more nodes than
+            // are present in the system.  We do this here rather than in the above loop, to
+            // avoid situations where we only create a subset of shards in the tenant.
+            if let Some(e) = schcedule_error {
+                return Err(ApiError::Conflict(format!(
+                    "Failed to schedule shard(s): {e}"
+                )));
+            }

-        let waiters = {
-            let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, _scheduler) = locked.parts_mut();
-            tenants
+            let waiters = tenants
                .range_mut(TenantShardId::tenant_range(tenant_id))
                .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
-                .collect::<Vec<_>>()
+                .collect::<Vec<_>>();
+            (waiters, response_shards)
        };

        Ok((
@@ -2149,78 +2149,6 @@ impl Service {
        ))
    }

-    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
-    /// case of a new tenant and a pre-existing one.
-    async fn do_initial_shard_scheduling(
-        &self,
-        tenant_shard_id: TenantShardId,
-        initial_generation: Option<Generation>,
-        shard_params: &ShardParameters,
-        config: TenantConfig,
-        placement_policy: PlacementPolicy,
-        schedule_context: &mut ScheduleContext,
-    ) -> InitialShardScheduleOutcome {
-        let mut locked = self.inner.write().unwrap();
-        let (_nodes, tenants, scheduler) = locked.parts_mut();
-
-        use std::collections::btree_map::Entry;
-        match tenants.entry(tenant_shard_id) {
-            Entry::Occupied(mut entry) => {
-                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
-
-                // TODO: schedule() should take an anti-affinity expression that pushes
-                // attached and secondary locations (independently) away frorm those
-                // pageservers also holding a shard for this tenant.
-
-                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
-                    return InitialShardScheduleOutcome::ShardScheduleError(err);
-                }
-
-                if let Some(node_id) = entry.get().intent.get_attached() {
-                    let generation = entry
-                        .get()
-                        .generation
-                        .expect("Generation is set when in attached mode");
-                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
-                        shard_id: tenant_shard_id,
-                        node_id: *node_id,
-                        generation: generation.into().unwrap(),
-                    })
-                } else {
-                    InitialShardScheduleOutcome::NotScheduled
-                }
-            }
-            Entry::Vacant(entry) => {
-                let state = entry.insert(TenantShard::new(
-                    tenant_shard_id,
-                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
-                    placement_policy,
-                ));
-
-                state.generation = initial_generation;
-                state.config = config;
-                if let Err(e) = state.schedule(scheduler, schedule_context) {
-                    return InitialShardScheduleOutcome::ShardScheduleError(e);
-                }
-
-                // Only include shards in result if we are attaching: the purpose
-                // of the response is to tell the caller where the shards are attached.
-                if let Some(node_id) = state.intent.get_attached() {
-                    let generation = state
-                        .generation
-                        .expect("Generation is set when in attached mode");
-                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
-                        shard_id: tenant_shard_id,
-                        node_id: *node_id,
-                        generation: generation.into().unwrap(),
-                    })
-                } else {
-                    InitialShardScheduleOutcome::NotScheduled
-                }
-            }
-        }
-    }
-
    /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
    /// wait for reconciliation to complete before responding.
    async fn await_waiters(
@@ -3198,57 +3126,6 @@ impl Service {
        }).await?
    }

-    pub(crate) async fn tenant_timeline_block_unblock_gc(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<(), ApiError> {
-        let _tenant_lock = trace_shared_lock(
-            &self.tenant_op_locks,
-            tenant_id,
-            TenantOperations::TimelineGcBlockUnblock,
-        )
-        .await;
-
-        self.tenant_remote_mutation(tenant_id, move |targets| async move {
-            if targets.is_empty() {
-                return Err(ApiError::NotFound(
-                    anyhow::anyhow!("Tenant not found").into(),
-                ));
-            }
-
-            async fn do_one(
-                tenant_shard_id: TenantShardId,
-                timeline_id: TimelineId,
-                node: Node,
-                jwt: Option<String>,
-                dir: BlockUnblock,
-            ) -> Result<(), ApiError> {
-                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-
-                client
-                    .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
-                    .await
-                    .map_err(|e| passthrough_api_error(&node, e))
-            }
-
-            // no shard needs to go first/last; the operation should be idempotent
-            self.tenant_for_shards(targets, |tenant_shard_id, node| {
-                futures::FutureExt::boxed(do_one(
-                    tenant_shard_id,
-                    timeline_id,
-                    node,
-                    self.config.jwt_token.clone(),
-                    dir,
-                ))
-            })
-            .await
-        })
-        .await??;
-        Ok(())
-    }
-
    /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
    ///
    /// On success, the returned vector contains exactly the same number of elements as the input `locations`.
@@ -3634,7 +3511,6 @@ impl Service {
                is_pending_compute_notification: shard.pending_compute_notification,
                is_splitting: matches!(shard.splitting, SplitState::Splitting),
                scheduling_policy: *shard.get_scheduling_policy(),
-                preferred_az_id: shard.preferred_az().map(ToString::to_string),
            })
        }

@@ -4238,7 +4114,7 @@ impl Service {
        let policy = policy.unwrap();
        let config = config.unwrap();

-        Ok(ShardSplitAction::Split(Box::new(ShardSplitParams {
+        Ok(ShardSplitAction::Split(ShardSplitParams {
            old_shard_count,
            new_shard_count: ShardCount::new(split_req.new_shard_count),
            new_stripe_size: split_req.new_stripe_size,
@@ -4246,13 +4122,13 @@ impl Service {
            policy,
            config,
            shard_ident,
-        })))
+        }))
    }

    async fn do_tenant_shard_split(
        &self,
        tenant_id: TenantId,
-        params: Box<ShardSplitParams>,
+        params: ShardSplitParams,
    ) -> Result<(TenantShardSplitResponse, Vec<ReconcilerWaiter>), ApiError> {
        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
@@ -4268,7 +4144,7 @@ impl Service {
            policy,
            config,
            shard_ident,
-        } = *params;
+        } = params;

        // Drop any secondary locations: pageservers do not support splitting these, and in any case the
        // end-state for a split tenant will usually be to have secondary locations on different nodes.
@@ -4338,10 +4214,9 @@ impl Service {
                    config: serde_json::to_string(&config).unwrap(),
                    splitting: SplitState::Splitting,

-                    // Scheduling policies and preferred AZ do not carry through to children
+                    // Scheduling policies do not carry through to children
                    scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                        .unwrap(),
-                    preferred_az_id: None,
                });
            }

@@ -4461,47 +4336,6 @@ impl Service {
        let (response, child_locations, waiters) =
            self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);

-        // Now that we have scheduled the child shards, attempt to set their preferred AZ
-        // to that of the pageserver they've been attached on.
-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            child_locations
-                .iter()
-                .filter_map(|(tid, node_id, _stripe_size)| {
-                    let az_id = locked
-                        .nodes
-                        .get(node_id)
-                        .map(|n| n.get_availability_zone_id().to_string())?;
-
-                    Some((*tid, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            });
-
-        match updated {
-            Ok(updated) => {
-                let mut locked = self.inner.write().unwrap();
-                for (tid, az_id) in updated {
-                    if let Some(shard) = locked.tenants.get_mut(&tid) {
-                        shard.set_preferred_az(az_id);
-                    }
-                }
-            }
-            Err(err) => {
-                tracing::warn!("Failed to persist preferred AZs after split: {err}");
-            }
-        }
-
        // Send compute notifications for all the new shards
        let mut failed_notifications = Vec::new();
        for (child_id, child_ps, stripe_size) in child_locations {
@@ -4976,45 +4810,6 @@ impl Service {
            ))
    }

-    pub(crate) async fn get_node_shards(
-        &self,
-        node_id: NodeId,
-    ) -> Result<NodeShardResponse, ApiError> {
-        let locked = self.inner.read().unwrap();
-        let mut shards = Vec::new();
-        for (tid, tenant) in locked.tenants.iter() {
-            let is_intended_secondary = match (
-                tenant.intent.get_attached() == &Some(node_id),
-                tenant.intent.get_secondary().contains(&node_id),
-            ) {
-                (true, true) => {
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                        "{} attached as primary+secondary on the same node",
-                        tid
-                    )))
-                }
-                (true, false) => Some(false),
-                (false, true) => Some(true),
-                (false, false) => None,
-            };
-            let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) =
-                tenant.observed.locations.get(&node_id)
-            {
-                Some(conf.secondary_conf.is_some())
-            } else {
-                None
-            };
-            if is_intended_secondary.is_some() || is_observed_secondary.is_some() {
-                shards.push(NodeShard {
-                    tenant_shard_id: *tid,
-                    is_intended_secondary,
-                    is_observed_secondary,
-                });
-            }
-        }
-        Ok(NodeShardResponse { node_id, shards })
-    }
-
    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
        self.persistence.get_leader().await
    }
@@ -6702,35 +6497,4 @@ impl Service {
    ) -> Result<(), DatabaseError> {
        self.persistence.safekeeper_upsert(record).await
    }
-
-    pub(crate) async fn update_shards_preferred_azs(
-        &self,
-        req: ShardsPreferredAzsRequest,
-    ) -> Result<ShardsPreferredAzsResponse, ApiError> {
-        let preferred_azs = req.preferred_az_ids.into_iter().collect::<Vec<_>>();
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred AZs: {err}"
-                ))
-            })?;
-
-        let mut updated_in_mem_and_db = Vec::default();
-
-        let mut locked = self.inner.write().unwrap();
-        for (tid, az_id) in updated {
-            let shard = locked.tenants.get_mut(&tid);
-            if let Some(shard) = shard {
-                shard.set_preferred_az(az_id);
-                updated_in_mem_and_db.push(tid);
-            }
-        }
-
-        Ok(ShardsPreferredAzsResponse {
-            updated: updated_in_mem_and_db,
-        })
-    }
 }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -140,10 +140,6 @@ pub(crate) struct TenantShard {
    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
    // be set to a non-active state to avoid making changes while the issue is fixed.
    scheduling_policy: ShardSchedulingPolicy,
-
-    // We should attempt to schedule this shard in the provided AZ to
-    // decrease chances of cross-AZ compute.
-    preferred_az_id: Option<String>,
 }

 #[derive(Default, Clone, Debug, Serialize)]
@@ -467,7 +463,6 @@ impl TenantShard {
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
-            preferred_az_id: None,
        }
    }

@@ -1302,7 +1297,6 @@ impl TenantShard {
            pending_compute_notification: false,
            delayed_reconcile: false,
            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
-            preferred_az_id: tsp.preferred_az_id,
        })
    }

@@ -1318,17 +1312,8 @@ impl TenantShard {
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
-            preferred_az_id: self.preferred_az_id.clone(),
        }
    }
-
-    pub(crate) fn preferred_az(&self) -> Option<&str> {
-        self.preferred_az_id.as_deref()
-    }
-
-    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
-        self.preferred_az_id = Some(preferred_az_id);
-    }
 }

 #[cfg(test)]
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -1,7 +1,6 @@
-//! Functionality for finding and purging garbage, as in "garbage collection".
-//!
-//! Garbage means S3 objects which are either not referenced by any metadata,
-//! or are referenced by a control plane tenant/timeline in a deleted state.
+//! Functionality for finding and purging garbage, as in "garbage collection".  Garbage means
+//! S3 objects which are either not referenced by any metadata, or are referenced by a
+//! control plane tenant/timeline in a deleted state.

 use std::{
    collections::{HashMap, HashSet},
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -74,9 +74,7 @@ pub async fn stream_tenant_shards<'a>(
 }

 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
-/// using a listing.
-///
-/// The listing is done before the stream is built, so that this
+/// using a listing. The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
 pub async fn stream_tenant_timelines<'a>(
    remote_client: &'a GenericRemoteStorage,
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -440,10 +440,9 @@ async fn gc_ancestor(
    Ok(())
 }

-/// Physical garbage collection: removing unused S3 objects.
-///
-/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
-/// (keys, layers).  This type of garbage collection is about removing:
+/// Physical garbage collection: removing unused S3 objects.  This is distinct from the garbage collection
+/// done inside the pageserver, which operates at a higher level (keys, layers).  This type of garbage collection
+/// is about removing:
 /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
 ///   uploading a layer and uploading an index)
 /// - Index objects from historic generations
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -140,14 +140,6 @@ class TenantId(Id):
        return self.id.hex()


-class NodeId(Id):
-    def __repr__(self) -> str:
-        return f'`NodeId("{self.id.hex()}")'
-
-    def __str__(self) -> str:
-        return self.id.hex()
-
-
 class TimelineId(Id):
    def __repr__(self) -> str:
        return f'TimelineId("{self.id.hex()}")'
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -62,7 +62,7 @@ from urllib3.util.retry import Retry

 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
-from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
@@ -2560,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

    def tenant_describe(self, tenant_id: TenantId):
        """
-        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str}
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
        """
        response = self.request(
            "GET",
@@ -2570,30 +2570,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
        response.raise_for_status()
        return response.json()

-    def nodes(self):
-        """
-        :return: list of {"id": ""}
-        """
-        response = self.request(
-            "GET",
-            f"{self.api}/control/v1/node",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        response.raise_for_status()
-        return response.json()
-
-    def node_shards(self, node_id: NodeId):
-        """
-        :return: list of {"shard_id": "", "is_secondary": bool}
-        """
-        response = self.request(
-            "GET",
-            f"{self.api}/control/v1/node/{node_id}/shards",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        response.raise_for_status()
-        return response.json()
-
    def tenant_shard_split(
        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
    ) -> list[TenantShardId]:
@@ -2910,17 +2886,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
                return None
            raise e

-    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
-        response = self.request(
-            "PUT",
-            f"{self.api}/control/v1/preferred_azs",
-            headers=self.headers(TokenScope.ADMIN),
-            json={str(tid): az for tid, az in preferred_azs.items()},
-        )
-
-        response.raise_for_status()
-        return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
-
    def __enter__(self) -> "NeonStorageController":
        return self

--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -50,19 +50,6 @@ class SafekeeperMetrics(Metrics):
        ).value


-@dataclass
-class TermBumpResponse:
-    previous_term: int
-    current_term: int
-
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse":
-        return TermBumpResponse(
-            previous_term=d["previous_term"],
-            current_term=d["current_term"],
-        )
-
-
 class SafekeeperHttpClient(requests.Session, MetricsGetter):
    HTTPError = requests.HTTPError

@@ -265,22 +252,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        res.raise_for_status()
        return res.json()

-    def term_bump(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        term: Optional[int],
-    ) -> TermBumpResponse:
-        body = {}
-        if term is not None:
-            body["term"] = term
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump",
-            json=body,
-        )
-        res.raise_for_status()
-        return TermBumpResponse.from_json(res.json())
-
    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
        res = self.post(
            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1552,12 +1552,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
    literal_shard_count = 1 if shard_count is None else shard_count
    assert len(describe["shards"]) == literal_shard_count

-    nodes = env.storage_controller.nodes()
-    assert len(nodes) == 2
-    describe1 = env.storage_controller.node_shards(nodes[0]["id"])
-    describe2 = env.storage_controller.node_shards(nodes[1]["id"])
-    assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count
-
    # Check the data is still there: this implicitly proves that we recovered generation numbers
    # properly, for the timeline which was written to after a generation bump.
    for timeline, branch, expect_rows in [
@@ -2518,55 +2512,3 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
                del d[key]

    return compared[0] == compared[1]
-
-
-@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
-def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
-    def assign_az(ps_cfg):
-        az = f"az-{ps_cfg['id']}"
-        ps_cfg["availability_zone"] = az
-
-    neon_env_builder.pageserver_config_override = assign_az
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    tids = [TenantId.generate() for _ in range(0, 3)]
-    for tid in tids:
-        env.storage_controller.tenant_create(tid)
-
-        shards = env.storage_controller.tenant_describe(tid)["shards"]
-        assert len(shards) == 1
-        attached_to = shards[0]["node_attached"]
-        expected_az = env.get_pageserver(attached_to).az_id
-
-        assert shards[0]["preferred_az_id"] == expected_az
-
-    updated = env.storage_controller.set_preferred_azs(
-        {TenantShardId(tid, 0, 0): "foo" for tid in tids}
-    )
-
-    assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
-
-    for tid in tids:
-        shards = env.storage_controller.tenant_describe(tid)["shards"]
-        assert len(shards) == 1
-        assert shards[0]["preferred_az_id"] == "foo"
-
-    # Generate a layer to avoid shard split handling on ps from tripping
-    # up on debug assert.
-    timeline_id = TimelineId.generate()
-    env.neon_cli.create_timeline("bar", tids[0], timeline_id)
-
-    workload = Workload(env, tids[0], timeline_id, branch_name="bar")
-    workload.init()
-    workload.write_rows(256)
-    workload.validate()
-
-    env.storage_controller.tenant_shard_split(tids[0], shard_count=2)
-    shards = env.storage_controller.tenant_describe(tids[0])["shards"]
-    assert len(shards) == 2
-    for shard in shards:
-        attached_to = shard["node_attached"]
-        expected_az = env.get_pageserver(attached_to).az_id
-        assert shard["preferred_az_id"] == expected_az
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -1,32 +1,17 @@
 import time
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from typing import List, Optional

-import pytest
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    LogCursor,
    NeonEnvBuilder,
-    NeonPageserver,
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404


-@pytest.mark.parametrize("sharded", [True, False])
-def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
-    neon_env_builder.num_pageservers = 2 if sharded else 1
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start(
-        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"},
-        initial_tenant_shard_count=2 if sharded else None,
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
    )
-
-    if sharded:
-        http = env.storage_controller.pageserver_api()
-    else:
-        http = env.pageserver.http_client()
-
-    pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers)))
+    ps = env.pageserver
+    http = ps.http_client()

    foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)

@@ -37,8 +22,9 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
    tenant_before = http.tenant_status(env.initial_tenant)

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_active_line)
-    pss.assert_log_does_not_contain(gc_skipped_line)
+    _, offset = ps.assert_log_contains(gc_active_line)
+
+    assert ps.log_contains(gc_skipped_line, offset) is None

    http.timeline_block_gc(env.initial_tenant, foo_branch)

@@ -48,78 +34,34 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_skipped_line)
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)

-    pss.restart()
-    pss.quiesce_tenants()
+    ps.restart()
+    ps.quiesce_tenants()

-    pss.assert_log_contains(init_gc_skipped)
+    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_skipped_line)
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)

    # deletion unblocks gc
    http.timeline_delete(env.initial_tenant, foo_branch)
    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_active_line)
+    _, offset = ps.assert_log_contains(gc_active_line, offset)

    http.timeline_block_gc(env.initial_tenant, env.initial_timeline)

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_skipped_line)
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)

    # removing the manual block also unblocks gc
    http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)

    wait_for_another_gc_round()
-    pss.assert_log_contains(gc_active_line)
+    _, offset = ps.assert_log_contains(gc_active_line, offset)


 def wait_for_another_gc_round():
    time.sleep(2)
-
-
-@dataclass
-class ScrollableLog:
-    pageserver: NeonPageserver
-    offset: Optional[LogCursor]
-
-    def assert_log_contains(self, what: str):
-        msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
-        old = self.offset
-        self.offset = offset
-        log.info(f"{old} -> {offset}: {msg}")
-
-    def assert_log_does_not_contain(self, what: str):
-        assert self.pageserver.log_contains(what) is None
-
-
-@dataclass(frozen=True)
-class ManyPageservers:
-    many: List[ScrollableLog]
-
-    def assert_log_contains(self, what: str):
-        for one in self.many:
-            one.assert_log_contains(what)
-
-    def assert_log_does_not_contain(self, what: str):
-        for one in self.many:
-            one.assert_log_does_not_contain(what)
-
-    def restart(self):
-        def do_restart(x: ScrollableLog):
-            x.pageserver.restart()
-
-        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
-            rt.map(do_restart, self.many)
-            rt.shutdown(wait=True)
-
-    def quiesce_tenants(self):
-        def do_quiesce(x: ScrollableLog):
-            x.pageserver.quiesce_tenants()
-
-        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
-            rt.map(do_quiesce, self.many)
-            rt.shutdown(wait=True)
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1057,24 +1057,6 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
        endpoint.start()


-# Try restarting endpoint immediately after xlog switch.
-# https://github.com/neondatabase/neon/issues/8911
-def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-
-    endpoint = env.endpoints.create_start("main")
-
-    endpoint.safe_psql("create table t (i int)")
-
-    endpoint.safe_psql("SELECT pg_switch_wal()")
-
-    # we want immediate shutdown to have endpoint restart on xlog switch record,
-    # so prevent shutdown checkpoint.
-    endpoint.stop(mode="immediate")
-    endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("SELECT 'works'")
-
-
 # Context manager which logs passed time on exit.
 class DurationLogger:
    def __init__(self, desc):
@@ -2194,43 +2176,6 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"


-def test_term_bump(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.num_safekeepers = 1
-    env = neon_env_builder.init_start()
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    endpoint = env.endpoints.create_start("main")
-    # initialize safekeeper
-    endpoint.safe_psql("create table t(key int, value text)")
-
-    http_cli = env.safekeepers[0].http_client()
-
-    # check that bump up to specific term works
-    curr_term = http_cli.timeline_status(tenant_id, timeline_id).term
-    bump_to = curr_term + 3
-    res = http_cli.term_bump(tenant_id, timeline_id, bump_to)
-    log.info(f"bump to {bump_to} res: {res}")
-    assert res.current_term >= bump_to
-
-    # check that bump to none increments current term
-    res = http_cli.term_bump(tenant_id, timeline_id, None)
-    log.info(f"bump to None res: {res}")
-    assert res.current_term > bump_to
-    assert res.current_term > res.previous_term
-
-    # check that bumping doesn't work downward
-    res = http_cli.term_bump(tenant_id, timeline_id, 2)
-    log.info(f"bump to 2 res: {res}")
-    assert res.current_term > bump_to
-    assert res.current_term == res.previous_term
-
-    # check that this doesn't kill endpoint because last WAL flush was his and
-    # thus its basebackup is still good
-    endpoint.safe_psql("insert into t values (1, 'payload')")
-
-
 # Test disables periodic pushes from safekeeper to the broker and checks that
 # pageserver can still discover safekeepers with discovery requests.
 def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -8,8 +8,6 @@ version = "0.1.0"
 description = "workspace-hack package, managed by hakari"
 # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing.
 publish = false
-edition.workspace = true
-license.workspace = true
 # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments
 # are managed by hakari.
Author	SHA1	Message	Date
Joonas Koivunen	49e72c8564	kick ci with label	2024-09-06 21:05:43 +00:00
Joonas Koivunen	42524a19b8	bench: sync before benchies the first benchmark repeatedly fails: <https://neon-github-public-dev.s3.amazonaws.com/reports/main/10743838300/index.html#behaviors/b1a8273437954620fa374b796ffaacdd/6e88523163820fdd/>	2024-09-06 21:04:29 +00:00
				`@@ -1 +0,0 @@`
				`ALTER TABLE tenant_shards DROP preferred_az_id;`
				`@@ -1 +0,0 @@`
				`ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;`