remove "next" info_span!, it showed up in flamegraphs (not too meaningful though)

add JustReadBoth validation mode
revert preceding two WIP parallel mode efforts
2026-05-26 01:20:38 +00:00 · 2024-08-16 14:40:33 +00:00 · 2024-08-16 13:34:01 +00:00 · 2024-08-16 12:26:30 +02:00 · 2024-08-16 12:26:30 +02:00 · 2024-08-16 12:26:30 +02:00
203 changed files with 2603 additions and 5757 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,30 +23,10 @@ platforms = [
 ]

 [final-excludes]
-workspace-members = [
-    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-    # it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-    # from depending on workspace-hack because most of the dependencies are not used.
-    "vm_monitor",
-    # All of these exist in libs and are not usually built independently.
-    # Putting workspace hack there adds a bottleneck for cargo builds.
-    "compute_api",
-    "consumption_metrics",
-    "desim",
-    "metrics",
-    "pageserver_api",
-    "postgres_backend",
-    "postgres_connection",
-    "postgres_ffi",
-    "pq_proto",
-    "remote_storage",
-    "safekeeper_api",
-    "tenant_size_model",
-    "tracing-utils",
-    "utils",
-    "wal_craft",
-    "walproposer",
-]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]

 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,7 @@ inputs:
  pg_version:
    description: 'Postgres version to use for tests'
    required: false
-    default: 'v16'
+    default: 'v14'
  benchmark_durations:
    description: 'benchmark durations JSON'
    required: false
@@ -169,8 +169,10 @@ runs:
          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi

-        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
+        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
+          cov_prefix=()
        else
          cov_prefix=()
        fi
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,8 +48,6 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  

-    - uses: actions/checkout@v4
-
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -94,16 +94,11 @@ jobs:
      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
      # corresponding Cargo.toml files for their descriptions.
      - name: Set env variables
-        env:
-          ARCH: ${{ inputs.arch }}
        run: |
          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
            CARGO_FLAGS="--locked"
-          elif [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix=""
-            CARGO_FLAGS="--locked"
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
@@ -163,8 +158,6 @@ jobs:
      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
      - name: Install rust binaries
-        env:
-          ARCH: ${{ inputs.arch }}
        run: |
          # Install target binaries
          mkdir -p /tmp/neon/bin/
@@ -179,7 +172,7 @@ jobs:
          done

          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" ]]; then
            # Keep bloated coverage data files away from the rest of the artifact
            mkdir -p /tmp/coverage/

@@ -217,9 +210,7 @@ jobs:
          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

          for io_engine in std-fs tokio-epoll-uring ; do
-            for io_buffer_alignment in 0 1 512 ; do
-              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-            done
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
          done

          # Run separate tests for real S3
@@ -252,8 +243,8 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    # Don't run regression tests on debug arm64 builds
-    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
+    # Run tests on x64 only
+    if: inputs.arch == 'x64'
    needs: [ build-neon ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
    container:
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -222,20 +222,13 @@ jobs:
      id: create-allure-report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
-      with:
-        store-test-results-into-db: true
-      env:
-        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      with:
-        channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
-        slack-message: |
-          Periodic replication testing: ${{ job.status }}
-          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
-          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

@@ -337,7 +330,7 @@ jobs:
  prepare_AWS_RDS_databases:
    uses: ./.github/workflows/_benchmarking_preparation.yml
    secrets: inherit
-
+  
  pgbench-compare:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    needs: [ generate-matrices, prepare_AWS_RDS_databases ]
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -198,7 +198,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        arch: [ x64, arm64 ]
+        arch: [ x64 ]
        # Do not build or run tests in debug for release branches
        build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
        include:
@@ -280,7 +280,6 @@ jobs:
          save_perf_report: ${{ github.ref_name == 'main' }}
          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
-          pg_version: v16
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -986,10 +985,10 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
            gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
@@ -999,14 +998,14 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f deployStorage=false \
@@ -1016,7 +1015,7 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f branch=main \
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -4,7 +4,7 @@ on:
  issues:
    types:
      - opened
-  pull_request_target:
+  pull_request:
    types:
      - opened

@@ -25,7 +25,7 @@ jobs:
    - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
      id: check-user
      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
          is_member=true
@@ -45,10 +45,10 @@ jobs:
      issues: write        # for `gh issue edit`

    steps:
-    - name: Add `${{ env.LABEL }}` label
+    - name: Label new ${{ github.event_name }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
-        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
+        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
+        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
      run: |
        gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1208,6 +1208,7 @@ dependencies = [
 "serde_json",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -1320,6 +1321,7 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -1668,13 +1670,14 @@ dependencies = [
 "smallvec",
 "tracing",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
 name = "diesel"
-version = "2.2.3"
+version = "2.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
+checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
@@ -3144,6 +3147,7 @@ dependencies = [
 "rand 0.8.5",
 "rand_distr",
 "twox-hash",
+ "workspace_hack",
 ]

 [[package]]
@@ -3787,6 +3791,7 @@ dependencies = [
 "strum_macros",
 "thiserror",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -4188,6 +4193,7 @@ dependencies = [
 "tokio-rustls 0.25.0",
 "tokio-util",
 "tracing",
+ "workspace_hack",
 ]

 [[package]]
@@ -4200,6 +4206,7 @@ dependencies = [
 "postgres",
 "tokio-postgres",
 "url",
+ "workspace_hack",
 ]

 [[package]]
@@ -4222,6 +4229,7 @@ dependencies = [
 "serde",
 "thiserror",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -4259,6 +4267,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tracing",
+ "workspace_hack",
 ]

 [[package]]
@@ -4823,6 +4832,7 @@ dependencies = [
 "toml_edit 0.19.10",
 "tracing",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -5347,6 +5357,7 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -5590,12 +5601,11 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.125"
+version = "1.0.96"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
+checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
 dependencies = [
 "itoa",
- "memchr",
 "ryu",
 "serde",
 ]
@@ -6183,6 +6193,7 @@ dependencies = [
 "anyhow",
 "serde",
 "serde_json",
+ "workspace_hack",
 ]

 [[package]]
@@ -6783,6 +6794,7 @@ dependencies = [
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
+ "workspace_hack",
 ]

 [[package]]
@@ -7000,6 +7012,7 @@ dependencies = [
 "url",
 "uuid",
 "walkdir",
+ "workspace_hack",
 ]

 [[package]]
@@ -7078,6 +7091,7 @@ dependencies = [
 "postgres_ffi",
 "regex",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -7098,6 +7112,7 @@ dependencies = [
 "bindgen",
 "postgres_ffi",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -7654,6 +7669,8 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.24.0",
 "tokio-util",
+ "toml_datetime",
+ "toml_edit 0.19.10",
 "tonic",
 "tower",
 "tracing",
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.


 #### Running neon database
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -379,7 +379,7 @@ where
    }
 }

-pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
    match kill(pid, None) {
        // Process exists, keep waiting
        Ok(_) => Ok(false),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,9 +15,7 @@ use control_plane::local_env::{
 };
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage_controller::{
-    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
-};
+use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -54,7 +52,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "16";
+const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
    humantime_duration.as_ref()
 }

-fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-
-    let base_port = args.get_one::<u16>("base-port");
-
-    if maybe_instance_id.is_some() && base_port.is_none() {
-        panic!("storage-controller start specificied instance-id but did not provide base-port");
-    }
-
-    let start_timeout = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-
-    NeonStorageControllerStartArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        base_port: base_port.copied(),
-        start_timeout: *start_timeout,
-    }
-}
-
-fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-    let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
-
-    NeonStorageControllerStopArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        immediate,
-    }
-}
-
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
        Some(("start", start_match)) => {
-            if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
+            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
                eprintln!("start failed: {e}");
                exit(1);
            }
        }

        Some(("stop", stop_match)) => {
-            if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
+            let immediate = stop_match
+                .get_one::<String>("stop-mode")
+                .map(|s| s.as_str())
+                == Some("immediate");
+
+            if let Err(e) = svc.stop(immediate).await {
                eprintln!("stop failed: {}", e);
                exit(1);
            }
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller
-            .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                (*retry_timeout).into(),
-            ))
-            .await
-        {
+        if let Err(e) = storage_controller.start(retry_timeout).await {
            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        eprintln!("neon broker stop failed: {e:#}");
    }

-    // Stop all storage controller instances. In the most common case there's only one,
-    // but iterate though the base data directory in order to discover the instances.
-    let storcon_instances = env
-        .storage_controller_instances()
-        .await
-        .expect("Must inspect data dir");
-    for (instance_id, _instance_dir_path) in storcon_instances {
+    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        let stop_args = NeonStorageControllerStopArgs {
-            instance_id,
-            immediate,
-        };
-
-        if let Err(e) = storage_controller.stop(stop_args).await {
-            eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
+        if let Err(e) = storage_controller.stop(immediate).await {
+            eprintln!("storage controller stop failed: {e:#}");
        }
    }
 }
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
        .action(ArgAction::SetTrue)
        .required(false);

-    let instance_id = Arg::new("instance-id")
-        .long("instance-id")
-        .help("Identifier used to distinguish storage controller instances (default 1)")
-        .value_parser(value_parser!(u8))
-        .required(false);
-
-    let base_port = Arg::new("base-port")
-        .long("base-port")
-        .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
-        .value_parser(value_parser!(u16))
-        .required(false);
-
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
                .arg_required_else_help(true)
                .about("Manage storage_controller")
                .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone())
-                            .arg(instance_id.clone())
-                            .arg(base_port))
+                            .arg(timeout_arg.clone()))
                .subcommand(Command::new("stop").about("Stop storage controller")
-                            .arg(stop_mode_arg.clone())
-                            .arg(instance_id))
+                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
            Command::new("safekeeper")
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 //
 // This data structures represents neon_local CLI config
@@ -156,11 +156,6 @@ pub struct NeonStorageControllerConf {
    #[serde(with = "humantime_serde")]
    pub max_warming_up: Duration,

-    pub start_as_candidate: bool,
-
-    /// Database url used when running multiple storage controller instances
-    pub database_url: Option<SocketAddr>,
-
    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,

@@ -179,8 +174,6 @@ impl Default for NeonStorageControllerConf {
        Self {
            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
-            start_as_candidate: false,
-            database_url: None,
            split_threshold: None,
            max_secondary_lag_bytes: None,
        }
@@ -399,36 +392,6 @@ impl LocalEnv {
        }
    }

-    /// Inspect the base data directory and extract the instance id and instance directory path
-    /// for all storage controller instances
-    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
-        let mut instances = Vec::default();
-
-        let dir = std::fs::read_dir(self.base_data_dir.clone())?;
-        for dentry in dir {
-            let dentry = dentry?;
-            let is_dir = dentry.metadata()?.is_dir();
-            let filename = dentry.file_name().into_string().unwrap();
-            let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
-                Some(suffix) => suffix.parse::<u8>().ok(),
-                None => None,
-            };
-
-            let is_instance_dir = is_dir && parsed_instance_id.is_some();
-
-            if !is_instance_dir {
-                continue;
-            }
-
-            instances.push((
-                parsed_instance_id.expect("Checked previously"),
-                dentry.path(),
-            ));
-        }
-
-        Ok(instances)
-    }
-
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -3,8 +3,6 @@ use crate::{
    local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
-use hyper::Uri;
-use nix::unistd::Pid;
 use pageserver_api::{
    controller_api::{
        NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
+use std::{fs, str::FromStr, time::Duration};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -31,14 +29,12 @@ use utils::{

 pub struct StorageController {
    env: LocalEnv,
+    listen: String,
    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
+    postgres_port: u16,
    client: reqwest::Client,
    config: NeonStorageControllerConf,
-
-    // The listen addresses is learned when starting the storage controller,
-    // hence the use of OnceLock to init it at the right time.
-    listen: OnceLock<SocketAddr>,
 }

 const COMMAND: &str = "storage_controller";
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

 const DB_NAME: &str = "storage_controller";

-pub struct NeonStorageControllerStartArgs {
-    pub instance_id: u8,
-    pub base_port: Option<u16>,
-    pub start_timeout: humantime::Duration,
-}
-
-impl NeonStorageControllerStartArgs {
-    pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
-        Self {
-            instance_id: 1,
-            base_port: None,
-            start_timeout,
-        }
-    }
-}
-
-pub struct NeonStorageControllerStopArgs {
-    pub instance_id: u8,
-    pub immediate: bool,
-}
-
-impl NeonStorageControllerStopArgs {
-    pub fn with_default_instance_id(immediate: bool) -> Self {
-        Self {
-            instance_id: 1,
-            immediate,
-        }
-    }
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -101,6 +67,23 @@ pub struct InspectResponse {

 impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
+        // Makes no sense to construct this if pageservers aren't going to use it: assume
+        // pageservers have control plane API set
+        let listen_url = env.control_plane_api.clone().unwrap();
+
+        let listen = format!(
+            "{}:{}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        );
+
+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
        // Assume all pageservers have symmetric auth configuration: this service
        // expects to use one JWT token to talk to all of them.
        let ps_conf = env
@@ -143,28 +126,20 @@ impl StorageController {

        Self {
            env: env.clone(),
+            listen,
            private_key,
            public_key,
+            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
            config: env.storage_controller.clone(),
-            listen: OnceLock::default(),
        }
    }

-    fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
-        self.env
-            .base_data_dir
-            .join(format!("storage_controller_{}", instance_id))
-    }
-
-    fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.storage_controller_instance_dir(instance_id)
-                .join("storage_controller.pid"),
-        )
-        .expect("non-Unicode path")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
+            .expect("non-Unicode path")
    }

    /// PIDFile for the postgres instance used to store storage controller state
@@ -209,23 +184,23 @@ impl StorageController {
    }

    /// Readiness check for our postgres process
-    async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
        let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
+        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;

        Ok(exitcode.success())
    }

-    /// Create our database if it doesn't exist
+    /// Create our database if it doesn't exist, and run migrations.
    ///
    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
    /// who just want to run `cargo neon_local` without knowing about diesel.
    ///
    /// Returns the database url
-    pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let createdb_path = pg_bin_dir.join("createdb");
@@ -234,7 +209,7 @@ impl StorageController {
                "-h",
                "localhost",
                "-p",
-                &format!("{}", postgres_port),
+                &format!("{}", self.postgres_port),
                DB_NAME,
            ])
            .output()
@@ -255,14 +230,13 @@ impl StorageController {

    pub async fn connect_to_database(
        &self,
-        postgres_port: u16,
    ) -> anyhow::Result<(
        tokio_postgres::Client,
        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
    )> {
        tokio_postgres::Config::new()
            .host("localhost")
-            .port(postgres_port)
+            .port(self.postgres_port)
            // The user is the ambient operating system user name.
            // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
            //
@@ -278,114 +252,72 @@ impl StorageController {
            .map_err(anyhow::Error::new)
    }

-    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
-        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
-        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
-            if err.kind() != std::io::ErrorKind::AlreadyExists {
-                panic!("Failed to create instance dir {instance_dir:?}");
-            }
-        }
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the storage controller for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");

-        let (listen, postgres_port) = {
-            if let Some(base_port) = start_args.base_port {
-                (
-                    format!("127.0.0.1:{base_port}"),
-                    self.config
-                        .database_url
-                        .expect("--base-port requires NeonStorageControllerConf::database_url")
-                        .port(),
-                )
-            } else {
-                let listen_url = self.env.control_plane_api.clone().unwrap();
-
-                let listen = format!(
-                    "{}:{}",
-                    listen_url.host_str().unwrap(),
-                    listen_url.port().unwrap()
-                );
-
-                (listen, listen_url.port().unwrap() + 1)
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .envs(vec![
+                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ])
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
            }
        };

-        let socket_addr = listen
-            .parse()
-            .expect("listen address is a valid socket address");
-        self.listen
-            .set(socket_addr)
-            .expect("StorageController::listen is only set here");
+        // Write a minimal config file:
+        // - Specify the port, since this is chosen dynamically
+        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+        //   the storage controller we don't want a slow local disk to interfere with that.
+        //
+        // NB: it's important that we rewrite this file on each start command so we propagate changes
+        // from `LocalEnv`'s config file (`.neon/config`).
+        tokio::fs::write(
+            &pg_data_path.join("postgresql.conf"),
+            format!("port = {}\nfsync=off\n", self.postgres_port),
+        )
+        .await?;

-        // Do we remove the pid file on stop?
-        let pg_started = self.is_postgres_running().await?;
-        let pg_lib_dir = self.get_pg_lib_dir().await?;
+        println!("Starting storage controller database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];

-        if !pg_started {
-            // Start a vanilla Postgres process used by the storage controller for persistence.
-            let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
-                .unwrap()
-                .join("storage_controller_db");
-            let pg_bin_dir = self.get_pg_bin_dir().await?;
-            let pg_log_path = pg_data_path.join("postgres.log");
+        background_process::start_process(
+            "storage_controller_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            vec![
+                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            retry_timeout,
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;

-            if !tokio::fs::try_exists(&pg_data_path).await? {
-                // Initialize empty database
-                let initdb_path = pg_bin_dir.join("initdb");
-                let mut child = Command::new(&initdb_path)
-                    .envs(vec![
-                        ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                        ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ])
-                    .args(["-D", pg_data_path.as_ref()])
-                    .spawn()
-                    .expect("Failed to spawn initdb");
-                let status = child.wait().await?;
-                if !status.success() {
-                    anyhow::bail!("initdb failed with status {status}");
-                }
-            };
-
-            // Write a minimal config file:
-            // - Specify the port, since this is chosen dynamically
-            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-            //   the storage controller we don't want a slow local disk to interfere with that.
-            //
-            // NB: it's important that we rewrite this file on each start command so we propagate changes
-            // from `LocalEnv`'s config file (`.neon/config`).
-            tokio::fs::write(
-                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", postgres_port),
-            )
-            .await?;
-
-            println!("Starting storage controller database...");
-            let db_start_args = [
-                "-w",
-                "-D",
-                pg_data_path.as_ref(),
-                "-l",
-                pg_log_path.as_ref(),
-                "start",
-            ];
-
-            background_process::start_process(
-                "storage_controller_db",
-                &self.env.base_data_dir,
-                pg_bin_dir.join("pg_ctl").as_std_path(),
-                db_start_args,
-                vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ],
-                background_process::InitialPidFile::Create(self.postgres_pid_file()),
-                &start_args.start_timeout,
-                || self.pg_isready(&pg_bin_dir, postgres_port),
-            )
-            .await?;
-
-            self.setup_database(postgres_port).await?;
-        }
-
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;

        // We support running a startup SQL script to fiddle with the database before we launch storcon.
        // This is used by the test suite.
@@ -407,7 +339,7 @@ impl StorageController {
                }
            }
        };
-        let (mut client, conn) = self.connect_to_database(postgres_port).await?;
+        let (mut client, conn) = self.connect_to_database().await?;
        let conn = tokio::spawn(conn);
        let tx = client.build_transaction();
        let tx = tx.start().await?;
@@ -416,20 +348,9 @@ impl StorageController {
        drop(client);
        conn.await??;

-        let listen = self
-            .listen
-            .get()
-            .expect("cell is set earlier in this function");
-        let address_for_peers = Uri::builder()
-            .scheme("http")
-            .authority(format!("{}:{}", listen.ip(), listen.port()))
-            .path_and_query("")
-            .build()
-            .unwrap();
-
        let mut args = vec![
            "-l",
-            &listen.to_string(),
+            &self.listen,
            "--dev",
            "--database-url",
            &database_url,
@@ -437,27 +358,15 @@ impl StorageController {
            &humantime::Duration::from(self.config.max_offline).to_string(),
            "--max-warming-up-interval",
            &humantime::Duration::from(self.config.max_warming_up).to_string(),
-            "--address-for-peers",
-            &address_for_peers.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
        .collect::<Vec<_>>();
-
-        if self.config.start_as_candidate {
-            args.push("--start-as-candidate".to_string());
-        }
-
        if let Some(private_key) = &self.private_key {
            let claims = Claims::new(None, Scope::PageServerApi);
            let jwt_token =
                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
            args.push(format!("--jwt-token={jwt_token}"));
-
-            let peer_claims = Claims::new(None, Scope::Admin);
-            let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
-                .expect("failed to generate jwt token");
-            args.push(format!("--peer-jwt-token={peer_jwt_token}"));
        }

        if let Some(public_key) = &self.public_key {
@@ -485,15 +394,15 @@ impl StorageController {

        background_process::start_process(
            COMMAND,
-            &instance_dir,
+            &self.env.base_data_dir,
            &self.env.storage_controller_bin(),
            args,
            vec![
                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
            ],
-            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
-            &start_args.start_timeout,
+            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
            || async {
                match self.ready().await {
                    Ok(_) => Ok(true),
@@ -506,35 +415,8 @@ impl StorageController {
        Ok(())
    }

-    pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
-        background_process::stop_process(
-            stop_args.immediate,
-            COMMAND,
-            &self.pid_file(stop_args.instance_id),
-        )?;
-
-        let storcon_instances = self.env.storage_controller_instances().await?;
-        for (instance_id, instanced_dir_path) in storcon_instances {
-            if instance_id == stop_args.instance_id {
-                continue;
-            }
-
-            let pid_file = instanced_dir_path.join("storage_controller.pid");
-            let pid = tokio::fs::read_to_string(&pid_file)
-                .await
-                .map_err(|err| {
-                    anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
-                })?
-                .parse::<i32>()
-                .expect("pid is valid i32");
-
-            let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
-            if other_proc_alive {
-                // There is another storage controller instance running, so we return
-                // and leave the database running.
-                return Ok(());
-            }
-        }
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;

        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -547,51 +429,27 @@ impl StorageController {
            .wait()
            .await?;
        if !stop_status.success() {
-            match self.is_postgres_running().await {
-                Ok(false) => {
-                    println!("Storage controller database is already stopped");
-                    return Ok(());
-                }
-                Ok(true) => {
-                    anyhow::bail!("Failed to stop storage controller database");
-                }
-                Err(err) => {
-                    anyhow::bail!("Failed to stop storage controller database: {err}");
-                }
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Storage controller database is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
            }
        }

        Ok(())
    }

-    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
-        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-
-        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_status_args)
-            .spawn()?
-            .wait()
-            .await?;
-
-        // pg_ctl status returns this exit code if postgres is not running: in this case it is
-        // fine that stop failed.  Otherwise it is an error that stop failed.
-        const PG_STATUS_NOT_RUNNING: i32 = 3;
-        const PG_NO_DATA_DIR: i32 = 4;
-        const PG_STATUS_RUNNING: i32 = 0;
-        match status_exitcode.code() {
-            Some(PG_STATUS_NOT_RUNNING) => Ok(false),
-            Some(PG_NO_DATA_DIR) => Ok(false),
-            Some(PG_STATUS_RUNNING) => Ok(true),
-            Some(code) => Err(anyhow::anyhow!(
-                "pg_ctl status returned unexpected status code: {:?}",
-                code
-            )),
-            None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
-        }
-    }
-
    fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
        let category = match path.find('/') {
            Some(idx) => &path[..idx],
@@ -617,31 +475,15 @@ impl StorageController {
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
-        // In the special case of the `storage_controller start` subcommand, we wish
-        // to use the API endpoint of the newly started storage controller in order
-        // to pass the readiness check. In this scenario [`Self::listen`] will be set
-        // (see [`Self::start`]).
-        //
-        // Otherwise, we infer the storage controller api endpoint from the configured
-        // control plane API.
-        let url = if let Some(socket_addr) = self.listen.get() {
-            Url::from_str(&format!(
-                "http://{}:{}/{path}",
-                socket_addr.ip().to_canonical(),
-                socket_addr.port()
-            ))
-            .unwrap()
-        } else {
-            // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-            // for general purpose API access.
-            let listen_url = self.env.control_plane_api.clone().unwrap();
-            Url::from_str(&format!(
-                "http://{}:{}/{path}",
-                listen_url.host_str().unwrap(),
-                listen_url.port().unwrap()
-            ))
-            .unwrap()
-        };
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let listen_url = self.env.control_plane_api.clone().unwrap();
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        ))
+        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -147,9 +147,9 @@ enum Command {
        #[arg(long)]
        threshold: humantime::Duration,
    },
-    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
+    // Drain a set of specified pageservers by moving the primary attachments to pageservers
    // outside of the specified set.
-    BulkMigrate {
+    Drain {
        // Set of pageserver node ids to drain.
        #[arg(long)]
        nodes: Vec<NodeId>,
@@ -163,34 +163,6 @@ enum Command {
        #[arg(long)]
        dry_run: Option<bool>,
    },
-    /// Start draining the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartDrain {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel draining the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelDrain {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
-    /// Start filling the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartFill {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel filling the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelFill {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
 }

 #[derive(Parser)]
@@ -277,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

-async fn wait_for_scheduling_policy<F>(
-    client: Client,
-    node_id: NodeId,
-    timeout: Duration,
-    f: F,
-) -> anyhow::Result<NodeSchedulingPolicy>
-where
-    F: Fn(NodeSchedulingPolicy) -> bool,
-{
-    let waiter = tokio::time::timeout(timeout, async move {
-        loop {
-            let node = client
-                .dispatch::<(), NodeDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}"),
-                    None,
-                )
-                .await?;
-
-            if f(node.scheduling) {
-                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
-            }
-        }
-    });
-
-    Ok(waiter.await??)
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -684,7 +628,7 @@ async fn main() -> anyhow::Result<()> {
                })
                .await?;
        }
-        Command::BulkMigrate {
+        Command::Drain {
            nodes,
            concurrency,
            max_shards,
@@ -713,7 +657,7 @@ async fn main() -> anyhow::Result<()> {
            }

            if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
+                anyhow::bail!("Drain requested for node which doesn't exist.")
            }

            node_to_fill_descs.retain(|desc| {
@@ -725,7 +669,7 @@ async fn main() -> anyhow::Result<()> {
            });

            if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to migrate to")
+                anyhow::bail!("There are no nodes to drain to")
            }

            // Set the node scheduling policy to draining for the nodes which
@@ -746,7 +690,7 @@ async fn main() -> anyhow::Result<()> {
                    .await?;
            }

-            // Perform the migration: move each tenant shard scheduled on a node to
+            // Perform the drain: move each tenant shard scheduled on a node to
            // be drained to a node which is being filled. A simple round robin
            // strategy is used to pick the new node.
            let tenants = storcon_client
@@ -759,13 +703,13 @@ async fn main() -> anyhow::Result<()> {

            let mut selected_node_idx = 0;

-            struct MigrationMove {
+            struct DrainMove {
                tenant_shard_id: TenantShardId,
                from: NodeId,
                to: NodeId,
            }

-            let mut moves: Vec<MigrationMove> = Vec::new();
+            let mut moves: Vec<DrainMove> = Vec::new();

            let shards = tenants
                .into_iter()
@@ -795,7 +739,7 @@ async fn main() -> anyhow::Result<()> {
                    continue;
                }

-                moves.push(MigrationMove {
+                moves.push(DrainMove {
                    tenant_shard_id: shard.tenant_shard_id,
                    from: shard
                        .node_attached
@@ -872,67 +816,6 @@ async fn main() -> anyhow::Result<()> {
                failure
            );
        }
-        Command::StartDrain { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-            println!("Drain started for {node_id}");
-        }
-        Command::CancelDrain { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active | PauseForRestart)
-                })
-                .await?;
-
-            println!(
-                "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
-        Command::StartFill { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
-                .await?;
-
-            println!("Fill started for {node_id}");
-        }
-        Command::CancelFill { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/fill"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active)
-                })
-                .await?;
-
-            println!(
-                "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
    }

    Ok(())
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
 during the restart at 2024-04-03 16:37 UTC.

 Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
 and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.

 This problem is not yet very acutely felt in storage controller managed pageservers since
--- a/docs/rfcs/036-physical-replication.md
+++ b/docs/rfcs/036-physical-replication.md
@@ -1,265 +0,0 @@
-# Physical Replication
-
-This RFC is a bit special in that we have already implemented physical
-replication a long time ago. However, we never properly wrote down all
-the decisions and assumptions, and in the last months when more users
-have started to use the feature, numerous issues have surfaced.
-
-This RFC documents the design decisions that have been made.
-
-## Summary
-
-PostgreSQL has a feature called streaming replication, where a replica
-streams WAL from the primary and continuously applies it. It is also
-known as "physical replication", to distinguish it from logical
-replication.  In PostgreSQL, a replica is initialized by taking a
-physical backup of the primary. In Neon, the replica is initialized
-from a slim "base backup" from the pageserver, just like a primary,
-and the primary and the replicas connect to the same pageserver,
-sharing the storage.
-
-There are two kinds of read-only replicas in Neon:
- replicas that follow the primary, and
- "static" replicas that are pinned at a particular LSN.
-
-A static replica is useful e.g. for performing time-travel queries and
-running one-off slow queries without affecting the primary. A replica
-that follows the primary can be used e.g. to scale out read-only
-workloads.
-
-## Motivation
-
-Read-only replicas allow offloading read-only queries. It's useful for
-isolation, if you want to make sure that read-only queries don't
-affect the primary, and it's also an easy way to provide guaranteed
-read-only access to an application, without having to mess with access
-controls.
-
-## Non Goals (if relevant)
-
-This RFC is all about WAL-based *physical* replication. Logical
-replication is a different feature.
-
-Neon also has the capability to launch "static" read-only nodes which
-do not follow the primary, but are pinned to a particular LSN. They
-can be used for long-running one-off queries, or for Point-in-time
-queries. They work similarly to read replicas that follow the primary,
-but some things are simpler: there are no concerns about cache
-invalidation when the data changes on the primary, or worrying about
-transactions that are in-progress on the primary.
-
-## Impacted components (e.g. pageserver, safekeeper, console, etc)
-
- Control plane launches the replica
- Replica Postgres instance connects to the safekeepers, to stream the WAL
- The primary does not know about the standby, except for the hot standby feedback
- The primary and replicas all connect to the same pageservers
-
-
-# Context
-
-Some useful things to know about hot standby and replicas in
-PostgreSQL.
-
-## PostgreSQL startup sequence
-
-"Running" and "start up" terms are little imprecise. PostgreSQL
-replica startup goes through several stages:
-
-1. First, the process is started up, and various initialization steps
-   are performed, like initializing shared memory. If you try to
-   connect to the server in this stage, you get an error: ERROR: the
-   database system is starting up. This stage happens very quickly, no
-
-2. Then the server reads the checpoint record from the WAL and starts
-   the WAL replay starting from the checkpoint. This works differently
-   in Neon: we start the WAL replay at the basebackup LSN, not from a
-   checkpoint! If you connect to the server in this state, you get an
-   error: ERROR: the database system is not yet accepting
-   connections. We proceed to the next stage, when the WAL replay sees
-   a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
-   can allow us to move directly to next stage, with all the caveats
-   listed in this RFC.
-
-3. When the running-xacts information is established, the server
-   starts to accept connections normally.
-
-From PostgreSQL's point of view, the server is already running in
-stage 2, even though it's not accepting connections yet. Our
-`compute_ctl` does not consider it as running until stage 3. If the
-transition from stage 2 to 3 doesn't happen fast enough, the control
-plane will mark the start operation as failed.
-
-
-## Decisions, Issues
-
-### Cache invalidation in replica
-
-When a read replica follows the primary in PostgreSQL, it needs to
-stream all the WAL from the primary and apply all the records, to keep
-the local copy of the data consistent with the primary. In Neon, the
-replica can fetch the updated page versions from the pageserver, so
-it's not necessary to apply all the WAL. However, it needs to ensure
-that any pages that are currently in the Postgres buffer cache, or the
-Local File Cache, are either updated, or thrown away so that the next
-read of the page will fetch the latest version.
-
-We choose to apply the WAL records for pages that are already in the
-buffer cache, and skip records for other pages. Somewhat arbitrarily,
-we also apply records affecting catalog relations, fetching the old
-page version from the pageserver if necessary first. See
-`neon_redo_read_buffer_filter()` function.
-
-The replica wouldn't necessarily need to see all the WAL records, only
-the records that apply to cached pages. For simplicity, we do stream
-all the WAL to the replica, and the replica simply ignores WAL records
-that require no action.
-
-Like in PostgreSQL, the read replica maintains a "replay LSN", which
-is the LSN up to which the replica has received and replayed the
-WAL. The replica can lag behind the primary, if it cannot quite keep
-up with the primary, or if a long-running query conflicts with changes
-that are about to be applied, or even intentionally if the user wishes
-to see delayed data (see recovery_min_apply_delay). It's important
-that the replica sees a consistent view of the whole cluster at the
-replay LSN, when it's lagging behind.
-
-In Neon, the replica connects to a safekeeper to get the WAL
-stream. That means that the safekeepers must be able to regurgitate
-the original WAL as far back as the replay LSN of any running read
-replica. (A static read-only node that does not follow the primary
-does not require a WAL stream however). The primary does not need to
-be running, and when it is, the replicas don't incur any extra
-overhead to the primary (see hot standby feedback though).
-
-### In-progress transactions
-
-In PostgreSQL, when a hot standby server starts up, it cannot
-immediately open up for queries (see [PostgreSQL startup
-sequence]). It first needs to establish a complete list of in-progress
-transactions, including subtransactions, that are running at the
-primary, at the current replay LSN. Normally that happens quickly,
-when the replica sees a "running-xacts" WAL record, because the
-primary writes a running-xacts WAL record at every checkpoint, and in
-PostgreSQL the replica always starts the WAL replay from a checkpoint
-REDO point. (A shutdown checkpoint WAL record also implies that all
-the non-prepared transactions have ended.) If there are a lot of
-subtransactions in progress, however, the standby might need to wait
-for old transactions to complete before it can open up for queries.
-
-In Neon that problem is worse: a replica can start at any LSN, so
-there's no guarantee that it will see a running-xacts record any time
-soon. In particular, if the primary is not running when the replica is
-started, it might never see a running-xacts record.
-
-To make things worse, we initially missed this issue, and always
-started accepting queries at replica startup, even if it didn't have
-the transaction information. That could lead to incorrect query
-results and data corruption later. However, as we fixed that, we
-introduced a new problem compared to what we had before: previously
-the replica would always start up, but after fixing that bug, it might
-not. In a superficial way, the old behavior was better (but could lead
-to serious issues later!). That made fixing that bug very hard,
-because as we fixed it, we made things (superficially) worse for
-others.
-
-See https://github.com/neondatabase/neon/pull/7288 which fixed the
-bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
-and https://github.com/neondatabase/neon/pull/8484 to try to claw back
-the cases that started to cause trouble after fixing it. As of this
-writing, there are still cases where a replica might not immediately
-start up, causing the control plane operation to fail. The remaining
-issues are tracked in https://github.com/neondatabase/neon/issues/6211.
-
-One long-term fix for this is to switch to using so-called CSN
-snapshots in read replica. That would make it unnecessary to have the
-full in-progress transaction list in the replica at startup time. See
-https://commitfest.postgresql.org/48/4912/ for a work-in-progress
-patch to upstream to implement that.
-
-Another thing we could do is to teach the control plane about that
-distinction between "starting up" and "running but haven't received
-running-xacts information yet", so that we could keep the replica
-waiting longer in that stage, and also give any client connections the
-same `ERROR: the database system is not yet accepting connections`
-error that you get in standalone PostgreSQL in that state.
-
-
-### Recovery conflicts and Hot standby feedback
-
-It's possible that a tuple version is vacuumed away in the primary,
-even though it is still needed by a running transaction in the
-replica. This is called a "recovery conflict", and PostgreSQL provides
-various options for dealing with it. By default, the WAL replay will
-wait up to 30 s for the conflicting query to finish. After that, it
-will kill the running query, so that the WAL replay can proceed.
-
-Another way to avoid the situation is to enable the
-[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
-option. When it is enabled, the primary will refrain from vacuuming
-tuples that are still needed in the replica. That means potentially
-bloating the primary, which violates the usual rule that read replicas
-don't affect the operations on the primary, which is why it's off by
-default. We leave it to users to decide if they want to turn it on,
-same as PostgreSQL.
-
-Neon supports `hot_standby_feedback` by passing the feedback messages
-from the replica to the safekeepers, and from safekeepers to the
-primary.
-
-### Relationship of settings between primary and replica
-
-In order to enter hot standby mode, some configuration options need to
-be set to the same or larger values in the standby, compared to the
-primary.  See [explanation in the PostgreSQL
-docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
-
-In Neon, we have this problem too. To prevent customers from hitting
-it, the control plane automatically adjusts the settings of a replica,
-so that they match or exceed the primary's settings (see
-https://github.com/neondatabase/cloud/issues/14903). However, you
-can still hit the issue if the primary is restarted with larger
-settings, while the replica is running.
-
-
-### Interaction with Pageserver GC
-
-The read replica can lag behind the primary. If there are recovery
-conflicts or the replica cannot keep up for some reason, the lag can
-in principle grow indefinitely. The replica will issue all GetPage
-requests to the pageservers at the current replay LSN, and needs to
-see the old page versions.
-
-If the retention period in the pageserver is set to be small, it may
-have already garbage collected away the old page versions. That will
-cause read errors in the compute, and can mean that the replica cannot
-make progress with the replication anymore.
-
-There is a mechanism for the replica to pass information about its replay
-LSN to the pageserver, so that the pageserver refrains from GC'ing
-data that is still needed by the standby. It's called
-'standby_horizon' in the pageserver code, see
-https://github.com/neondatabase/neon/pull/7368. A separate "lease"
-mechanism also is in the works, where the replica could hold a lease
-on the old LSN, preventing the pageserver from advancing the GC
-horizon past that point. The difference is that the standby_horizon
-mechanism relies on a feedback message from the replica to the safekeeper,
-while the lease API is exposed directly from the pageserver. A static
-read-only node is not connected to safekeepers, so it cannot use the
-standby_horizon mechanism.
-
-
-### Synchronous replication
-
-We haven't put any effort into synchronous replication yet.
-
-PostgreSQL provides multiple levels of synchronicity. In the weaker
-levels, a transaction is not acknowledged as committed to the client
-in the primary until the WAL has been streamed to a replica or flushed
-to disk there. Those modes don't make sense in Neon, because the
-safekeepers handle durability.
-
-`synchronous_commit=remote_apply` mode would make sense. In that mode,
-the commit is not acknowledged to the client until it has been
-replayed in the replica. That ensures that after commit, you can see
-the commit in the replica too (a.k.a. read-your-writes consistency).
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.

    ```shell
-    git checkout -b my-branch-15 REL_15_STABLE_neon
+    git checkout -b my-branch REL_15_STABLE_neon
    ```

-1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
+1. Tag the last commit on the stable branch you are updating.

-1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.

    ```shell
    git fetch upstream REL_15_4
-    git merge REL_15_4
+    git rebase REL_15_4
    ```

-    In the commit message of the merge commit, mention if there were
-    any non-trivial conflicts or other issues.
-
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.

@@ -48,7 +57,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15
+    git push origin my-branch
    ```

 1. Clone the Neon repository if you have not done so already.
@@ -65,7 +74,7 @@ branch.
 1. Update the Git submodule.

    ```shell
-    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
+    git submodule set-branch --branch my-branch vendor/postgres-v15
    git submodule update --remote vendor/postgres-v15
    ```

@@ -80,12 +89,14 @@ minor Postgres release.

 1. Create a pull request, and wait for CI to go green.

-1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
+1. Force push the rebased Postgres branches into the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15:REL_15_STABLE_neon
+    git push --force origin my-branch:REL_15_STABLE_neon
    ```

+    It may require disabling various branch protections.
+
 1. Update your Neon PR to point at the branches.

    ```shell
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,3 +14,5 @@ regex.workspace = true

 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
+
+workspace_hack.workspace = true
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,8 +6,10 @@ license = "Apache-2.0"

 [dependencies]
 anyhow.workspace = true
-chrono = { workspace = true, features = ["serde"] }
+chrono.workspace = true
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -14,3 +14,5 @@ parking_lot.workspace = true
 hex.workspace = true
 scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
+
+workspace_hack.workspace = true
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -12,6 +12,8 @@ chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true

+workspace_hack.workspace = true
+
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,9 +21,11 @@ hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-chrono = { workspace = true, features = ["serde"] }
+chrono.workspace = true
 itertools.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 bincode.workspace = true
 rand.workspace = true
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId};

-use crate::models::PageserverUtilization;
 use crate::{
    models::{ShardParameters, TenantConfig},
    shard::{ShardStripeSize, TenantShardId},
@@ -141,11 +140,23 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }

-#[derive(Serialize, Clone, Debug)]
+/// Utilisation score indicating how good a candidate a pageserver
+/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
+/// Lower values are better.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
+pub struct UtilizationScore(pub u64);
+
+impl UtilizationScore {
+    pub fn worst() -> Self {
+        UtilizationScore(u64::MAX)
+    }
+}
+
+#[derive(Serialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active(PageserverUtilization),
+    Active(UtilizationScore),
    // Node is warming up, but we expect it to become available soon. Covers
    // the time span between the re-attach response being composed on the storage controller
    // and the first successful heartbeat after the processing of the re-attach response
@@ -184,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
        match val {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
-            NodeAvailabilityWrapper::Active => {
-                NodeAvailability::Active(PageserverUtilization::full())
-            }
+            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
            NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
@@ -304,17 +313,20 @@ pub struct MetadataHealthUpdateRequest {
 pub struct MetadataHealthUpdateResponse {}

 #[derive(Serialize, Deserialize, Debug)]
+
 pub struct MetadataHealthListUnhealthyResponse {
    pub unhealthy_tenant_shards: Vec<TenantShardId>,
 }

 #[derive(Serialize, Deserialize, Debug)]
+
 pub struct MetadataHealthListOutdatedRequest {
    #[serde(with = "humantime_serde")]
    pub not_scrubbed_for: Duration,
 }

 #[derive(Serialize, Deserialize, Debug)]
+
 pub struct MetadataHealthListOutdatedResponse {
    pub health_records: Vec<MetadataHealthRecord>,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -348,7 +348,7 @@ impl AuxFilePolicy {

    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
    pub fn default_tenant_config() -> Self {
-        Self::V2
+        Self::V1
    }
 }

--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
    pub max_shard_count: u32,

    /// Cached result of [`Self::score`]
-    pub utilization_score: Option<u64>,
+    pub utilization_score: u64,

    /// When was this snapshot captured, pageserver local time.
    ///
@@ -50,8 +50,6 @@ fn unity_percent() -> Percent {
    Percent::new(0).unwrap()
 }

-pub type RawScore = u64;
-
 impl PageserverUtilization {
    const UTILIZATION_FULL: u64 = 1000000;

@@ -64,7 +62,7 @@ impl PageserverUtilization {
    /// - Negative values are forbidden
    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
    ///   layer eviction.
-    pub fn score(&self) -> RawScore {
+    pub fn score(&self) -> u64 {
        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
            * self.disk_usable_pct.get() as u64)
            / 100;
@@ -76,30 +74,8 @@ impl PageserverUtilization {
        std::cmp::max(disk_utilization_score, shard_utilization_score)
    }

-    pub fn cached_score(&mut self) -> RawScore {
-        match self.utilization_score {
-            None => {
-                let s = self.score();
-                self.utilization_score = Some(s);
-                s
-            }
-            Some(s) => s,
-        }
-    }
-
-    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
-    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
-    pub fn is_overloaded(score: RawScore) -> bool {
-        score >= Self::UTILIZATION_FULL
-    }
-
-    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
-        if self.shard_count < shard_count {
-            self.shard_count = shard_count;
-
-            // Dirty cache: this will be calculated next time someone retrives the score
-            self.utilization_score = None;
-        }
+    pub fn refresh_score(&mut self) {
+        self.utilization_score = self.score();
    }

    /// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -112,38 +88,7 @@ impl PageserverUtilization {
            disk_usable_pct: Percent::new(100).unwrap(),
            shard_count: 1,
            max_shard_count: 1,
-            utilization_score: Some(Self::UTILIZATION_FULL),
-            captured_at: serde_system_time::SystemTime(SystemTime::now()),
-        }
-    }
-}
-
-/// Test helper
-pub mod test_utilization {
-    use super::PageserverUtilization;
-    use std::time::SystemTime;
-    use utils::{
-        serde_percent::Percent,
-        serde_system_time::{self},
-    };
-
-    // Parameters of the imaginary node used for test utilization instances
-    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
-    const TEST_SHARDS_MAX: u32 = 1000;
-
-    /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
-    /// not abuse this function from non-test code.
-    ///
-    /// Emulates a node with a 1000 shard limit and a 1TB disk.
-    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
-        PageserverUtilization {
-            disk_usage_bytes: disk_wanted_bytes,
-            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
-            disk_wanted_bytes,
-            disk_usable_pct: Percent::new(100).unwrap(),
-            shard_count,
-            max_shard_count: TEST_SHARDS_MAX,
-            utilization_score: None,
+            utilization_score: Self::UTILIZATION_FULL,
            captured_at: serde_system_time::SystemTime(SystemTime::now()),
        }
    }
@@ -175,7 +120,7 @@ mod tests {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            disk_wanted_bytes: u64::MAX,
-            utilization_score: Some(13),
+            utilization_score: 13,
            disk_usable_pct: Percent::new(90).unwrap(),
            shard_count: 100,
            max_shard_count: 200,
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
 tracing.workspace = true

 pq_proto.workspace = true
+workspace_hack.workspace = true

 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -11,5 +11,7 @@ postgres.workspace = true
 tokio-postgres.workspace = true
 url.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,6 +19,8 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -14,6 +14,8 @@ postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,7 +11,9 @@ itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-tokio = { workspace = true, features = ["io-util"] }
+tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -32,7 +32,7 @@ scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
-
+workspace_hack.workspace = true
 azure_core.workspace = true
 azure_identity.workspace = true
 azure_storage.workspace = true
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
 camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
-tokio = { workspace = true, features = ["test-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -383,48 +383,6 @@ impl RemoteStorage for AzureBlobStorage {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let kind = RequestKind::Head;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
-
-        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
-        let properties_future = blob_client.get_properties().into_future();
-
-        let properties_future = tokio::time::timeout(self.timeout, properties_future);
-
-        let res = tokio::select! {
-            res = properties_future => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        if let Ok(inner) = &res {
-            // do not incl. timeouts as errors in metrics but cancellations
-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, inner, started_at);
-        }
-
-        let data = match res {
-            Ok(Ok(data)) => Ok(data),
-            Ok(Err(sdk)) => Err(to_download_error(sdk)),
-            Err(_timeout) => Err(DownloadError::Timeout),
-        }?;
-
-        let properties = data.blob.properties;
-        Ok(ListingObject {
-            key: key.to_owned(),
-            last_modified: SystemTime::from(properties.last_modified),
-            size: properties.content_length,
-        })
-    }
-
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -150,7 +150,7 @@ pub enum ListingMode {
    NoDelimiter,
 }

-#[derive(PartialEq, Eq, Debug, Clone)]
+#[derive(PartialEq, Eq, Debug)]
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
@@ -215,13 +215,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        Ok(combined)
    }

-    /// Obtain metadata information about an object.
-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError>;
-
     /// Streams the local file contents into the remote storage entry.
    ///
    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -370,20 +363,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // See [`RemoteStorage::head_object`].
-    pub async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.head_object(key, cancel).await,
-            Self::AwsS3(s) => s.head_object(key, cancel).await,
-            Self::AzureBlob(s) => s.head_object(key, cancel).await,
-            Self::Unreliable(s) => s.head_object(key, cancel).await,
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -619,7 +598,6 @@ impl ConcurrencyLimiter {
            RequestKind::Delete => &self.write,
            RequestKind::Copy => &self.write,
            RequestKind::TimeTravel => &self.write,
-            RequestKind::Head => &self.read,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,20 +445,6 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        _cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let target_file_path = key.with_base(&self.storage_root);
-        let metadata = file_metadata(&target_file_path).await?;
-        Ok(ListingObject {
-            key: key.clone(),
-            last_modified: metadata.modified()?,
-            size: metadata.len(),
-        })
-    }
-
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -13,7 +13,6 @@ pub(crate) enum RequestKind {
    List = 3,
    Copy = 4,
    TimeTravel = 5,
-    Head = 6,
 }

 use scopeguard::ScopeGuard;
@@ -28,7 +27,6 @@ impl RequestKind {
            List => "list_objects",
            Copy => "copy_object",
            TimeTravel => "time_travel_recover",
-            Head => "head_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -36,8 +34,7 @@ impl RequestKind {
    }
 }

-const REQUEST_KIND_COUNT: usize = 7;
-pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
+pub(crate) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
    pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -46,8 +43,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
-        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -23,7 +23,7 @@ use aws_config::{
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
-    operation::{get_object::GetObjectError, head_object::HeadObjectError},
+    operation::get_object::GetObjectError,
    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
    Client,
 };
@@ -604,78 +604,6 @@ impl RemoteStorage for S3Bucket {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let kind = RequestKind::Head;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
-
-        let head_future = self
-            .client
-            .head_object()
-            .bucket(self.bucket_name())
-            .key(self.relative_path_to_s3_object(key))
-            .send();
-
-        let head_future = tokio::time::timeout(self.timeout, head_future);
-
-        let res = tokio::select! {
-            res = head_future => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let res = res.map_err(|_e| DownloadError::Timeout)?;
-
-        // do not incl. timeouts as errors in metrics but cancellations
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        let data = match res {
-            Ok(object_output) => object_output,
-            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
-                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
-                // an error: we expect to sometimes fetch an object and find it missing,
-                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
-                return Err(DownloadError::NotFound);
-            }
-            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Err,
-                    started_at,
-                );
-
-                return Err(DownloadError::Other(
-                    anyhow::Error::new(e).context("s3 head object"),
-                ));
-            }
-        };
-
-        let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
-            return Err(DownloadError::Other(anyhow!(
-                "head_object doesn't contain last_modified or content_length"
-            )))?;
-        };
-        Ok(ListingObject {
-            key: key.to_owned(),
-            last_modified: SystemTime::try_from(last_modified).map_err(|e| {
-                DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
-            })?,
-            size: size as u64,
-        })
-    }
-
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -30,7 +30,6 @@ pub struct UnreliableWrapper {
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
    ListPrefixes(Option<RemotePath>),
-    HeadObject(RemotePath),
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
@@ -138,16 +137,6 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<crate::ListingObject, DownloadError> {
-        self.attempt(RemoteOp::HeadObject(key.clone()))
-            .map_err(DownloadError::Other)?;
-        self.inner.head_object(key, cancel).await
-    }
-
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,3 +9,5 @@ serde.workspace = true
 serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -9,3 +9,5 @@ license.workspace = true
 anyhow.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -39,7 +39,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit = { workspace = true, features = ["serde"] }
+toml_edit.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,6 +54,7 @@ walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
 metrics.workspace = true
+workspace_hack.workspace = true

 const_format.workspace = true

@@ -70,7 +71,6 @@ criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
 serde_assert.workspace = true
-tokio = { workspace = true, features = ["test-util"] }

 [[bench]]
 name = "benchmarks"
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -9,6 +9,8 @@ anyhow.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true

+workspace_hack.workspace = true
+
 [build-dependencies]
 anyhow.workspace = true
 bindgen.workspace = true
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
        .allowlist_var("ERROR")
        .allowlist_var("FATAL")
        .allowlist_var("PANIC")
-        .allowlist_var("PG_VERSION_NUM")
        .allowlist_var("WPEVENT")
        .allowlist_var("WL_LATCH_SET")
        .allowlist_var("WL_SOCKET_READABLE")
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -282,11 +282,7 @@ mod tests {
    use std::cell::UnsafeCell;
    use utils::id::TenantTimelineId;

-    use crate::{
-        api_bindings::Level,
-        bindings::{NeonWALReadResult, PG_VERSION_NUM},
-        walproposer::Wrapper,
-    };
+    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};

    use super::ApiImpl;

@@ -493,79 +489,41 @@ mod tests {

        let (sender, receiver) = sync_channel(1);

-        // Messages definitions are at walproposer.h
-        // xxx: it would be better to extract them from safekeeper crate and
-        // use serialization/deserialization here.
-        let greeting_tag = (b'g' as u64).to_ne_bytes();
-        let proto_version = 2_u32.to_ne_bytes();
-        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let system_id = 0_u64.to_ne_bytes();
-        let tenant_id = ttid.tenant_id.as_arr();
-        let timeline_id = ttid.timeline_id.as_arr();
-        let pg_tli = 1_u32.to_ne_bytes();
-        let wal_seg_size = 16777216_u32.to_ne_bytes();
-        let proposer_greeting = [
-            greeting_tag.as_slice(),
-            proto_version.as_slice(),
-            pg_version.as_slice(),
-            proposer_id.as_slice(),
-            system_id.as_slice(),
-            tenant_id.as_slice(),
-            timeline_id.as_slice(),
-            pg_tli.as_slice(),
-            wal_seg_size.as_slice(),
-        ]
-        .concat();
-
-        let voting_tag = (b'v' as u64).to_ne_bytes();
-        let vote_request_term = 3_u64.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let vote_request = [
-            voting_tag.as_slice(),
-            vote_request_term.as_slice(),
-            proposer_id.as_slice(),
-        ]
-        .concat();
-
-        let acceptor_greeting_term = 2_u64.to_ne_bytes();
-        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
-        let acceptor_greeting = [
-            greeting_tag.as_slice(),
-            acceptor_greeting_term.as_slice(),
-            acceptor_greeting_node_id.as_slice(),
-        ]
-        .concat();
-
-        let vote_response_term = 3_u64.to_ne_bytes();
-        let vote_given = 1_u64.to_ne_bytes();
-        let flush_lsn = 0x539_u64.to_ne_bytes();
-        let truncate_lsn = 0x539_u64.to_ne_bytes();
-        let th_len = 1_u32.to_ne_bytes();
-        let th_term = 2_u64.to_ne_bytes();
-        let th_lsn = 0x539_u64.to_ne_bytes();
-        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
-        let vote_response = [
-            voting_tag.as_slice(),
-            vote_response_term.as_slice(),
-            vote_given.as_slice(),
-            flush_lsn.as_slice(),
-            truncate_lsn.as_slice(),
-            th_len.as_slice(),
-            th_term.as_slice(),
-            th_lsn.as_slice(),
-            timeline_start_lsn.as_slice(),
-        ]
-        .concat();
-
        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
            wait_events: Cell::new(WaitEventsData {
                sk: std::ptr::null_mut(),
                event_mask: 0,
            }),
-            expected_messages: vec![proposer_greeting, vote_request],
+            expected_messages: vec![
+                // TODO: When updating Postgres versions, this test will cause
+                // problems. Postgres version in message needs updating.
+                //
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
            expected_ptr: AtomicUsize::new(0),
-            safekeeper_replies: vec![acceptor_greeting, vote_response],
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
            replies_ptr: AtomicUsize::new(0),
            sync_channel: sender,
            shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -4,13 +4,12 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
-    config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
+    config::PageServerConf,
    context::{DownloadBehavior, RequestContext},
    l0_flush::{L0FlushConfig, L0FlushGlobalState},
    page_cache,
    repository::Value,
    task_mgr::TaskKind,
-    tenant::storage_layer::inmemory_layer::SerializedBatch,
    tenant::storage_layer::InMemoryLayer,
    virtual_file,
 };
@@ -68,16 +67,12 @@ async fn ingest(
    let layer =
        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;

-    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
-    let data_ser_size = data.serialized_size().unwrap() as usize;
+    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
    let ctx = RequestContext::new(
        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
        pageserver::context::DownloadBehavior::Download,
    );

-    const BATCH_SIZE: usize = 16;
-    let mut batch = Vec::new();
-
    for i in 0..put_count {
        lsn += put_size as u64;

@@ -100,17 +95,7 @@ async fn ingest(
            }
        }

-        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
-        if batch.len() >= BATCH_SIZE {
-            let this_batch = std::mem::take(&mut batch);
-            let serialized = SerializedBatch::from_values(this_batch);
-            layer.put_batch(serialized, &ctx).await?;
-        }
-    }
-    if !batch.is_empty() {
-        let this_batch = std::mem::take(&mut batch);
-        let serialized = SerializedBatch::from_values(this_batch);
-        layer.put_batch(serialized, &ctx).await?;
+        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
    }
    layer.freeze(lsn + 1).await;

@@ -164,11 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
    page_cache::init(conf.page_cache_size);

    {
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -506,16 +506,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    /// Configs io buffer alignment at runtime.
-    pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
-        let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
-        self.request(Method::PUT, uri, align)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -4,7 +4,6 @@

 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -145,11 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
@@ -60,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
@@ -190,11 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                DEFAULT_IO_BUFFER_ALIGNMENT,
-            );
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -20,7 +20,6 @@ use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
-    config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
    context::{DownloadBehavior, RequestContext},
    page_cache,
    task_mgr::TaskKind,
@@ -206,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -58,11 +58,6 @@ pub(crate) struct Args {
    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
    #[clap(long)]
    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
-
-    /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
-    #[clap(long)]
-    set_io_alignment: Option<usize>,
-
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -129,10 +124,6 @@ async fn main_impl(
        mgmt_api_client.put_io_engine(engine_str).await?;
    }

-    if let Some(align) = args.set_io_alignment {
-        mgmt_api_client.put_io_alignment(align).await?;
-    }
-
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
-    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -137,11 +136,7 @@ fn main() -> anyhow::Result<()> {
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        conf.max_file_descriptors,
-        conf.virtual_file_io_engine,
-        conf.io_buffer_alignment,
-    );
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -50,6 +50,7 @@ pub mod defaults {
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
+    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -89,14 +90,13 @@ pub mod defaults {

    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

-    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Disabled;

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

-    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0;
-
    ///
    /// Default built-in configuration file.
    ///
@@ -291,8 +291,6 @@ pub struct PageServerConf {

    /// Direct IO settings
    pub virtual_file_direct_io: virtual_file::DirectIoMode,
-
-    pub io_buffer_alignment: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -397,8 +395,6 @@ struct PageServerConfigBuilder {
    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,

    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
-
-    io_buffer_alignment: BuilderValue<usize>,
 }

 impl PageServerConfigBuilder {
@@ -482,12 +478,11 @@ impl PageServerConfigBuilder {
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
-            io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
        }
    }
 }
@@ -667,10 +662,6 @@ impl PageServerConfigBuilder {
        self.virtual_file_direct_io = BuilderValue::Set(value);
    }

-    pub fn io_buffer_alignment(&mut self, value: usize) {
-        self.io_buffer_alignment = BuilderValue::Set(value);
-    }
-
    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -727,7 +718,6 @@ impl PageServerConfigBuilder {
                l0_flush,
                compact_level0_phase1_value_access,
                virtual_file_direct_io,
-                io_buffer_alignment,
            }
            CUSTOM LOGIC
            {
@@ -997,9 +987,6 @@ impl PageServerConf {
                "virtual_file_direct_io" => {
                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
                }
-                "io_buffer_alignment" => {
-                    builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1078,12 +1065,11 @@ impl PageServerConf {
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-            io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
        }
    }
 }
@@ -1319,12 +1305,11 @@ background_task_maximum_delay = '334 s'
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1393,12 +1378,11 @@ background_task_maximum_delay = '334 s'
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1706,6 +1706,11 @@ async fn timeline_compact_handler(
        flags |= CompactFlags::ForceImageLayerCreation;
    }
    if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
+        if !cfg!(feature = "testing") {
+            return Err(ApiError::InternalServerError(anyhow!(
+                "enhanced_gc_bottom_most_compaction is only available in testing mode"
+            )));
+        }
        flags |= CompactFlags::EnhancedGcBottomMostCompaction;
    }
    let wait_until_uploaded =
@@ -2325,20 +2330,6 @@ async fn put_io_engine_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_io_alignment_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let align: usize = json_request(&mut r).await?;
-    crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
-        ApiError::PreconditionFailed(
-            format!("Requested io alignment ({align}) is not a power of two").into(),
-        )
-    })?;
-    json_response(StatusCode::OK, ())
-}
-
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -2951,7 +2942,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| api_handler(r, timeline_compact_handler),
+            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
@@ -3026,9 +3017,6 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
-        .put("/v1/io_alignment", |r| {
-            api_handler(r, put_io_alignment_handler)
-        })
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,10 +1,15 @@
 use std::{num::NonZeroUsize, sync::Arc};

+use crate::tenant::ephemeral_file;
+
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
+    Direct {
+        max_concurrency: NonZeroUsize,
+    },
 }

 impl Default for L0FlushConfig {
@@ -20,12 +25,14 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);

 pub enum Inner {
+    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }

 impl L0FlushGlobalState {
    pub fn new(config: L0FlushConfig) -> Self {
        match config {
+            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
            L0FlushConfig::Direct { max_concurrency } => {
                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                Self(Arc::new(Inner::Direct { semaphore }))
@@ -37,3 +44,13 @@ impl L0FlushGlobalState {
        &self.0
    }
 }
+
+impl L0FlushConfig {
+    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
+        use L0FlushConfig::*;
+        match self {
+            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
+            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
+        }
+    }
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -88,8 +88,6 @@ pub async fn shutdown_pageserver(
 ) {
    use std::time::Duration;

-    let started_at = std::time::Instant::now();
-
    // If the orderly shutdown below takes too long, we still want to make
    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
    //
@@ -243,10 +241,7 @@ pub async fn shutdown_pageserver(
    walredo_extraordinary_shutdown_thread.join().unwrap();
    info!("walredo_extraordinary_shutdown_thread done");

-    info!(
-        elapsed_ms = started_at.elapsed().as_millis(),
-        "Shut down successfully completed"
-    );
+    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,23 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_utilization_score",
-        "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_secondary_heatmap_total_size",
-        "The total size in bytes of all layers in the most recently downloaded heatmap.",
-        &["tenant_id", "shard_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1870,64 +1853,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub struct BackgroundLoopSemaphoreMetrics {
-    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
-    durations: EnumMap<BackgroundLoopKind, Counter>,
-}
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
-    || {
-        let counters = register_int_counter_pair_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap();
-
-        let durations = register_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_duration_seconds",
-            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
-            &["task"],
-        )
-        .unwrap();
-
-        BackgroundLoopSemaphoreMetrics {
-            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-                counters.with_label_values(&[kind.into()])
-            })),
-            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-                durations.with_label_values(&[kind.into()])
-            })),
-        }
-    },
-);
-
-impl BackgroundLoopSemaphoreMetrics {
-    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
-        struct Record<'a> {
-            metrics: &'a BackgroundLoopSemaphoreMetrics,
-            task: BackgroundLoopKind,
-            _counter_guard: metrics::IntCounterPairGuard,
-            start: Instant,
-        }
-        impl Drop for Record<'_> {
-            fn drop(&mut self) {
-                let elapsed = self.start.elapsed().as_secs_f64();
-                self.metrics.durations[self.task].inc_by(elapsed);
-            }
-        }
-        Record {
-            metrics: self,
-            task,
-            _counter_guard: self.counters[task].guard(),
-            start: Instant::now(),
-        }
-    }
-}
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_background_loop_semaphore_wait_start_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls started",
+        "pageserver_background_loop_semaphore_wait_finish_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+        &["task"],
+    )
+    .unwrap()
+});

 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
@@ -2609,7 +2544,6 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
-use crate::tenant::tasks::BackgroundLoopKind;

 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,11 +15,12 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
+use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -172,7 +174,6 @@ impl Timeline {
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
-            pending_bytes: 0,
            lsn,
        }
    }
@@ -726,17 +727,7 @@ impl Timeline {
    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
        let current_policy = self.last_aux_file_policy.load();
        match current_policy {
-            Some(AuxFilePolicy::V1) => {
-                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
-                self.list_aux_files_v1(lsn, ctx).await
-            }
-            None => {
-                let res = self.list_aux_files_v1(lsn, ctx).await?;
-                if !res.is_empty() {
-                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
-                }
-                Ok(res)
-            }
+            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
            Some(AuxFilePolicy::CrossValidation) => {
                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1031,33 +1022,21 @@ pub struct DatadirModification<'a> {
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
+    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
-
-    /// An **approximation** of how large our EphemeralFile write will be when committed.
-    pending_bytes: usize,
 }

 impl<'a> DatadirModification<'a> {
-    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
-    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
-    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
-    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
-
    /// Get the current lsn
    pub(crate) fn get_lsn(&self) -> Lsn {
        self.lsn
    }

-    pub(crate) fn approx_pending_bytes(&self) -> usize {
-        self.pending_bytes
-    }
-
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1597,7 +1576,6 @@ impl<'a> DatadirModification<'a> {
                if aux_files_key_v1.is_empty() {
                    None
                } else {
-                    warn!("this timeline is using deprecated aux file policy V1");
                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                    Some(AuxFilePolicy::V1)
                }
@@ -1791,25 +1769,21 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
-            let mut write_batch = Vec::new();
-            for (lsn, value_ser_size, value) in values {
+            for (lsn, value) in values {
                if key.is_rel_block_key() || key.is_slru_block_key() {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
-                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
+                    writer.put(key, lsn, &value, ctx).await?;
                } else {
-                    retained_pending_updates.entry(key).or_default().push((
-                        lsn,
-                        value_ser_size,
-                        value,
-                    ));
+                    retained_pending_updates
+                        .entry(key)
+                        .or_default()
+                        .push((lsn, value));
                }
            }
-            writer.put_batch(write_batch, ctx).await?;
        }

        self.pending_updates = retained_pending_updates;
-        self.pending_bytes = 0;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1835,20 +1809,17 @@ impl<'a> DatadirModification<'a> {
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            // Ordering: the items in this batch do not need to be in any global order, but values for
-            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-            // this to do efficient updates to its index.
-            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
-                .pending_updates
-                .drain()
-                .flat_map(|(key, values)| {
-                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        (key.to_compact(), lsn, val_ser_size, value)
-                    })
-                })
-                .collect::<Vec<_>>();
+            // The put_batch call below expects the inputs to be sorted by Lsn,
+            // so we do that first.
+            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
+                self.pending_updates
+                    .drain()
+                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
+                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
+                VecMapOrdering::GreaterOrEqual,
+            );

-            writer.put_batch(batch, ctx).await?;
+            writer.put_batch(lsn_ordered_batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1873,8 +1844,6 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

-        self.pending_bytes = 0;
-
        Ok(())
    }

@@ -1891,7 +1860,7 @@ impl<'a> DatadirModification<'a> {
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, _, value)) = values.last() {
+            if let Some((_, value)) = values.last() {
                return if let Value::Image(img) = value {
                    Ok(img.clone())
                } else {
@@ -1919,17 +1888,13 @@ impl<'a> DatadirModification<'a> {
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
-                *last_value_ser_size = val.serialized_size().unwrap() as usize;
                *last_value = val;
                return;
            }
        }
-
-        let val_serialized_size = val.serialized_size().unwrap() as usize;
-        self.pending_bytes += val_serialized_size;
-        values.push((self.lsn, val_serialized_size, val));
+        values.push((self.lsn, val));
    }

    fn delete(&mut self, key_range: Range<Key>) {
@@ -2059,7 +2024,7 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();

--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -393,7 +393,7 @@ struct PageServerTask {

    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: TenantShardId,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,

    mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
 pub fn spawn<F>(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
-    tenant_shard_id: TenantShardId,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
    future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
@@ -573,8 +573,13 @@ pub async fn shutdown_tasks(
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
-                // warn to catch these in tests; there shouldn't be any
-                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                if tenant_shard_id.is_none() {
+                    // there are quite few of these
+                    info!(name = task.name, kind = ?task_kind, "stopping global task");
+                } else {
+                    // warn to catch these in tests; there shouldn't be any
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                }
            }
            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
                .await
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -798,7 +798,7 @@ impl Tenant {
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            TaskKind::Attach,
-            tenant_shard_id,
+            Some(tenant_shard_id),
            None,
            "attach tenant",
            async move {
@@ -3741,21 +3741,13 @@ impl Tenant {
    /// less than this (via eviction and on-demand downloads), but this function enables
    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
-    ///
-    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
-    /// than they report here, due to layer eviction.  Tenants with many active branches may
-    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
+        let mut wanted = 0;
        let timelines = self.timelines.lock().unwrap();
-
-        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
-        // reflects the observation that on tenants with multiple large branches, typically only one
-        // of them is used actively enough to occupy space on disk.
-        timelines
-            .values()
-            .map(|t| t.metrics.visible_physical_size_gauge.get())
-            .max()
-            .unwrap_or(0)
+        for timeline in timelines.values() {
+            wanted += timeline.metrics.visible_physical_size_gauge.get();
+        }
+        wanted
    }
 }

@@ -5940,10 +5932,10 @@ mod tests {
            .await
            .unwrap();

-        // the default aux file policy to switch is v2 if not set by the admins
+        // the default aux file policy to switch is v1 if not set by the admins
        assert_eq!(
            harness.tenant_conf.switch_aux_file_policy,
-            AuxFilePolicy::default_tenant_config()
+            AuxFilePolicy::V1
        );
        let (tenant, ctx) = harness.load().await;

@@ -5987,8 +5979,8 @@ mod tests {
        );
        assert_eq!(
            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
+            Some(AuxFilePolicy::V1),
+            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
        );

        // we can read everything from the storage
@@ -6010,8 +6002,8 @@ mod tests {

        assert_eq!(
            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "keep v2 storage format when new files are written"
+            Some(AuxFilePolicy::V1),
+            "keep v1 storage format when new files are written"
        );

        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6027,7 +6019,7 @@ mod tests {

        // child copies the last flag even if that is not on remote storage yet
        assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));

        let files = child.list_aux_files(lsn, &ctx).await.unwrap();
        assert_eq!(files.get("pg_logical/mappings/test1"), None);
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,6 +21,7 @@ pub struct EphemeralFile {
 }

 mod page_caching;
+pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;

 impl EphemeralFile {
@@ -51,10 +52,12 @@ impl EphemeralFile {
        )
        .await?;

+        let prewarm = conf.l0_flush.prewarm_on_write();
+
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, gate_guard),
+            rw: page_caching::RW::new(file, prewarm, gate_guard),
        })
    }

@@ -79,8 +82,6 @@ impl EphemeralFile {
        self.rw.read_blk(blknum, ctx).await
    }

-    #[cfg(test)]
-    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
    pub(crate) async fn write_blob(
        &mut self,
        srcbuf: &[u8],
@@ -88,30 +89,17 @@ impl EphemeralFile {
    ) -> Result<u64, io::Error> {
        let pos = self.rw.bytes_written();

-        let mut len_bytes = std::io::Cursor::new(Vec::new());
-        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
-            srcbuf.len(),
-            &mut len_bytes,
-        );
-        let len_bytes = len_bytes.into_inner();
-
        // Write the length field
-        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+        if srcbuf.len() < 0x80 {
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];

-        // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
-
-        Ok(pos)
-    }
-
-    /// Returns the offset at which the first byte of the input was written, for use
-    /// in constructing indices over the written value.
-    pub(crate) async fn write_raw(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+        } else {
+            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
+            len_buf[0] |= 0x80;
+            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+        }

        // Write the payload
        self.rw.write_all_borrowed(srcbuf, ctx).await?;
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,15 +1,15 @@
 //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
 //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-//!
-//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>

 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
 use crate::virtual_file::VirtualFile;

-use std::io::{self};
+use once_cell::sync::Lazy;
+use std::io::{self, ErrorKind};
+use std::ops::{Deref, Range};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;

@@ -18,17 +18,33 @@ use super::zero_padded_read_write;
 /// See module-level comment.
 pub struct RW {
    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
+    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
    _gate_guard: utils::sync::gate::GateGuard,
 }

+/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
+/// should we pre-warm the [`crate::page_cache`] with the contents?
+#[derive(Clone, Copy)]
+pub enum PrewarmOnWrite {
+    Yes,
+    No,
+}
+
 impl RW {
-    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
+    pub fn new(
+        file: VirtualFile,
+        prewarm_on_write: PrewarmOnWrite,
+        _gate_guard: utils::sync::gate::GateGuard,
+    ) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
+            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
+                page_cache_file_id,
+                file,
+                prewarm_on_write,
+            )),
            _gate_guard,
        }
    }
@@ -68,10 +84,10 @@ impl RW {
        let vec = Vec::with_capacity(size);

        // read from disk what we've already flushed
-        let file_size_tracking_writer = self.rw.as_writer();
-        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
-        let mut vec = file_size_tracking_writer
-            .as_inner()
+        let writer = self.rw.as_writer();
+        let flushed_range = writer.written_range();
+        let mut vec = writer
+            .file
            .read_exact_at(
                vec.slice(0..(flushed_range.end - flushed_range.start)),
                u64::try_from(flushed_range.start).unwrap(),
@@ -106,7 +122,7 @@ impl RW {
                            format!(
                                "ephemeral file: read immutable page #{}: {}: {:#}",
                                blknum,
-                                self.rw.as_writer().as_inner().path,
+                                self.rw.as_writer().file.path,
                                e,
                            ),
                        )
@@ -116,7 +132,7 @@ impl RW {
                    }
                    page_cache::ReadBufResult::NotFound(write_guard) => {
                        let write_guard = writer
-                            .as_inner()
+                            .file
                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                            .await?;
                        let read_guard = write_guard.mark_valid();
@@ -138,16 +154,137 @@ impl Drop for RW {

        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = &self.rw.as_writer().as_inner().path;
-        let res = std::fs::remove_file(path);
+        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
                // just never log the not found errors, we cannot do anything for them; on detach
                // the tenant directory is already gone.
                //
                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.rw.as_writer().file.path,
+                    e
+                );
            }
        }
    }
 }
+
+struct PreWarmingWriter {
+    prewarm_on_write: PrewarmOnWrite,
+    nwritten_blocks: u32,
+    page_cache_file_id: page_cache::FileId,
+    file: VirtualFile,
+}
+
+impl PreWarmingWriter {
+    fn new(
+        page_cache_file_id: page_cache::FileId,
+        file: VirtualFile,
+        prewarm_on_write: PrewarmOnWrite,
+    ) -> Self {
+        Self {
+            prewarm_on_write,
+            nwritten_blocks: 0,
+            page_cache_file_id,
+            file,
+        }
+    }
+
+    /// Return the byte range within `file` that has been written through `write_all`.
+    ///
+    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
+    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
+        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
+        struct Wrapper(Range<usize>);
+        impl Deref for Wrapper {
+            type Target = Range<usize>;
+            fn deref(&self) -> &Range<usize> {
+                &self.0
+            }
+        }
+        Wrapper(0..nwritten_blocks * PAGE_SZ)
+    }
+}
+
+impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
+    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
+        &mut self,
+        buf: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
+        let buflen = buf.len();
+        assert_eq!(
+            buflen % PAGE_SZ,
+            0,
+            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
+        );
+
+        // Do the IO.
+        let buf = match self.file.write_all(buf, ctx).await {
+            (buf, Ok(nwritten)) => {
+                assert_eq!(nwritten, buflen);
+                buf
+            }
+            (_, Err(e)) => {
+                return Err(std::io::Error::new(
+                    ErrorKind::Other,
+                    // order error before path because path is long and error is short
+                    format!(
+                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
+                        self.nwritten_blocks, buflen, e, self.file.path,
+                    ),
+                ));
+            }
+        };
+
+        let nblocks = buflen / PAGE_SZ;
+        let nblocks32 = u32::try_from(nblocks).unwrap();
+
+        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
+            // Pre-warm page cache with the contents.
+            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
+            // benefits the code that writes InMemoryLayer=>L0 layers.
+
+            let cache = page_cache::get();
+            static CTX: Lazy<RequestContext> = Lazy::new(|| {
+                RequestContext::new(
+                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
+                    crate::context::DownloadBehavior::Error,
+                )
+            });
+            for blknum_in_buffer in 0..nblocks {
+                let blk_in_buffer =
+                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
+                let blknum = self
+                    .nwritten_blocks
+                    .checked_add(blknum_in_buffer as u32)
+                    .unwrap();
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
+                    .await
+                {
+                    Err(e) => {
+                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                    }
+                    Ok(v) => match v {
+                        page_cache::ReadBufResult::Found(_guard) => {
+                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
+                                      and this function takes &mut self, so, no concurrent read_blk is possible");
+                        }
+                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                            write_guard.copy_from_slice(blk_in_buffer);
+                            let _ = write_guard.mark_valid();
+                        }
+                    },
+                }
+            }
+        }
+
+        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
+        Ok((buflen, buf))
+    }
+}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -464,7 +464,7 @@ impl LayerMap {
    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
+        if Self::is_l0(&layer_desc.key_range) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

@@ -483,7 +483,7 @@ impl LayerMap {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
+        if Self::is_l0(&layer_desc.key_range) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
    }

    /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
-        is_delta_layer && key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(key_range: &Range<Key>) -> bool {
+        key_range == &(Key::MIN..Key::MAX)
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
    ///      than just the current partition_range.
    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
-        if !Self::is_l0(&layer.key_range, layer.is_delta) {
+        if !Self::is_l0(&layer.key_range) {
            return true;
        }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -565,7 +565,7 @@ mod tests {
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
-            74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
+            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 16, // pg_version (4 bytes)
+            0, 0, 0, 15, // pg_version (4 bytes)
            /* padding bytes */
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                self.tenant_shard_id,
+                Some(self.tenant_shard_id),
                Some(self.timeline_id),
                "remote upload",
                async move {
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,7 +8,6 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
    context::RequestContext,
    disk_usage_eviction_task::DiskUsageEvictionInfo,
-    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };

@@ -106,9 +105,6 @@ pub(crate) struct SecondaryTenant {

    // Sum of layer sizes on local disk
    pub(super) resident_size_metric: UIntGauge,
-
-    // Sum of layer sizes in the most recently downloaded heatmap
-    pub(super) heatmap_total_size_metric: UIntGauge,
 }

 impl Drop for SecondaryTenant {
@@ -116,7 +112,6 @@ impl Drop for SecondaryTenant {
        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
    }
 }

@@ -133,10 +128,6 @@ impl SecondaryTenant {
            .get_metric_with_label_values(&[&tenant_id, &shard_id])
            .unwrap();

-        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id])
-            .unwrap();
-
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -154,7 +145,6 @@ impl SecondaryTenant {
            progress: std::sync::Mutex::default(),

            resident_size_metric,
-            heatmap_total_size_metric,
        })
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,12 +829,6 @@ impl<'a> TenantDownloader<'a> {
            layers_downloaded: 0,
            bytes_downloaded: 0,
        };
-
-        // Also expose heatmap bytes_total as a metric
-        self.secondary_state
-            .heatmap_total_size_metric
-            .set(heatmap_stats.bytes);
-
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
-    pub(crate) timeline_id: TimelineId,
+    pub(super) timeline_id: TimelineId,

-    pub(crate) layers: Vec<HeatMapLayer>,
+    pub(super) layers: Vec<HeatMapLayer>,
 }

 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(crate) name: LayerName,
-    pub(crate) metadata: LayerFileMetadata,
+    pub(super) name: LayerName,
+    pub(super) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,7 +2,7 @@

 pub mod delta_layer;
 pub mod image_layer;
-pub mod inmemory_layer;
+pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadCoalesceMode, VectoredReadPlanner,
+    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
@@ -232,18 +232,6 @@ pub struct DeltaLayerInner {
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

-impl DeltaLayerInner {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        format!(
-            "delta {}..{} {}..{}",
-            self.key_range().start,
-            self.key_range().end,
-            self.lsn_range().start,
-            self.lsn_range().end
-        )
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -1207,7 +1195,6 @@ impl DeltaLayerInner {
        let mut prev: Option<(Key, Lsn, BlobRef)> = None;

        let mut read_builder: Option<VectoredReadBuilder> = None;
-        let read_mode = VectoredReadCoalesceMode::get();

        let max_read_size = self
            .max_vectored_read_bytes
@@ -1256,7 +1243,6 @@ impl DeltaLayerInner {
                        offsets.end.pos(),
                        meta,
                        max_read_size,
-                        read_mode,
                    ))
                }
            } else {
@@ -1541,10 +1527,6 @@ pub struct DeltaLayerIterator<'a> {
 }

 impl<'a> DeltaLayerIterator<'a> {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        self.delta_layer.layer_dbg_info()
-    }
-
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
        assert!(self.key_values_batch.is_empty());
@@ -2285,7 +2267,7 @@ pub(crate) mod test {
            .await
            .unwrap();
        let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
-        for max_read_size in [1, 2048] {
+        for max_read_size in [1, 1024] {
            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                // Test if the batch size is correctly determined
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -167,17 +167,6 @@ pub struct ImageLayerInner {
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

-impl ImageLayerInner {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        format!(
-            "image {}..{} {}",
-            self.key_range().start,
-            self.key_range().end,
-            self.lsn()
-        )
-    }
-}
-
 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
@@ -1035,10 +1024,6 @@ pub struct ImageLayerIterator<'a> {
 }

 impl<'a> ImageLayerIterator<'a> {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        self.image_layer.layer_dbg_info()
-    }
-
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
        assert!(self.key_values_batch.is_empty());
@@ -1376,7 +1361,7 @@ mod test {
                .await
                .unwrap();
        let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
-        for max_read_size in [1, 2048] {
+        for max_read_size in [1, 1024] {
            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                // Test if the batch size is correctly determined
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache};
+use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -33,7 +33,7 @@ use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
-use tokio::sync::RwLock;
+use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
@@ -249,7 +249,9 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        let inner = self.inner.read().await;
+
        let end_str = self.end_lsn_or_max();

        println!(
@@ -257,6 +259,39 @@ impl InMemoryLayer {
            self.timeline_id, self.start_lsn, end_str,
        );

+        if !verbose {
+            return Ok(());
+        }
+
+        let cursor = inner.file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, vec_map) in inner.index.iter() {
+            for (lsn, pos) in vec_map.as_slice() {
+                let mut desc = String::new();
+                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                let val = Value::des(&buf);
+                match val {
+                    Ok(Value::Image(img)) => {
+                        write!(&mut desc, " img {} bytes", img.len())?;
+                    }
+                    Ok(Value::WalRecord(rec)) => {
+                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
+                        write!(
+                            &mut desc,
+                            " rec {} bytes will_init: {} {}",
+                            buf.len(),
+                            rec.will_init(),
+                            wal_desc
+                        )?;
+                    }
+                    Err(err) => {
+                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
+                    }
+                }
+                println!("  key {} at {}: {}", key, lsn, desc);
+            }
+        }
+
        Ok(())
    }

@@ -320,82 +355,6 @@ impl InMemoryLayer {
    }
 }

-/// Offset of a particular Value within a serialized batch.
-struct SerializedBatchOffset {
-    key: CompactKey,
-    lsn: Lsn,
-    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
-    offset: u64,
-}
-
-pub struct SerializedBatch {
-    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
-    pub(crate) raw: Vec<u8>,
-
-    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
-    offsets: Vec<SerializedBatchOffset>,
-
-    /// The highest LSN of any value in the batch
-    pub(crate) max_lsn: Lsn,
-}
-
-impl SerializedBatch {
-    /// Write a blob length in the internal format of the EphemeralFile
-    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
-        use std::io::Write;
-
-        if len < 0x80 {
-            // short one-byte length header
-            let len_buf = [len as u8];
-
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        } else {
-            let mut len_buf = u32::to_be_bytes(len as u32);
-            len_buf[0] |= 0x80;
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        }
-    }
-
-    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
-        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
-        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
-        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
-        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
-
-        let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
-        let mut max_lsn: Lsn = Lsn(0);
-        for (key, lsn, val_ser_size, val) in batch {
-            let relative_off = cursor.position();
-
-            Self::write_blob_length(val_ser_size, &mut cursor);
-            val.ser_into(&mut cursor)
-                .expect("Writing into in-memory buffer is infallible");
-
-            offsets.push(SerializedBatchOffset {
-                key,
-                lsn,
-                offset: relative_off,
-            });
-            max_lsn = std::cmp::max(max_lsn, lsn);
-        }
-
-        let buffer = cursor.into_inner();
-
-        // Assert that we didn't do any extra allocations while building buffer.
-        debug_assert!(buffer.len() <= buffer_size);
-
-        Self {
-            raw: buffer,
-            offsets,
-            max_lsn,
-        }
-    }
-}
-
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -456,20 +415,37 @@ impl InMemoryLayer {
        })
    }

-    // Write path.
-    pub async fn put_batch(
+    // Write operations
+
+    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
+    /// Adds the page version to the in-memory tree
+    pub async fn put_value(
        &self,
-        serialized_batch: SerializedBatch,
+        key: CompactKey,
+        lsn: Lsn,
+        buf: &[u8],
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
        self.assert_writable();
+        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+    }

-        let base_off = {
-            inner
+    async fn put_value_locked(
+        &self,
+        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
+        key: CompactKey,
+        lsn: Lsn,
+        buf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+
+        let off = {
+            locked_inner
                .file
-                .write_raw(
-                    &serialized_batch.raw,
+                .write_blob(
+                    buf,
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::InMemoryLayer)
                        .build(),
@@ -477,23 +453,15 @@ impl InMemoryLayer {
                .await?
        };

-        for SerializedBatchOffset {
-            key,
-            lsn,
-            offset: relative_off,
-        } in serialized_batch.offsets
-        {
-            let off = base_off + relative_off;
-            let vec_map = inner.index.entry(key).or_default();
-            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-            if old.is_some() {
-                // We already had an entry for this LSN. That's odd..
-                warn!("Key {} at {} already exists", key, lsn);
-            }
+        let vec_map = locked_inner.index.entry(key).or_default();
+        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+        if old.is_some() {
+            // We already had an entry for this LSN. That's odd..
+            warn!("Key {} at {} already exists", key, lsn);
        }

-        let size = inner.file.len();
-        inner.resource_units.maybe_publish_size(size);
+        let size = locked_inner.file.len();
+        locked_inner.resource_units.maybe_publish_size(size);

        Ok(())
    }
@@ -568,6 +536,7 @@ impl InMemoryLayer {

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
+            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };

@@ -599,6 +568,34 @@ impl InMemoryLayer {
        .await?;

        match l0_flush_global_state {
+            l0_flush::Inner::PageCached => {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::InMemoryLayer)
+                    .build();
+
+                let mut buf = Vec::new();
+
+                let cursor = inner.file.block_cursor();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let (tmp, res) = delta_layer_writer
+                            .put_value_bytes(
+                                Key::from_compact(*key),
+                                *lsn,
+                                buf.slice_len(),
+                                will_init,
+                                &ctx,
+                            )
+                            .await;
+                        res?;
+                        buf = tmp.into_raw_slice().into_inner();
+                    }
+                }
+            }
            l0_flush::Inner::Direct { .. } => {
                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
                assert_eq!(
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1296,10 +1296,7 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(
-                    &self.layer_desc().key_range,
-                    self.layer_desc().is_delta,
-                ),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -256,10 +256,6 @@ impl LayerName {
            LayerName::Delta(layer) => &layer.key_range,
        }
    }
-
-    pub fn is_delta(&self) -> bool {
-        matches!(self, LayerName::Delta(_))
-    }
 }

 impl fmt::Display for LayerName {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -3,7 +3,6 @@ use std::{
    collections::{binary_heap, BinaryHeap},
 };

-use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;

@@ -27,13 +26,6 @@ impl<'a> LayerRef<'a> {
            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
        }
    }
-
-    fn layer_dbg_info(&self) -> String {
-        match self {
-            Self::Image(x) => x.layer_dbg_info(),
-            Self::Delta(x) => x.layer_dbg_info(),
-        }
-    }
 }

 enum LayerIterRef<'a> {
@@ -48,13 +40,6 @@ impl LayerIterRef<'_> {
            Self::Image(x) => x.next().await,
        }
    }
-
-    fn layer_dbg_info(&self) -> String {
-        match self {
-            Self::Image(x) => x.layer_dbg_info(),
-            Self::Delta(x) => x.layer_dbg_info(),
-        }
-    }
 }

 /// This type plays several roles at once
@@ -90,11 +75,6 @@ impl<'a> PeekableLayerIterRef<'a> {
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let result = self.peeked.take();
        self.peeked = self.iter.next().await?;
-        if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
-            if (k1, l1) < (k2, l2) {
-                bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
-            }
-        }
        Ok(result)
    }
 }
@@ -198,12 +178,7 @@ impl<'a> IteratorWrapper<'a> {
        let iter = PeekableLayerIterRef::create(iter).await?;
        if let Some((k1, l1, _)) = iter.peek() {
            let (k2, l2) = first_key_lower_bound;
-            if (k1, l1) < (k2, l2) {
-                bail!(
-                    "layer key range did not include the first key in the layer: {}",
-                    layer.layer_dbg_info()
-                );
-            }
+            debug_assert!((k1, l1) >= (k2, l2));
        }
        *self = Self::Loaded { iter };
        Ok(())
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -208,8 +208,6 @@ impl SplitDeltaLayerWriter {

 #[cfg(test)]
 mod tests {
-    use rand::{RngCore, SeedableRng};
-
    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
@@ -231,10 +229,7 @@ mod tests {
    }

    fn get_large_img() -> Bytes {
-        let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
-        let mut data = vec![0; 8192];
-        rng.fill_bytes(&mut data);
-        data.into()
+        vec![0; 8192].into()
    }

    #[tokio::test]
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -61,12 +61,21 @@ impl BackgroundLoopKind {
    }
 }

+static PERMIT_GAUGES: once_cell::sync::Lazy<
+    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
+> = once_cell::sync::Lazy::new(|| {
+    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
+    }))
+});
+
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
+    let _guard = PERMIT_GAUGES[loop_kind].guard();

    pausable_failpoint!(
        "initial-size-calculation-permit-pause",
@@ -89,7 +98,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
        {
@@ -112,7 +121,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
        {
@@ -135,7 +144,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::IngestHousekeeping,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
        {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
-        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
-        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
+        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
+        NON_INHERITED_SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -44,8 +44,10 @@ use tokio::{
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
+    bin_ser::BeSer,
    fs_ext, pausable_failpoint,
    sync::gate::{Gate, GateGuard},
+    vec_map::VecMap,
 };

 use std::pin::pin;
@@ -135,10 +137,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::{
-    config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
-    upload_queue::NotInitialized,
-};
+use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -1646,20 +1645,6 @@ impl Timeline {
        self.last_record_lsn.shutdown();

        if try_freeze_and_flush {
-            if let Some((open, frozen)) = self
-                .layers
-                .read()
-                .await
-                .layer_map()
-                .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
-                .ok()
-                .filter(|(open, frozen)| *open || *frozen > 0)
-            {
-                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
-            } else {
-                // this is double-shutdown, ignore it
-            }
-
            // we shut down walreceiver above, so, we won't add anything more
            // to the InMemoryLayer; freeze it and wait for all frozen layers
            // to reach the disk & upload queue, then shut the upload queue and
@@ -2234,11 +2219,6 @@ impl Timeline {

                handles: Default::default(),
            };
-
-            if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1");
-            }
-
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

@@ -2287,7 +2267,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "layer flush task",
            async move {
@@ -2641,7 +2621,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "initial size calculation",
            // NB: don't log errors here, task_mgr will do that.
@@ -2809,7 +2789,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "ondemand logical size calculation",
            async move {
@@ -2983,7 +2963,11 @@ impl Timeline {
                LayerVisibilityHint::Visible => {
                    // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                    let last_activity_ts = layer.latest_activity();
-                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
+                    Some(HeatMapLayer::new(
+                        layer.layer_desc().layer_name(),
+                        layer.metadata(),
+                        last_activity_ts,
+                    ))
                }
                LayerVisibilityHint::Covered => {
                    // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -2992,26 +2976,7 @@ impl Timeline {
            }
        });

-        let mut layers = resident.collect::<Vec<_>>();
-
-        // Sort layers in order of which to download first.  For a large set of layers to download, we
-        // want to prioritize those layers which are most likely to still be in the resident many minutes
-        // or hours later:
-        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
-        //   only exist for a few minutes before being compacted into L1s.
-        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
-        //   the layer is likely to be covered by an image layer during compaction.
-        layers.sort_by_key(|(desc, _meta, _atime)| {
-            std::cmp::Reverse((
-                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
-                desc.lsn_range.end,
-            ))
-        });
-
-        let layers = layers
-            .into_iter()
-            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
-            .collect();
+        let layers = resident.collect();

        Some(HeatMapTimeline::new(self.timeline_id, layers))
    }
@@ -3598,6 +3563,34 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

+            // FIXME(auxfilesv2): supporting multiple metadata key partitions might need initdb support as well.
+            // This code path will not be hit during regression tests. After #7099 we have a single partition
+            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
+            // to be fixed.
+
+            // For metadata, always create delta layers.
+            let delta_layer = if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single metadata keyspace"
+                );
+                let metadata_keyspace = &metadata_partition.parts[0];
+                self.create_delta_layer(
+                    &frozen_layer,
+                    Some(
+                        metadata_keyspace.0.ranges.first().unwrap().start
+                            ..metadata_keyspace.0.ranges.last().unwrap().end,
+                    ),
+                    ctx,
+                )
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
+            } else {
+                None
+            };
+
+            // For image layers, we add them immediately into the layer map.
            let mut layers_to_upload = Vec::new();
            layers_to_upload.extend(
                self.create_image_layers(
@@ -3608,27 +3601,13 @@ impl Timeline {
                )
                .await?,
            );
-            if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
-                );
-                layers_to_upload.extend(
-                    self.create_image_layers(
-                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
-                        // every single key within the keyspace, and therefore, it's safe to force converting it
-                        // into a dense keyspace before calling this function.
-                        &metadata_partition.into_dense(),
-                        self.initdb_lsn,
-                        ImageLayerCreationMode::Initial,
-                        ctx,
-                    )
-                    .await?,
-                );
-            }

-            (layers_to_upload, None)
+            if let Some(delta_layer) = delta_layer {
+                layers_to_upload.push(delta_layer.clone());
+                (layers_to_upload, Some(delta_layer))
+            } else {
+                (layers_to_upload, None)
+            }
        } else {
            // Normal case, write out a L0 delta layer file.
            // `create_delta_layer` will not modify the layer map.
@@ -4038,6 +4017,8 @@ impl Timeline {
        mode: ImageLayerCreationMode,
        start: Key,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
+
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::default();
        let data = self
@@ -4203,13 +4184,15 @@ impl Timeline {
                        "metadata keys must be partitioned separately"
                    );
                }
+                if mode == ImageLayerCreationMode::Initial {
+                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
+                }
                if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
                    // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
                    // might mess up with evictions.
                    start = img_range.end;
                    continue;
                }
-                // For initial and force modes, we always generate image layers for metadata keys.
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4217,8 +4200,7 @@ impl Timeline {
                    start = img_range.end;
                    continue;
                }
-            }
-            if let ImageLayerCreationMode::Force = mode {
+            } else if let ImageLayerCreationMode::Force = mode {
                // When forced to create image layers, we might try and create them where they already
                // exist.  This mode is only used in tests/debug.
                let layers = self.layers.read().await;
@@ -4232,7 +4214,6 @@ impl Timeline {
                        img_range.start,
                        img_range.end
                    );
-                    start = img_range.end;
                    continue;
                }
            }
@@ -4521,7 +4502,6 @@ impl DurationRecorder {
 /// the layer descriptor requires the user to provide the ranges, which should cover all
 /// keys specified in the `data` field.
 #[cfg(test)]
-#[derive(Clone)]
 pub struct DeltaLayerTestDesc {
    pub lsn_range: Range<Lsn>,
    pub key_range: Range<Key>,
@@ -4551,13 +4531,6 @@ impl DeltaLayerTestDesc {
            data,
        }
    }
-
-    pub(crate) fn layer_name(&self) -> LayerName {
-        LayerName::Delta(super::storage_layer::DeltaLayerName {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-        })
-    }
 }

 impl Timeline {
@@ -4588,7 +4561,7 @@ impl Timeline {
                // for compact_level0_phase1 creating an L0, which does not happen in practice
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
+            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
                return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
            } else {
                insert_layers.push(l.clone());
@@ -5155,7 +5128,7 @@ impl Timeline {
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "download all remote layers task",
            async move {
@@ -5583,6 +5556,44 @@ enum OpenLayerAction {
 }

 impl<'a> TimelineWriter<'a> {
+    /// Put a new page version that can be constructed from a WAL record
+    ///
+    /// This will implicitly extend the relation, if the page is beyond the
+    /// current end-of-file.
+    pub(crate) async fn put(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Avoid doing allocations for "small" values.
+        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+        value.ser_into(&mut buf)?;
+        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
+
+        let action = self.get_open_layer_action(lsn, buf_size);
+        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
+        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
+        }
+
+        res
+    }
+
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
@@ -5688,58 +5699,18 @@ impl<'a> TimelineWriter<'a> {
    }

    /// Put a batch of keys at the specified Lsns.
+    ///
+    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
    pub(crate) async fn put_batch(
        &mut self,
-        batch: Vec<(CompactKey, Lsn, usize, Value)>,
+        batch: VecMap<Lsn, (Key, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if batch.is_empty() {
-            return Ok(());
+        for (lsn, (key, val)) in batch {
+            self.put(key, lsn, &val, ctx).await?
        }

-        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
-        let batch_max_lsn = serialized_batch.max_lsn;
-        let buf_size: u64 = serialized_batch.raw.len() as u64;
-
-        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
-        let layer = self
-            .handle_open_layer_action(batch_max_lsn, action, ctx)
-            .await?;
-
-        let res = layer.put_batch(serialized_batch, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(batch_max_lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
-        }
-
-        res
-    }
-
-    #[cfg(test)]
-    /// Test helper, for tests that would like to poke individual values without composing a batch
-    pub(crate) async fn put(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        value: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        use utils::bin_ser::BeSer;
-        let val_ser_size = value.serialized_size().unwrap() as usize;
-        self.put_batch(
-            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
-            ctx,
-        )
-        .await
+        Ok(())
    }

    pub(crate) async fn delete_batch(
@@ -5783,110 +5754,12 @@ fn is_send() {

 #[cfg(test)]
 mod tests {
-    use pageserver_api::key::Key;
    use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        repository::Value,
-        tenant::{
-            harness::{test_img, TenantHarness},
-            layer_map::LayerMap,
-            storage_layer::{Layer, LayerName},
-            timeline::{DeltaLayerTestDesc, EvictionError},
-            Timeline,
-        },
+    use crate::tenant::{
+        harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
    };

-    #[tokio::test]
-    async fn test_heatmap_generation() {
-        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
-
-        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
-            Lsn(0x10)..Lsn(0x20),
-            vec![(
-                Key::from_hex("620000000033333333444444445500000000").unwrap(),
-                Lsn(0x11),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
-            Lsn(0x10)..Lsn(0x20),
-            vec![(
-                Key::from_hex("720000000033333333444444445500000000").unwrap(),
-                Lsn(0x11),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let l0_delta = DeltaLayerTestDesc::new(
-            Lsn(0x20)..Lsn(0x30),
-            Key::from_hex("000000000000000000000000000000000000").unwrap()
-                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
-            vec![(
-                Key::from_hex("720000000033333333444444445500000000").unwrap(),
-                Lsn(0x25),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let delta_layers = vec![
-            covered_delta.clone(),
-            visible_delta.clone(),
-            l0_delta.clone(),
-        ];
-
-        let image_layer = (
-            Lsn(0x40),
-            vec![(
-                Key::from_hex("620000000033333333444444445500000000").unwrap(),
-                test_img("bar"),
-            )],
-        );
-        let image_layers = vec![image_layer];
-
-        let (tenant, ctx) = harness.load().await;
-        let timeline = tenant
-            .create_test_timeline_with_layers(
-                TimelineId::generate(),
-                Lsn(0x10),
-                14,
-                &ctx,
-                delta_layers,
-                image_layers,
-                Lsn(0x100),
-            )
-            .await
-            .unwrap();
-
-        // Layer visibility is an input to heatmap generation, so refresh it first
-        timeline.update_layer_visibility().await.unwrap();
-
-        let heatmap = timeline
-            .generate_heatmap()
-            .await
-            .expect("Infallible while timeline is not shut down");
-
-        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
-
-        // L0 should come last
-        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
-
-        let mut last_lsn = Lsn::MAX;
-        for layer in heatmap.layers {
-            // Covered layer should be omitted
-            assert!(layer.name != covered_delta.layer_name());
-
-            let layer_lsn = match &layer.name {
-                LayerName::Delta(d) => d.lsn_range.end,
-                LayerName::Image(i) => i.lsn,
-            };
-
-            // Apart from L0s, newest Layers should come first
-            if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
-                assert!(layer_lsn <= last_lsn);
-                last_lsn = layer_lsn;
-            }
-        }
-    }
-
    #[tokio::test]
    async fn two_layer_eviction_attempts_at_the_same_time() {
        let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -855,11 +855,21 @@ impl Timeline {
                merge_iter: MergeIterator<'a>,
            },
            ValidatingStreamingKmergeBypassingPageCache {
-                mode: CompactL0BypassPageCacheValidation,
+                what: ValidationWhat,
+                concurrency: ValidationIoConcurrency,
                merge_iter: MergeIterator<'a>,
                all_keys_iter: VecIter<'a>,
            },
        }
+        enum ValidationIoConcurrency {
+            Sequential,
+            Concurrent,
+        }
+        enum ValidationWhat {
+            Nothing,
+            KeyLsn,
+            KeyLsnValue,
+        }
        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
        impl AllValuesIter<'_> {
            async fn next_all_keys_iter(
@@ -887,10 +897,18 @@ impl Timeline {
                      Self::next_all_keys_iter(iter, ctx).await
                    }
                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
-                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
-                        // advance both iterators
-                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
-                        let merge_iter_item = merge_iter.next().await;
+                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { what, concurrency, merge_iter, all_keys_iter } => async {
+                        // advance both iterators; in `Concurrent` mode the two futures are joined (concurrency, but no parallelism).
+                        let all_keys_iter_item_fut = Self::next_all_keys_iter(all_keys_iter, ctx);
+                        let merge_iter_item_fut = merge_iter.next();
+                        let (all_keys_iter_item, merge_iter_item) = match concurrency {
+                            ValidationIoConcurrency::Sequential => {
+                                (all_keys_iter_item_fut.await, merge_iter_item_fut.await)
+                            },
+                            ValidationIoConcurrency::Concurrent => {
+                                futures::future::join(all_keys_iter_item_fut, merge_iter_item_fut).await
+                            },
+                        };
                        // compare results & log warnings as needed
                        macro_rules! rate_limited_warn {
                            ($($arg:tt)*) => {{
@@ -928,16 +946,17 @@ impl Timeline {
                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
                            }
                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
-                                match mode {
+                                match what {
+                                    ValidationWhat::Nothing => { }
                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
-                                    CompactL0BypassPageCacheValidation::KeyLsn => {
+                                    ValidationWhat::KeyLsn => {
                                        let all_keys = (all_keys_key, all_keys_lsn);
                                        let merge = (merge_key, merge_lsn);
                                        if all_keys != merge {
                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
                                        }
                                    }
-                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
+                                    ValidationWhat::KeyLsnValue => {
                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
                                        let merge = (merge_key, merge_lsn, merge_value);
                                        if all_keys != merge {
@@ -949,7 +968,7 @@ impl Timeline {
                        }
                        // in case of mismatch, trust the legacy all_keys_iter_item
                        all_keys_iter_item
-                    }.instrument(info_span!("next")).await
+                    }.await
                }
            }
        }
@@ -969,7 +988,32 @@ impl Timeline {
                match validate {
                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
-                        mode: validate.clone(),
+                        what: match &validate {
+                            CompactL0BypassPageCacheValidation::JustReadBoth
+                            | CompactL0BypassPageCacheValidation::JustReadBothConcurrentIo => {
+                                ValidationWhat::Nothing
+                            }
+                            CompactL0BypassPageCacheValidation::KeyLsn
+                            | CompactL0BypassPageCacheValidation::KeyLsnConcurrentIo => {
+                                ValidationWhat::KeyLsn
+                            }
+                            CompactL0BypassPageCacheValidation::KeyLsnValue
+                            | CompactL0BypassPageCacheValidation::KeyLsnValueConcurrentIo => {
+                                ValidationWhat::KeyLsnValue
+                            }
+                        },
+                        concurrency: match validate {
+                            CompactL0BypassPageCacheValidation::JustReadBothConcurrentIo
+                            | CompactL0BypassPageCacheValidation::KeyLsnConcurrentIo
+                            | CompactL0BypassPageCacheValidation::KeyLsnValueConcurrentIo => {
+                                ValidationIoConcurrency::Concurrent
+                            }
+                            CompactL0BypassPageCacheValidation::JustReadBoth
+                            | CompactL0BypassPageCacheValidation::KeyLsn
+                            | CompactL0BypassPageCacheValidation::KeyLsnValue => {
+                                ValidationIoConcurrency::Sequential
+                            }
+                        },
                        merge_iter,
                        all_keys_iter: all_keys.iter(),
                    },
@@ -1389,11 +1433,18 @@ pub enum CompactL0Phase1ValueAccess {
 /// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
 #[serde(rename_all = "kebab-case")]
+#[allow(clippy::enum_variant_names)]
 pub enum CompactL0BypassPageCacheValidation {
+    JustReadBoth,
+    JustReadBothConcurrentIo,
    /// Validate that the series of (key, lsn) pairs are the same.
    KeyLsn,
+    /// Like [`Self::KeyLsn`], but perform the IO concurrently.
+    KeyLsnConcurrentIo,
    /// Validate that the entire output of old and new way is identical.
    KeyLsnValue,
+    // Like [`KeyLsnValue`], but perform the IO concurrently.
+    KeyLsnValueConcurrentIo,
 }

 impl Default for CompactL0Phase1ValueAccess {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            tenant_shard_id,
+            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
            async move {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            &format!(
                "layer eviction for {}/{}",
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    pgdatadir_mapping::DatadirModification,
-    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
@@ -345,10 +345,7 @@ pub(super) async fn handle_walreceiver_connection(
                        // Commit every ingest_batch_size records. Even if we filtered out
                        // all records, we still need to call commit to advance the LSN.
                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size
-                            || modification.approx_pending_bytes()
-                                > DatadirModification::MAX_PENDING_BYTES
-                        {
+                        if uncommitted_records >= ingest_batch_size {
                            WAL_INGEST
                                .records_committed
                                .inc_by(uncommitted_records - filtered_records);
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -25,10 +25,9 @@ use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;

-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::VirtualFile;

 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -61,7 +60,7 @@ pub struct VectoredBlobsBuf {
 pub struct VectoredRead {
    pub start: u64,
    pub end: u64,
-    /// Start offset and metadata for each blob in this read
+    /// Starting offsets and metadata for each blob in this read
    pub blobs_at: VecMap<u64, BlobMeta>,
 }

@@ -77,109 +76,14 @@ pub(crate) enum VectoredReadExtended {
    No,
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum VectoredReadCoalesceMode {
-    /// Only coalesce exactly adjacent reads.
-    AdjacentOnly,
-    /// In addition to adjacent reads, also consider reads whose corresponding
-    /// `end` and `start` offsets reside at the same chunk.
-    Chunked(usize),
-}
-
-impl VectoredReadCoalesceMode {
-    /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
-    /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
-    pub(crate) fn get() -> Self {
-        let align = virtual_file::get_io_buffer_alignment_raw();
-        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
-            VectoredReadCoalesceMode::AdjacentOnly
-        } else {
-            VectoredReadCoalesceMode::Chunked(align)
-        }
-    }
-}
-
-pub(crate) enum VectoredReadBuilder {
-    Adjacent(AdjacentVectoredReadBuilder),
-    Chunked(ChunkedVectoredReadBuilder),
-}
-
-impl VectoredReadBuilder {
-    fn new_impl(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        match mode {
-            VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
-                AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
-            ),
-            VectoredReadCoalesceMode::Chunked(chunk_size) => {
-                Self::Chunked(ChunkedVectoredReadBuilder::new(
-                    start_offset,
-                    end_offset,
-                    meta,
-                    max_read_size,
-                    chunk_size,
-                ))
-            }
-        }
-    }
-
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: usize,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
-    }
-
-    pub(crate) fn new_streaming(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, None, mode)
-    }
-
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
-            VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
-        }
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.build(),
-            VectoredReadBuilder::Chunked(builder) => builder.build(),
-        }
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.size(),
-            VectoredReadBuilder::Chunked(builder) => builder.size(),
-        }
-    }
-}
-
-pub(crate) struct AdjacentVectoredReadBuilder {
-    /// Start offset of the read.
+pub(crate) struct VectoredReadBuilder {
    start: u64,
-    // End offset of the read.
    end: u64,
-    /// Start offset and metadata for each blob in this read
    blobs_at: VecMap<u64, BlobMeta>,
    max_read_size: Option<usize>,
 }

-impl AdjacentVectoredReadBuilder {
+impl VectoredReadBuilder {
    /// Start building a new vectored read.
    ///
    /// Note that by design, this does not check against reading more than `max_read_size` to
@@ -189,7 +93,7 @@ impl AdjacentVectoredReadBuilder {
        start_offset: u64,
        end_offset: u64,
        meta: BlobMeta,
-        max_read_size: Option<usize>,
+        max_read_size: usize,
    ) -> Self {
        let mut blobs_at = VecMap::default();
        blobs_at
@@ -200,7 +104,7 @@ impl AdjacentVectoredReadBuilder {
            start: start_offset,
            end: end_offset,
            blobs_at,
-            max_read_size,
+            max_read_size: Some(max_read_size),
        }
    }
    /// Attempt to extend the current read with a new blob if the start
@@ -209,15 +113,13 @@ impl AdjacentVectoredReadBuilder {
    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
        tracing::trace!(start, end, "trying to extend");
        let size = (end - start) as usize;
-        let not_limited_by_max_read_size = {
+        if self.end == start && {
            if let Some(max_read_size) = self.max_read_size {
                self.size() + size <= max_read_size
            } else {
                true
            }
-        };
-
-        if self.end == start && not_limited_by_max_read_size {
+        } {
            self.end = end;
            self.blobs_at
                .append(start, meta)
@@ -242,107 +144,6 @@ impl AdjacentVectoredReadBuilder {
    }
 }

-pub(crate) struct ChunkedVectoredReadBuilder {
-    /// Start block number
-    start_blk_no: usize,
-    /// End block number (exclusive).
-    end_blk_no: usize,
-    /// Start offset and metadata for each blob in this read
-    blobs_at: VecMap<u64, BlobMeta>,
-    max_read_size: Option<usize>,
-    /// Chunk size reads are coalesced into.
-    chunk_size: usize,
-}
-
-/// Computes x / d rounded up.
-fn div_round_up(x: usize, d: usize) -> usize {
-    (x + (d - 1)) / d
-}
-
-impl ChunkedVectoredReadBuilder {
-    /// Start building a new vectored read.
-    ///
-    /// Note that by design, this does not check against reading more than `max_read_size` to
-    /// support reading larger blobs than the configuration value. The builder will be single use
-    /// however after that.
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-        chunk_size: usize,
-    ) -> Self {
-        let mut blobs_at = VecMap::default();
-        blobs_at
-            .append(start_offset, meta)
-            .expect("First insertion always succeeds");
-
-        let start_blk_no = start_offset as usize / chunk_size;
-        let end_blk_no = div_round_up(end_offset as usize, chunk_size);
-        Self {
-            start_blk_no,
-            end_blk_no,
-            blobs_at,
-            max_read_size,
-            chunk_size,
-        }
-    }
-
-    /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
-    ///
-    /// The resulting size also must be below the max read size.
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        tracing::trace!(start, end, "trying to extend");
-        let start_blk_no = start as usize / self.chunk_size;
-        let end_blk_no = div_round_up(end as usize, self.chunk_size);
-
-        let not_limited_by_max_read_size = {
-            if let Some(max_read_size) = self.max_read_size {
-                let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size;
-                coalesced_size <= max_read_size
-            } else {
-                true
-            }
-        };
-
-        // True if the second block starts in the same block or the immediate next block where the first block ended.
-        //
-        // Note: This automatically handles the case where two blocks are adjacent to each other,
-        // whether they starts on chunk size boundary or not.
-        let is_adjacent_chunk_read = {
-            // 1. first.end & second.start are in the same block
-            self.end_blk_no == start_blk_no + 1 ||
-            // 2. first.end ends one block before second.start
-            self.end_blk_no == start_blk_no
-        };
-
-        if is_adjacent_chunk_read && not_limited_by_max_read_size {
-            self.end_blk_no = end_blk_no;
-            self.blobs_at
-                .append(start, meta)
-                .expect("LSNs are ordered within vectored reads");
-
-            return VectoredReadExtended::Yes;
-        }
-
-        VectoredReadExtended::No
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        (self.end_blk_no - self.start_blk_no) * self.chunk_size
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        let start = (self.start_blk_no * self.chunk_size) as u64;
-        let end = (self.end_blk_no * self.chunk_size) as u64;
-        VectoredRead {
-            start,
-            end,
-            blobs_at: self.blobs_at,
-        }
-    }
-}
-
 #[derive(Copy, Clone, Debug)]
 pub enum BlobFlag {
    None,
@@ -365,18 +166,14 @@ pub struct VectoredReadPlanner {
    prev: Option<(Key, Lsn, u64, BlobFlag)>,

    max_read_size: usize,
-
-    mode: VectoredReadCoalesceMode,
 }

 impl VectoredReadPlanner {
    pub fn new(max_read_size: usize) -> Self {
-        let mode = VectoredReadCoalesceMode::get();
        Self {
            blobs: BTreeMap::new(),
            prev: None,
            max_read_size,
-            mode,
        }
    }

@@ -455,7 +252,6 @@ impl VectoredReadPlanner {
                        end_offset,
                        BlobMeta { key, lsn },
                        self.max_read_size,
-                        self.mode,
                    );

                    let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -507,18 +303,6 @@ impl<'a> VectoredBlobReader<'a> {
            read.size(),
            buf.capacity()
        );
-
-        if cfg!(debug_assertions) {
-            let align = virtual_file::get_io_buffer_alignment() as u64;
-            debug_assert_eq!(
-                read.start % align,
-                0,
-                "Read start at {} does not satisfy the required io buffer alignment ({} bytes)",
-                read.start,
-                align
-            );
-        }
-
        let mut buf = self
            .file
            .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
@@ -526,20 +310,27 @@ impl<'a> VectoredBlobReader<'a> {
            .into_inner();

        let blobs_at = read.blobs_at.as_slice();
-
-        let start_offset = read.start;
+        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;

        let mut metas = Vec::with_capacity(blobs_at.len());
+
        // Blobs in `read` only provide their starting offset. The end offset
        // of a blob is implicit: the start of the next blob if one exists
        // or the end of the read.
+        let pairs = blobs_at.iter().zip(
+            blobs_at
+                .iter()
+                .map(Some)
+                .skip(1)
+                .chain(std::iter::once(None)),
+        );

        // Some scratch space, put here for reusing the allocation
        let mut decompressed_vec = Vec::new();

-        for (blob_start, meta) in blobs_at {
-            let blob_start_in_buf = blob_start - start_offset;
-            let first_len_byte = buf[blob_start_in_buf as usize];
+        for ((offset, meta), next) in pairs {
+            let offset_in_buf = offset - start_offset;
+            let first_len_byte = buf[offset_in_buf as usize];

            // Each blob is prefixed by a header containing its size and compression information.
            // Extract the size and skip that header to find the start of the data.
@@ -549,7 +340,7 @@ impl<'a> VectoredBlobReader<'a> {
                (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
            } else {
                let mut blob_size_buf = [0u8; 4];
-                let offset_in_buf = blob_start_in_buf as usize;
+                let offset_in_buf = offset_in_buf as usize;

                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
                blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
@@ -562,8 +353,12 @@ impl<'a> VectoredBlobReader<'a> {
                )
            };

-            let start_raw = blob_start_in_buf + size_length;
-            let end_raw = start_raw + blob_size;
+            let start_raw = offset_in_buf + size_length;
+            let end_raw = match next {
+                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
+                None => start_raw + blob_size,
+            };
+            assert_eq!(end_raw - start_raw, blob_size);
            let (start, end);
            if compression_bits == BYTE_UNCOMPRESSED {
                start = start_raw as usize;
@@ -612,22 +407,18 @@ pub struct StreamingVectoredReadPlanner {
    max_cnt: usize,
    /// Size of the current batch
    cnt: usize,
-
-    mode: VectoredReadCoalesceMode,
 }

 impl StreamingVectoredReadPlanner {
    pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
        assert!(max_cnt > 0);
        assert!(max_read_size > 0);
-        let mode = VectoredReadCoalesceMode::get();
        Self {
            read_builder: None,
            prev: None,
            max_cnt,
            max_read_size,
            cnt: 0,
-            mode,
        }
    }

@@ -676,12 +467,17 @@ impl StreamingVectoredReadPlanner {
            }
            None => {
                self.read_builder = {
-                    Some(VectoredReadBuilder::new_streaming(
-                        start_offset,
-                        end_offset,
-                        BlobMeta { key, lsn },
-                        self.mode,
-                    ))
+                    let mut blobs_at = VecMap::default();
+                    blobs_at
+                        .append(start_offset, BlobMeta { key, lsn })
+                        .expect("First insertion always succeeds");
+
+                    Some(VectoredReadBuilder {
+                        start: start_offset,
+                        end: end_offset,
+                        blobs_at,
+                        max_read_size: None,
+                    })
                };
            }
        }
@@ -715,9 +511,7 @@ mod tests {
    use super::*;

    fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
-        let align = virtual_file::get_io_buffer_alignment() as u64;
-        assert_eq!(read.start % align, 0);
-        assert_eq!(read.start / align, offset_range.first().unwrap().2 / align);
+        assert_eq!(read.start, offset_range.first().unwrap().2);

        let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();

@@ -731,63 +525,6 @@ mod tests {
        assert_eq!(expected_offsets_in_read, offsets_in_read);
    }

-    #[test]
-    fn planner_chunked_coalesce_all_test() {
-        use crate::virtual_file;
-
-        const CHUNK_SIZE: u64 = 512;
-        virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap();
-        let max_read_size = CHUNK_SIZE as usize * 8;
-        let key = Key::MIN;
-        let lsn = Lsn(0);
-
-        let blob_descriptions = [
-            (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN
-            (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE / 2, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
-            (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
-            (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk
-            (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
-        ];
-
-        let ranges = [
-            &[
-                blob_descriptions[0],
-                blob_descriptions[2],
-                blob_descriptions[4],
-                blob_descriptions[5],
-                blob_descriptions[7],
-                blob_descriptions[8],
-                blob_descriptions[10],
-            ],
-            &blob_descriptions[11..12],
-            &blob_descriptions[13..],
-        ];
-
-        let mut planner = VectoredReadPlanner::new(max_read_size);
-        for (key, lsn, offset, flag) in blob_descriptions {
-            planner.handle(key, lsn, offset, flag);
-        }
-
-        planner.handle_range_end(652 * 1024);
-
-        let reads = planner.finish();
-
-        assert_eq!(reads.len(), ranges.len());
-
-        for (idx, read) in reads.iter().enumerate() {
-            validate_read(read, ranges[idx]);
-        }
-    }
-
    #[test]
    fn planner_max_read_size_test() {
        let max_read_size = 128 * 1024;
@@ -1000,7 +737,6 @@ mod tests {
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
        let mut buf = BytesMut::with_capacity(reserved_bytes);

-        let mode = VectoredReadCoalesceMode::get();
        let vectored_blob_reader = VectoredBlobReader::new(&file);
        let meta = BlobMeta {
            key: Key::MIN,
@@ -1012,7 +748,7 @@ mod tests {
            if idx + 1 == offsets.len() {
                continue;
            }
-            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
+            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
            let read = read_builder.build();
            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
            assert_eq!(result.blobs.len(), 1);
@@ -1048,12 +784,4 @@ mod tests {
        round_trip_test_compressed(&blobs, true).await?;
        Ok(())
    }
-
-    #[test]
-    fn test_div_round_up() {
-        const CHUNK_SIZE: usize = 512;
-        assert_eq!(1, div_round_up(200, CHUNK_SIZE));
-        assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
-        assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
-    }
 }
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;

 use pageserver_api::models::PageserverUtilization;

-use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
+use crate::{config::PageServerConf, tenant::mgr::TenantManager};

 pub(crate) fn regenerate(
    conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
        disk_usable_pct,
        shard_count,
        max_shard_count: MAX_SHARDS,
-        utilization_score: None,
+        utilization_score: 0,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };

-    // Initialize `PageserverUtilization::utilization_score`
-    let score = doc.cached_score();
-    NODE_UTILIZATION_SCORE.set(score);
+    doc.refresh_score();
+
+    // TODO: make utilization_score into a metric

    Ok(doc)
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -10,7 +10,6 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};

@@ -757,23 +756,11 @@ impl VirtualFile {
        })
    }

-    /// The function aborts the process if the error is fatal.
    async fn write_at<B: IoBuf + Send>(
        &self,
        buf: FullSlice<B>,
        offset: u64,
        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
-    ) -> (FullSlice<B>, Result<usize, Error>) {
-        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
-        let result = result.maybe_fatal_err("write_at");
-        (slice, result)
-    }
-
-    async fn write_at_inner<B: IoBuf + Send>(
-        &self,
-        buf: FullSlice<B>,
-        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -1141,13 +1128,10 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
+pub fn init(num_slots: usize, engine: IoEngineKind) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
-    if set_io_buffer_alignment(io_buffer_alignment).is_err() {
-        panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
-    }
    io_engine::init(engine);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
@@ -1171,61 +1155,6 @@ fn get_open_files() -> &'static OpenFiles {
    }
 }

-static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
-
-/// Returns true if `x` is zero or a power of two.
-fn is_zero_or_power_of_two(x: usize) -> bool {
-    (x == 0) || ((x & (x - 1)) == 0)
-}
-
-#[allow(unused)]
-pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
-    if is_zero_or_power_of_two(align) {
-        IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
-        Ok(())
-    } else {
-        Err(align)
-    }
-}
-
-/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
-///
-/// This function should be used to check the raw config value.
-pub(crate) fn get_io_buffer_alignment_raw() -> usize {
-    let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
-
-    if cfg!(test) {
-        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
-        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
-            if let Some(test_align) = utils::env::var(env_var_name) {
-                if is_zero_or_power_of_two(test_align) {
-                    test_align
-                } else {
-                    panic!("IO buffer alignment ({test_align}) is not a power of two");
-                }
-            } else {
-                crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT
-            }
-        } else {
-            align
-        }
-    } else {
-        align
-    }
-}
-
-/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
-///
-/// This function should be used for getting the actual alignment value to use.
-pub(crate) fn get_io_buffer_alignment() -> usize {
-    let align = get_io_buffer_alignment_raw();
-    if align == DEFAULT_IO_BUFFER_ALIGNMENT {
-        1
-    } else {
-        align
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -41,8 +41,6 @@

 #include "hll.h"

-#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
-
 /*
 * Local file cache is used to temporary store relations pages in local file system.
 * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -53,43 +51,19 @@
 *
 * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
 * its consistency.
-
- *
- * ## Holes
- *
- * The LFC can be resized on the fly, up to a maximum size that's determined
- * at server startup (neon.max_file_cache_size). After server startup, we
- * expand the underlying file when needed, until it reaches the soft limit
- * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
- * the LFC by punching holes in the underlying file with a
- * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
- * shrink, but the disk space it uses does.
- *
- * Each hole is tracked by a dummy FileCacheEntry, which are kept in the
- * 'holes' linked list. They are entered into the chunk hash table, with a
- * special key where the blockNumber is used to store the 'offset' of the
- * hole, and all other fields are zero. Holes are never looked up in the hash
- * table, we only enter them there to have a FileCacheEntry that we can keep
- * in the linked list. If the soft limit is raised again, we reuse the holes
- * before extending the nominal size of the file.
 */

 /* Local file storage allocation chunk.
- * Should be power of two. Using larger than page chunks can
+ * Should be power of two and not less than 32. Using larger than page chunks can
 * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
 *    and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
 *    1Mb chunks can reduce hash map size to 320Mb.
 * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
 */
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
-/*
- * Smaller chunk seems to be better for OLTP workload
- */
-// #define BLOCKS_PER_CHUNK	8 /* 64kb chunk */
 #define MB					((uint64)1024*1024)

 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
-#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)

 typedef struct FileCacheEntry
 {
@@ -97,8 +71,8 @@ typedef struct FileCacheEntry
 	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[CHUNK_BITMAP_SIZE];
-	dlist_node	list_node;		/* LRU/holes list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
+	dlist_node	lru_node;		/* LRU list node */
 } FileCacheEntry;

 typedef struct FileCacheControl
@@ -113,7 +87,6 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
-	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;

@@ -162,7 +135,6 @@ lfc_disable(char const *op)
 		lfc_ctl->used = 0;
 		lfc_ctl->limit = 0;
 		dlist_init(&lfc_ctl->lru);
-		dlist_init(&lfc_ctl->holes);

 		if (lfc_desc > 0)
 		{
@@ -242,18 +214,18 @@ lfc_shmem_startup(void)
 	if (!found)
 	{
 		int			fd;
-		uint32		n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);

 		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);

 		/*
-		 * n_chunks+1 because we add new element to hash table before eviction
+		 * lfc_size+1 because we add new element to hash table before eviction
 		 * of victim
 		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 n_chunks + 1, n_chunks + 1,
+								 lfc_size + 1, lfc_size + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -263,7 +235,6 @@ lfc_shmem_startup(void)
 		lfc_ctl->misses = 0;
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
-		dlist_init(&lfc_ctl->holes);

 		/* Initialize hyper-log-log structure for estimating working set size */
 		initSHLL(&lfc_ctl->wss_estimation);
@@ -339,31 +310,14 @@ lfc_change_limit_hook(int newval, void *extra)
 		 * Shrink cache by throwing away least recently accessed chunks and
 		 * returning their space to file system
 		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-		FileCacheEntry *hole;
-		uint32		offset = victim->offset;
-		uint32		hash;
-		bool		found;
-		BufferTag	holetag;
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

-		CriticalAssert(victim->access_count == 0);
+		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
 		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
 			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
-		/* We remove the old entry, and re-enter a hole to the hash table */
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-
-		memset(&holetag, 0, sizeof(holetag));
-		holetag.blockNum = offset;
-		hash = get_hash_value(lfc_hash, &holetag);
-		hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
-		hole->hash = hash;
-		hole->offset = offset;
-		hole->access_count = 0;
-		CriticalAssert(!found);
-		dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
-
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
@@ -455,8 +409,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -488,7 +440,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	tag.forkNum = forkNum;
 	tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));

-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -519,7 +470,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	{
 		bool		has_remaining_pages;

-		for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
 		{
 			if (entry->bitmap[i] != 0)
 			{
@@ -534,8 +485,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		 */
 		if (!has_remaining_pages)
 		{
-			dlist_delete(&entry->list_node);
-			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
+			dlist_delete(&entry->lru_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
 		}
 	}

@@ -574,8 +525,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -602,7 +551,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 	/* Unlink entry from LRU list to pin it for the duration of IO operation */
 	if (entry->access_count++ == 0)
-		dlist_delete(&entry->list_node);
+		dlist_delete(&entry->lru_node);
 	generation = lfc_ctl->generation;
 	entry_offset = entry->offset;

@@ -620,12 +569,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	if (lfc_ctl->generation == generation)
 	{
-		CriticalAssert(LFC_ENABLED());
+		Assert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
 		pgBufferUsage.file_cache.hits += 1;
-		CriticalAssert(entry->access_count > 0);
+		Assert(entry->access_count > 0);
 		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
 	}
 	else
 		result = false;
@@ -664,8 +613,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -685,7 +632,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		 * operation
 		 */
 		if (entry->access_count++ == 0)
-			dlist_delete(&entry->list_node);
+			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
@@ -708,26 +655,13 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

-			CriticalAssert(victim->access_count == 0);
+			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 			neon_log(DEBUG2, "Swap file cache page");
 		}
-		else if (!dlist_is_empty(&lfc_ctl->holes))
-		{
-			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-			uint32		offset = hole->offset;
-			bool		found;
-
-			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
-			CriticalAssert(found);
-
-			lfc_ctl->used += 1;
-			entry->offset = offset;	/* reuse the hole */
-		}
 		else
 		{
 			lfc_ctl->used += 1;
@@ -755,11 +689,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void

 		if (lfc_ctl->generation == generation)
 		{
-			CriticalAssert(LFC_ENABLED());
+			Assert(LFC_ENABLED());
 			/* Place entry to the head of LRU list */
-			CriticalAssert(entry->access_count > 0);
+			Assert(entry->access_count > 0);
 			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+				dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);

 			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
 		}
@@ -774,6 +708,7 @@ typedef struct
 } NeonGetStatsCtx;

 #define NUM_NEON_GET_STATS_COLS	2
+#define NUM_NEON_GET_STATS_ROWS	3

 PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
 Datum
@@ -809,6 +744,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 						   INT8OID, -1, 0);

 		fctx->tupdesc = BlessTupleDesc(tupledesc);
+		funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
 		funcctx->user_fctx = fctx;

 		/* Return to original context when allocating transient memory */
@@ -842,11 +778,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->writes;
 			break;
-		case 4:
-			key = "file_cache_size";
-			if (lfc_ctl)
-				value = lfc_ctl->size;
-			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
@@ -970,7 +901,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
+					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -192,13 +192,6 @@ LogicalSlotsMonitorMain(Datum main_arg)
 	{
 		XLogRecPtr	cutoff_lsn;

-		/* In case of a SIGHUP, just reload the configuration. */
-		if (ConfigReloadPending)
-		{
-			ConfigReloadPending = false;
-			ProcessConfigFile(PGC_SIGHUP);
-		}
-
 		/*
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -54,10 +54,6 @@

 #define BufTagGetNRelFileInfo(tag) tag.rnode

-#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
-
-#define InvalidRelFileNumber InvalidOid
-
 #define SMgrRelGetRelInfo(reln) \
 	(reln->smgr_rnode.node)

--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -110,8 +110,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)

 		tag.rinfo = rinfo;
 		tag.forknum = forknum;
-		/* We need exclusive lock here because of LRU list manipulation */
-		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		LWLockAcquire(relsize_lock, LW_SHARED);
 		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
 		if (entry != NULL)
 		{
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -2,7 +2,6 @@

 import argparse
 import enum
-import os
 import subprocess
 import sys
 from typing import List
@@ -94,7 +93,7 @@ if __name__ == "__main__":
        "--no-color",
        action="store_true",
        help="disable colored output",
-        default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
+        default=not sys.stdout.isatty(),
    )
    args = parser.parse_args()

--- a/proxy/README.md
+++ b/proxy/README.md
@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
 ```

 If both postgres and proxy are running you may send a SQL query:
-```console
+```json
 curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
  -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
  -H 'Content-Type: application/json' \
@@ -44,8 +44,7 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
    "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
    "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
  }' | jq
-```
-```json
+
 {
  "command": "SELECT",
  "fields": [
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -113,36 +113,38 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {

 impl UserFacingError for AuthError {
    fn to_string_client(&self) -> String {
+        use AuthErrorImpl::*;
        match self.0.as_ref() {
-            AuthErrorImpl::Link(e) => e.to_string_client(),
-            AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
-            AuthErrorImpl::Sasl(e) => e.to_string_client(),
-            AuthErrorImpl::AuthFailed(_) => self.to_string(),
-            AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
-            AuthErrorImpl::MalformedPassword(_) => self.to_string(),
-            AuthErrorImpl::MissingEndpointName => self.to_string(),
-            AuthErrorImpl::Io(_) => "Internal error".to_string(),
-            AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
-            AuthErrorImpl::TooManyConnections => self.to_string(),
-            AuthErrorImpl::UserTimeout(_) => self.to_string(),
+            Link(e) => e.to_string_client(),
+            GetAuthInfo(e) => e.to_string_client(),
+            Sasl(e) => e.to_string_client(),
+            AuthFailed(_) => self.to_string(),
+            BadAuthMethod(_) => self.to_string(),
+            MalformedPassword(_) => self.to_string(),
+            MissingEndpointName => self.to_string(),
+            Io(_) => "Internal error".to_string(),
+            IpAddressNotAllowed(_) => self.to_string(),
+            TooManyConnections => self.to_string(),
+            UserTimeout(_) => self.to_string(),
        }
    }
 }

 impl ReportableError for AuthError {
    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        use AuthErrorImpl::*;
        match self.0.as_ref() {
-            AuthErrorImpl::Link(e) => e.get_error_kind(),
-            AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
-            AuthErrorImpl::Sasl(e) => e.get_error_kind(),
-            AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
-            AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
-            AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
-            AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
-            AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
-            AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
-            AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
+            Link(e) => e.get_error_kind(),
+            GetAuthInfo(e) => e.get_error_kind(),
+            Sasl(e) => e.get_error_kind(),
+            AuthFailed(_) => crate::error::ErrorKind::User,
+            BadAuthMethod(_) => crate::error::ErrorKind::User,
+            MalformedPassword(_) => crate::error::ErrorKind::User,
+            MissingEndpointName => crate::error::ErrorKind::User,
+            Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
+            TooManyConnections => crate::error::ErrorKind::RateLimit,
+            UserTimeout(_) => crate::error::ErrorKind::User,
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -2,7 +2,6 @@ mod classic;
 mod hacks;
 pub mod jwt;
 mod link;
-pub mod local;

 use std::net::IpAddr;
 use std::sync::Arc;
@@ -10,7 +9,6 @@ use std::time::Duration;

 use ipnet::{Ipv4Net, Ipv6Net};
 pub use link::LinkAuthError;
-use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
@@ -70,8 +68,6 @@ pub enum BackendType<'a, T, D> {
    Console(MaybeOwned<'a, ConsoleBackend>, T),
    /// Authentication via a web browser.
    Link(MaybeOwned<'a, url::ApiUrl>, D),
-    /// Local proxy uses configured auth credentials and does not wake compute
-    Local(MaybeOwned<'a, LocalBackend>),
 }

 pub trait TestBackend: Send + Sync + 'static {
@@ -84,8 +80,9 @@ pub trait TestBackend: Send + Sync + 'static {

 impl std::fmt::Display for BackendType<'_, (), ()> {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use BackendType::*;
        match self {
-            Self::Console(api, _) => match &**api {
+            Console(api, _) => match &**api {
                ConsoleBackend::Console(endpoint) => {
                    fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                }
@@ -96,8 +93,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                #[cfg(test)]
                ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
            },
-            Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
-            Self::Local(_) => fmt.debug_tuple("Local").finish(),
+            Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
        }
    }
 }
@@ -106,10 +102,10 @@ impl<T, D> BackendType<'_, T, D> {
    /// Very similar to [`std::option::Option::as_ref`].
    /// This helps us pass structured config to async tasks.
    pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
+        use BackendType::*;
        match self {
-            Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
-            Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
-            Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
+            Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
+            Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
        }
    }
 }
@@ -119,10 +115,10 @@ impl<'a, T, D> BackendType<'a, T, D> {
    /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
    /// a function to a contained value.
    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
+        use BackendType::*;
        match self {
-            Self::Console(c, x) => BackendType::Console(c, f(x)),
-            Self::Link(c, x) => BackendType::Link(c, x),
-            Self::Local(l) => BackendType::Local(l),
+            Console(c, x) => Console(c, f(x)),
+            Link(c, x) => Link(c, x),
        }
    }
 }
@@ -130,10 +126,10 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
    /// Very similar to [`std::option::Option::transpose`].
    /// This is most useful for error handling.
    pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
+        use BackendType::*;
        match self {
-            Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
-            Self::Link(c, x) => Ok(BackendType::Link(c, x)),
-            Self::Local(l) => Ok(BackendType::Local(l)),
+            Console(c, x) => x.map(|x| Console(c, x)),
+            Link(c, x) => Ok(Link(c, x)),
        }
    }
 }
@@ -165,7 +161,6 @@ impl ComputeUserInfo {
 pub enum ComputeCredentialKeys {
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
-    None,
 }

 impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -298,9 +293,7 @@ async fn auth_quirks(
            ctx.set_endpoint_id(res.info.endpoint.clone());
            let password = match res.keys {
                ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
-                    unreachable!("password hack should return a password")
-                }
+                _ => unreachable!("password hack should return a password"),
            };
            (res.info, Some(password))
        }
@@ -407,19 +400,21 @@ async fn authenticate_with_secret(
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
    /// Get compute endpoint name from the credentials.
    pub fn get_endpoint(&self) -> Option<EndpointId> {
+        use BackendType::*;
+
        match self {
-            Self::Console(_, user_info) => user_info.endpoint_id.clone(),
-            Self::Link(_, _) => Some("link".into()),
-            Self::Local(_) => Some("local".into()),
+            Console(_, user_info) => user_info.endpoint_id.clone(),
+            Link(_, _) => Some("link".into()),
        }
    }

    /// Get username from the credentials.
    pub fn get_user(&self) -> &str {
+        use BackendType::*;
+
        match self {
-            Self::Console(_, user_info) => &user_info.user,
-            Self::Link(_, _) => "link",
-            Self::Local(_) => "local",
+            Console(_, user_info) => &user_info.user,
+            Link(_, _) => "link",
        }
    }

@@ -433,8 +428,10 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
        config: &'static AuthenticationConfig,
        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
+        use BackendType::*;
+
        let res = match self {
-            Self::Console(api, user_info) => {
+            Console(api, user_info) => {
                info!(
                    user = &*user_info.user,
                    project = user_info.endpoint(),
@@ -454,16 +451,13 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                BackendType::Console(api, credentials)
            }
            // NOTE: this auth backend doesn't use client credentials.
-            Self::Link(url, _) => {
+            Link(url, _) => {
                info!("performing link authentication");

                let info = link::authenticate(ctx, &url, client).await?;

                BackendType::Link(url, info)
            }
-            Self::Local(_) => {
-                return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
-            }
        };

        info!("user successfully authenticated");
@@ -476,10 +470,10 @@ impl BackendType<'_, ComputeUserInfo, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        use BackendType::*;
        match self {
-            Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Self::Link(_, _) => Ok(Cached::new_uncached(None)),
-            Self::Local(_) => Ok(Cached::new_uncached(None)),
+            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Link(_, _) => Ok(Cached::new_uncached(None)),
        }
    }

@@ -487,10 +481,10 @@ impl BackendType<'_, ComputeUserInfo, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
+        use BackendType::*;
        match self {
-            Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
-            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
        }
    }
 }
@@ -501,18 +495,18 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+        use BackendType::*;
+
        match self {
-            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
-            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
+            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Link(_, info) => Ok(Cached::new_uncached(info.clone())),
        }
    }

-    fn get_keys(&self) -> &ComputeCredentialKeys {
+    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
        match self {
-            Self::Console(_, creds) => &creds.keys,
-            Self::Link(_, _) => &ComputeCredentialKeys::None,
-            Self::Local(_) => &ComputeCredentialKeys::None,
+            BackendType::Console(_, creds) => Some(&creds.keys),
+            BackendType::Link(_, _) => None,
        }
    }
 }
@@ -523,18 +517,18 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+        use BackendType::*;
+
        match self {
-            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
-            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
+            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
        }
    }

-    fn get_keys(&self) -> &ComputeCredentialKeys {
+    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
        match self {
-            Self::Console(_, creds) => &creds.keys,
-            Self::Link(_, _) => &ComputeCredentialKeys::None,
-            Self::Local(_) => &ComputeCredentialKeys::None,
+            BackendType::Console(_, creds) => Some(&creds.keys),
+            BackendType::Link(_, _) => None,
        }
    }
 }
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -1,21 +1,15 @@
-use std::{
-    future::Future,
-    sync::Arc,
-    time::{Duration, SystemTime},
-};
+use std::{future::Future, sync::Arc, time::Duration};

 use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
-use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;

-use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
+use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};

 // TODO(conrad): make these configurable.
-const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
 const AUTO_RENEW: Duration = Duration::from_secs(300);
 const MAX_RENEW: Duration = Duration::from_secs(3600);
@@ -23,56 +17,30 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;

 /// How to get the JWT auth rules
 pub trait FetchAuthRules: Clone + Send + Sync + 'static {
-    fn fetch_auth_rules(
-        &self,
-        role_name: RoleName,
-    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
+    fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
 }

-pub struct AuthRule {
-    pub id: String,
-    pub jwks_url: url::Url,
-    pub audience: Option<String>,
+#[derive(Clone)]
+struct FetchAuthRulesFromCplane {
+    #[allow(dead_code)]
+    endpoint: EndpointIdInt,
+}
+
+impl FetchAuthRules for FetchAuthRulesFromCplane {
+    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+        Err(anyhow::anyhow!("not yet implemented"))
+    }
+}
+
+pub struct AuthRules {
+    jwks_urls: Vec<url::Url>,
 }

 #[derive(Default)]
 pub struct JwkCache {
    client: reqwest::Client,

-    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
-}
-
-pub struct JwkCacheEntry {
-    /// Should refetch at least every hour to verify when old keys have been removed.
-    /// Should refetch when new key IDs are seen only every 5 minutes or so
-    last_retrieved: Instant,
-
-    /// cplane will return multiple JWKs urls that we need to scrape.
-    key_sets: ahash::HashMap<String, KeySet>,
-}
-
-impl JwkCacheEntry {
-    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
-        self.key_sets.values().find_map(|key_set| {
-            key_set
-                .find_key(key_id)
-                .map(|jwk| (jwk, key_set.audience.as_deref()))
-        })
-    }
-}
-
-struct KeySet {
-    jwks: jose_jwk::JwkSet,
-    audience: Option<String>,
-}
-
-impl KeySet {
-    fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
-        self.jwks
-            .keys
-            .iter()
-            .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
-    }
+    map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
 }

 pub struct JwkCacheEntryLock {
@@ -89,6 +57,15 @@ impl Default for JwkCacheEntryLock {
    }
 }

+pub struct JwkCacheEntry {
+    /// Should refetch at least every hour to verify when old keys have been removed.
+    /// Should refetch when new key IDs are seen only every 5 minutes or so
+    last_retrieved: Instant,
+
+    /// cplane will return multiple JWKs urls that we need to scrape.
+    key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
+}
+
 impl JwkCacheEntryLock {
    async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
        JwkRenewalPermit::acquire_permit(self).await
@@ -102,7 +79,6 @@ impl JwkCacheEntryLock {
        &self,
        _permit: JwkRenewalPermit<'_>,
        client: &reqwest::Client,
-        role_name: RoleName,
        auth_rules: &F,
    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
        // double check that no one beat us to updating the cache.
@@ -115,19 +91,20 @@ impl JwkCacheEntryLock {
            }
        }

-        let rules = auth_rules.fetch_auth_rules(role_name).await?;
-        let mut key_sets =
-            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
+        let rules = auth_rules.fetch_auth_rules().await?;
+        let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
+            rules.jwks_urls.len(),
+            ahash::RandomState::new(),
+        );
        // TODO(conrad): run concurrently
        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
-        for rule in rules {
-            let req = client.get(rule.jwks_url.clone());
+        for url in rules.jwks_urls {
+            let req = client.get(url.clone());
            // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
-            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
            match req.send().await.and_then(|r| r.error_for_status()) {
                // todo: should we re-insert JWKs if we want to keep this JWKs URL?
                // I expect these failures would be quite sparse.
-                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
+                Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
                Ok(r) => {
                    let resp: http::Response<reqwest::Body> = r.into();
                    match parse_json_body_with_limit::<jose_jwk::JwkSet>(
@@ -136,17 +113,9 @@ impl JwkCacheEntryLock {
                    )
                    .await
                    {
-                        Err(e) => {
-                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
-                        }
+                        Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
                        Ok(jwks) => {
-                            key_sets.insert(
-                                rule.id,
-                                KeySet {
-                                    jwks,
-                                    audience: rule.audience,
-                                },
-                            );
+                            key_sets.insert(url, jwks);
                        }
                    }
                }
@@ -164,9 +133,7 @@ impl JwkCacheEntryLock {

    async fn get_or_update_jwk_cache<F: FetchAuthRules>(
        self: &Arc<Self>,
-        ctx: &RequestMonitoring,
        client: &reqwest::Client,
-        role_name: RoleName,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
        let now = Instant::now();
@@ -174,20 +141,18 @@ impl JwkCacheEntryLock {

        // if we have no cached JWKs, try and get some
        let Some(cached) = guard else {
-            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self.renew_jwks(permit, client, fetch).await;
        };

        let last_update = now.duration_since(cached.last_retrieved);

        // check if the cached JWKs need updating.
        if last_update > MAX_RENEW {
-            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self.renew_jwks(permit, client, fetch).await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
@@ -199,7 +164,7 @@ impl JwkCacheEntryLock {
                let client = client.clone();
                let fetch = fetch.clone();
                tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
+                    if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
                    }
                });
@@ -213,10 +178,8 @@ impl JwkCacheEntryLock {

    async fn check_jwt<F: FetchAuthRules>(
        self: &Arc<Self>,
-        ctx: &RequestMonitoring,
-        jwt: &str,
+        jwt: String,
        client: &reqwest::Client,
-        role_name: RoleName,
        fetch: &F,
    ) -> Result<(), anyhow::Error> {
        // JWT compact form is defined to be
@@ -226,36 +189,36 @@ impl JwkCacheEntryLock {
        let (header_payload, signature) = jwt
            .rsplit_once(".")
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let (header, payload) = header_payload
+        let (header, _payload) = header_payload
            .split_once(".")
            .context("Provided authentication token is not a valid JWT encoding")?;

        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
+        let header = serde_json::from_slice::<JWTHeader>(&header)
            .context("Provided authentication token is not a valid JWT encoding")?;

        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;

        ensure!(header.typ == "JWT");
-        let kid = header.key_id.context("missing key id")?;
+        let kid = header.kid.context("missing key id")?;

-        let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
-            .await?;
+        let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
-        let (jwk, expected_audience) = loop {
-            match guard.find_jwk_and_audience(kid) {
+        let jwk = loop {
+            let jwk = guard
+                .key_sets
+                .values()
+                .flat_map(|jwks| &jwks.keys)
+                .find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
+
+            match jwk {
                Some(jwk) => break jwk,
                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
-                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-
                    let permit = self.acquire_permit().await;
-                    guard = self
-                        .renew_jwks(permit, client, role_name.clone(), fetch)
-                        .await?;
+                    guard = self.renew_jwks(permit, client, fetch).await?;
                }
                _ => {
                    bail!("jwk not found");
@@ -264,7 +227,7 @@ impl JwkCacheEntryLock {
        };

        ensure!(
-            jwk.is_supported(&header.algorithm),
+            jwk.is_supported(&header.alg),
            "signature algorithm not supported"
        );

@@ -278,60 +241,31 @@ impl JwkCacheEntryLock {
            key => bail!("unsupported key type {key:?}"),
        };

-        let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-
-        tracing::debug!(?payload, "JWT signature valid with claims");
-
-        match (expected_audience, payload.audience) {
-            // check the audience matches
-            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
-            // the audience is expected but is missing
-            (Some(_), None) => bail!("invalid JWT token audience"),
-            // we don't care for the audience field
-            (None, _) => {}
-        }
-
-        let now = SystemTime::now();
-
-        if let Some(exp) = payload.expiration {
-            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
-        }
-
-        if let Some(nbf) = payload.not_before {
-            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
-        }
+        // TODO(conrad): verify iss, exp, nbf, etc...

        Ok(())
    }
 }

 impl JwkCache {
-    pub async fn check_jwt<F: FetchAuthRules>(
+    pub async fn check_jwt(
        &self,
-        ctx: &RequestMonitoring,
-        endpoint: EndpointId,
-        role_name: RoleName,
-        fetch: &F,
-        jwt: &str,
+        endpoint: EndpointIdInt,
+        jwt: String,
    ) -> Result<(), anyhow::Error> {
        // try with just a read lock first
-        let key = (endpoint, role_name.clone());
-        let entry = self.map.get(&key).as_deref().map(Arc::clone);
+        let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
        let entry = match entry {
            Some(entry) => entry,
            None => {
                // acquire a write lock after to insert.
-                let entry = self.map.entry(key).or_default();
+                let entry = self.map.entry(endpoint).or_default();
                Arc::clone(&*entry)
            }
        };

-        entry
-            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
-            .await
+        let fetch = FetchAuthRulesFromCplane { endpoint };
+        entry.check_jwt(jwt, &self.client, &fetch).await
    }
 }

@@ -381,49 +315,13 @@ fn verify_rsa_signature(

 /// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
 #[derive(serde::Deserialize, serde::Serialize)]
-struct JwtHeader<'a> {
+struct JWTHeader<'a> {
    /// must be "JWT"
-    #[serde(rename = "typ")]
    typ: &'a str,
    /// must be a supported alg
-    #[serde(rename = "alg")]
-    algorithm: jose_jwa::Algorithm,
+    alg: jose_jwa::Algorithm,
    /// key id, must be provided for our usecase
-    #[serde(rename = "kid")]
-    key_id: Option<&'a str>,
-}
-
-/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
-#[derive(serde::Deserialize, serde::Serialize, Debug)]
-struct JwtPayload<'a> {
-    /// Audience - Recipient for which the JWT is intended
-    #[serde(rename = "aud")]
-    audience: Option<&'a str>,
-    /// Expiration - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
-    expiration: Option<SystemTime>,
-    /// Not before - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
-    not_before: Option<SystemTime>,
-
-    // the following entries are only extracted for the sake of debug logging.
-    /// Issuer of the JWT
-    #[serde(rename = "iss")]
-    issuer: Option<&'a str>,
-    /// Subject of the JWT (the user)
-    #[serde(rename = "sub")]
-    subject: Option<&'a str>,
-    /// Unique token identifier
-    #[serde(rename = "jti")]
-    jwt_id: Option<&'a str>,
-    /// Unique session identifier
-    #[serde(rename = "sid")]
-    session_id: Option<&'a str>,
-}
-
-fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
-    let d = <Option<u64>>::deserialize(d)?;
-    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
+    kid: Option<&'a str>,
 }

 struct JwkRenewalPermit<'a> {
@@ -442,7 +340,7 @@ impl JwkRenewalPermit<'_> {
        }
    }

-    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
+    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
        match from.lookup.acquire().await {
            Ok(permit) => {
                permit.forget();
@@ -454,7 +352,7 @@ impl JwkRenewalPermit<'_> {
        }
    }

-    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
+    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
        match from.lookup.try_acquire() {
            Ok(permit) => {
                permit.forget();
@@ -490,8 +388,6 @@ impl Drop for JwkRenewalPermit<'_> {

 #[cfg(test)]
 mod tests {
-    use crate::RoleName;
-
    use super::*;

    use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
@@ -535,10 +431,10 @@ mod tests {
    }

    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
-        let header = JwtHeader {
+        let header = JWTHeader {
            typ: "JWT",
-            algorithm: jose_jwa::Algorithm::Signing(sig),
-            key_id: Some(&kid),
+            alg: jose_jwa::Algorithm::Signing(sig),
+            kid: Some(&kid),
        };
        let body = typed_json::json! {{
            "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
@@ -628,40 +524,33 @@ mod tests {
        struct Fetch(SocketAddr);

        impl FetchAuthRules for Fetch {
-            async fn fetch_auth_rules(
-                &self,
-                _role_name: RoleName,
-            ) -> anyhow::Result<Vec<AuthRule>> {
-                Ok(vec![
-                    AuthRule {
-                        id: "foo".to_owned(),
-                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
-                        audience: None,
-                    },
-                    AuthRule {
-                        id: "bar".to_owned(),
-                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
-                        audience: None,
-                    },
-                ])
+            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+                Ok(AuthRules {
+                    jwks_urls: vec![
+                        format!("http://{}/foo", self.0).parse().unwrap(),
+                        format!("http://{}/bar", self.0).parse().unwrap(),
+                    ],
+                })
            }
        }

-        let role_name = RoleName::from("user");
-
        let jwk_cache = Arc::new(JwkCacheEntryLock::default());

-        for token in [jwt1, jwt2, jwt3, jwt4] {
-            jwk_cache
-                .check_jwt(
-                    &RequestMonitoring::test(),
-                    &token,
-                    &client,
-                    role_name.clone(),
-                    &Fetch(addr),
-                )
-                .await
-                .unwrap();
-        }
+        jwk_cache
+            .check_jwt(jwt1, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt2, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt3, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt4, &client, &Fetch(addr))
+            .await
+            .unwrap();
    }
 }
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -1,79 +0,0 @@
-use std::{collections::HashMap, net::SocketAddr};
-
-use anyhow::Context;
-use arc_swap::ArcSwapOption;
-
-use crate::{
-    compute::ConnCfg,
-    console::{
-        messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
-        NodeInfo,
-    },
-    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    RoleName,
-};
-
-use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
-
-pub struct LocalBackend {
-    pub jwks_cache: JwkCache,
-    pub postgres_addr: SocketAddr,
-    pub node_info: NodeInfo,
-}
-
-impl LocalBackend {
-    pub fn new(postgres_addr: SocketAddr) -> Self {
-        LocalBackend {
-            jwks_cache: JwkCache::default(),
-            postgres_addr,
-            node_info: NodeInfo {
-                config: {
-                    let mut cfg = ConnCfg::new();
-                    cfg.host(&postgres_addr.ip().to_string());
-                    cfg.port(postgres_addr.port());
-                    cfg
-                },
-                // TODO(conrad): make this better reflect compute info rather than endpoint info.
-                aux: MetricsAuxInfo {
-                    endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
-                    project_id: ProjectIdTag::get_interner().get_or_intern("local"),
-                    branch_id: BranchIdTag::get_interner().get_or_intern("local"),
-                    cold_start_info: ColdStartInfo::WarmCached,
-                },
-                allow_self_signed_compute: false,
-            },
-        }
-    }
-}
-
-#[derive(Clone, Copy)]
-pub struct StaticAuthRules;
-
-pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
-
-#[derive(Debug, Clone)]
-pub struct JwksRoleSettings {
-    pub roles: HashMap<RoleName, EndpointJwksResponse>,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
-}
-
-impl FetchAuthRules for StaticAuthRules {
-    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
-        let mappings = JWKS_ROLE_MAP.load();
-        let role_mappings = mappings
-            .as_deref()
-            .and_then(|m| m.roles.get(&role_name))
-            .context("JWKs settings for this role were not configured")?;
-        let mut rules = vec![];
-        for setting in &role_mappings.jwks {
-            rules.push(AuthRule {
-                id: setting.id.clone(),
-                jwks_url: setting.jwks_url.clone(),
-                audience: setting.jwt_audience.clone(),
-            });
-        }
-
-        Ok(rules)
-    }
-}
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -89,12 +89,10 @@ impl ComputeUserInfoMaybeEndpoint {
        sni: Option<&str>,
        common_names: Option<&HashSet<String>>,
    ) -> Result<Self, ComputeUserInfoParseError> {
+        use ComputeUserInfoParseError::*;
+
        // Some parameters are stored in the startup message.
-        let get_param = |key| {
-            params
-                .get(key)
-                .ok_or(ComputeUserInfoParseError::MissingKey(key))
-        };
+        let get_param = |key| params.get(key).ok_or(MissingKey(key));
        let user: RoleName = get_param("user")?.into();

        // Project name might be passed via PG's command-line options.
@@ -124,14 +122,11 @@ impl ComputeUserInfoMaybeEndpoint {
        let endpoint = match (endpoint_option, endpoint_from_domain) {
            // Invariant: if we have both project name variants, they should match.
            (Some(option), Some(domain)) if option != domain => {
-                Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
-                    domain,
-                    option,
-                }))
+                Some(Err(InconsistentProjectNames { domain, option }))
            }
            // Invariant: project name may not contain certain characters.
            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
-                false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
+                false => Err(MalformedProjectName(name)),
                true => Ok(name),
            }),
        }
@@ -191,7 +186,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
        impl<'de> serde::de::Visitor<'de> for StrVisitor {
            type Value = IpPattern;

-            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
            }

--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -1,316 +0,0 @@
-use std::{
-    net::SocketAddr,
-    path::{Path, PathBuf},
-    pin::pin,
-    sync::Arc,
-    time::Duration,
-};
-
-use anyhow::{bail, ensure};
-use dashmap::DashMap;
-use futures::{future::Either, FutureExt};
-use proxy::{
-    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
-    cancellation::CancellationHandlerMain,
-    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
-    console::{locks::ApiLocks, messages::JwksRoleMapping},
-    http::health_server::AppMetrics,
-    metrics::{Metrics, ThreadPoolMetrics},
-    rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
-    scram::threadpool::ThreadPool,
-    serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
-};
-
-project_git_version!(GIT_VERSION);
-project_build_tag!(BUILD_TAG);
-
-use clap::Parser;
-use tokio::{net::TcpListener, task::JoinSet};
-use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
-use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
-
-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
-
-/// Neon proxy/router
-#[derive(Parser)]
-#[command(version = GIT_VERSION, about)]
-struct LocalProxyCliArgs {
-    /// listen for incoming metrics connections on ip:port
-    #[clap(long, default_value = "127.0.0.1:7001")]
-    metrics: String,
-    /// listen for incoming http connections on ip:port
-    #[clap(long)]
-    http: String,
-    /// timeout for the TLS handshake
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    handshake_timeout: tokio::time::Duration,
-    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
-    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
-    connect_compute_lock: String,
-    #[clap(flatten)]
-    sql_over_http: SqlOverHttpArgs,
-    /// User rate limiter max number of requests per second.
-    ///
-    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
-    /// Can be given multiple times for different bucket sizes.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
-    user_rps_limit: Vec<RateBucketInfo>,
-    /// Whether the auth rate limiter actually takes effect (for testing)
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    auth_rate_limit_enabled: bool,
-    /// Authentication rate limiter max number of hashes per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
-    auth_rate_limit: Vec<RateBucketInfo>,
-    /// The IP subnet to use when considering whether two IP addresses are considered the same.
-    #[clap(long, default_value_t = 64)]
-    auth_rate_limit_ip_subnet: u8,
-    /// Whether to retry the connection to the compute node
-    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
-    connect_to_compute_retry: String,
-    /// Address of the postgres server
-    #[clap(long, default_value = "127.0.0.1:5432")]
-    compute: SocketAddr,
-    /// File address of the local proxy config file
-    #[clap(long, default_value = "./localproxy.json")]
-    config_path: PathBuf,
-}
-
-#[derive(clap::Args, Clone, Copy, Debug)]
-struct SqlOverHttpArgs {
-    /// How many connections to pool for each endpoint. Excess connections are discarded
-    #[clap(long, default_value_t = 200)]
-    sql_over_http_pool_max_total_conns: usize,
-
-    /// How long pooled connections should remain idle for before closing
-    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
-    sql_over_http_idle_timeout: tokio::time::Duration,
-
-    #[clap(long, default_value_t = 100)]
-    sql_over_http_client_conn_threshold: u64,
-
-    #[clap(long, default_value_t = 16)]
-    sql_over_http_cancel_set_shards: usize,
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let _logging_guard = proxy::logging::init().await?;
-    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
-    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
-
-    info!("Version: {GIT_VERSION}");
-    info!("Build_tag: {BUILD_TAG}");
-    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
-        revision: GIT_VERSION,
-        build_tag: BUILD_TAG,
-    });
-
-    let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
-        Ok(t) => Some(t),
-        Err(e) => {
-            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
-            None
-        }
-    };
-
-    let args = LocalProxyCliArgs::parse();
-    let config = build_config(&args)?;
-
-    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
-    let http_listener = TcpListener::bind(args.http).await?;
-    let shutdown = CancellationToken::new();
-
-    // todo: should scale with CU
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-        LeakyBucketConfig {
-            rps: 10.0,
-            max: 100.0,
-        },
-        16,
-    ));
-
-    refresh_config(args.config_path.clone()).await;
-
-    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
-        refresh_config(args.config_path.clone()).map(Ok)
-    }));
-    maintenance_tasks.spawn(proxy::http::health_server::task_main(
-        metrics_listener,
-        AppMetrics {
-            jemalloc,
-            neon_metrics,
-            proxy: proxy::metrics::Metrics::get(),
-        },
-    ));
-
-    let task = serverless::task_main(
-        config,
-        http_listener,
-        shutdown.clone(),
-        Arc::new(CancellationHandlerMain::new(
-            Arc::new(DashMap::new()),
-            None,
-            proxy::metrics::CancellationSource::Local,
-        )),
-        endpoint_rate_limiter,
-    );
-
-    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
-        // exit immediately on maintenance task completion
-        Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
-        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
-        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
-        // exit immediately on client task error
-        Either::Right((res, _)) => res?,
-    }
-
-    Ok(())
-}
-
-/// ProxyConfig is created at proxy startup, and lives forever.
-fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
-    let config::ConcurrencyLockOptions {
-        shards,
-        limiter,
-        epoch,
-        timeout,
-    } = args.connect_compute_lock.parse()?;
-    info!(
-        ?limiter,
-        shards,
-        ?epoch,
-        "Using NodeLocks (connect_compute)"
-    );
-    let connect_compute_locks = ApiLocks::new(
-        "connect_compute_lock",
-        limiter,
-        shards,
-        timeout,
-        epoch,
-        &Metrics::get().proxy.connect_compute_lock,
-    )?;
-
-    let http_config = HttpConfig {
-        accept_websockets: false,
-        pool_options: GlobalConnPoolOptions {
-            gc_epoch: Duration::from_secs(60),
-            pool_shards: 2,
-            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
-            opt_in: false,
-
-            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
-            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
-        },
-        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
-        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
-    };
-
-    Ok(Box::leak(Box::new(ProxyConfig {
-        tls_config: None,
-        auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
-            LocalBackend::new(args.compute),
-        )),
-        metric_collection: None,
-        allow_self_signed_compute: false,
-        http_config,
-        authentication_config: AuthenticationConfig {
-            thread_pool: ThreadPool::new(0),
-            scram_protocol_timeout: Duration::from_secs(10),
-            rate_limiter_enabled: false,
-            rate_limiter: BucketRateLimiter::new(vec![]),
-            rate_limit_ip_subnet: 64,
-        },
-        require_client_ip: false,
-        handshake_timeout: Duration::from_secs(10),
-        region: "local".into(),
-        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
-        connect_compute_locks,
-        connect_to_compute_retry_config: RetryConfig::parse(
-            RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES,
-        )?,
-    })))
-}
-
-async fn refresh_config(path: PathBuf) {
-    match refresh_config_inner(&path).await {
-        Ok(()) => {}
-        Err(e) => {
-            error!(error=?e, ?path, "could not read config file");
-        }
-    }
-}
-
-async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
-    let bytes = tokio::fs::read(&path).await?;
-    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
-
-    let mut settings = None;
-
-    for mapping in data.roles.values_mut() {
-        for jwks in &mut mapping.jwks {
-            ensure!(
-                jwks.jwks_url.has_authority()
-                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
-                "Invalid JWKS url. Must be HTTP",
-            );
-
-            ensure!(
-                jwks.jwks_url
-                    .host()
-                    .is_some_and(|h| h != url::Host::Domain("")),
-                "Invalid JWKS url. No domain listed",
-            );
-
-            // clear username, password and ports
-            jwks.jwks_url.set_username("").expect(
-                "url can be a base and has a valid host and is not a file. should not error",
-            );
-            jwks.jwks_url.set_password(None).expect(
-                "url can be a base and has a valid host and is not a file. should not error",
-            );
-            // local testing is hard if we need to have a specific restricted port
-            if cfg!(not(feature = "testing")) {
-                jwks.jwks_url.set_port(None).expect(
-                    "url can be a base and has a valid host and is not a file. should not error",
-                );
-            }
-
-            // clear query params
-            jwks.jwks_url.set_fragment(None);
-            jwks.jwks_url.query_pairs_mut().clear().finish();
-
-            if jwks.jwks_url.scheme() != "https" {
-                // local testing is hard if we need to set up https support.
-                if cfg!(not(feature = "testing")) {
-                    jwks.jwks_url
-                        .set_scheme("https")
-                        .expect("should not error to set the scheme to https if it was http");
-                } else {
-                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
-                }
-            }
-
-            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
-            ensure!(
-                *pr == jwks.project_id,
-                "inconsistent project IDs configured"
-            );
-            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
-        }
-    }
-
-    if let Some((project_id, branch_id)) = settings {
-        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
-            roles: data.roles,
-            project_id,
-            branch_id,
-        })));
-    }
-
-    Ok(())
-}
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> {
        proxy_listener,
        cancellation_token.clone(),
    ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
-        Ok(())
-    }));
+    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));

    // the signal task cant ever succeed.
    // the main task can error, or can succeed on cancellation.
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -148,7 +148,7 @@ struct ProxyCliArgs {
    disable_dynamic_rate_limiter: bool,
    /// Endpoint rate limiter max number of requests per second.
    ///
-    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
+    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
    /// Can be given multiple times for different bucket sizes.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    endpoint_rps_limit: Vec<RateBucketInfo>,
@@ -173,6 +173,9 @@ struct ProxyCliArgs {
    /// cache for `role_secret` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    role_secret_cache: String,
+    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    disable_ip_check_for_http: bool,
    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
@@ -447,10 +450,7 @@ async fn main() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(
-        cancellation_token.clone(),
-        || async { Ok(()) },
-    ));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
@@ -661,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
-        accept_websockets: true,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
    type LookupInfo<Key> = C::LookupInfo<Key>;

    fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
-        C::invalidate(self, info);
+        C::invalidate(self, info)
    }
 }

--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
    type LookupInfo<Key> = LookupInfo<Key>;

    fn invalidate(&self, info: &Self::LookupInfo<K>) {
-        self.invalidate_raw(info);
+        self.invalidate_raw(info)
    }
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Christian Schwarz	7c691fc87f	remove "next" info_span!, it showed up in flamegraphs (not too meaningful though)	2024-08-16 14:40:33 +00:00
Christian Schwarz	fb2d0131e2	add JustReadBoth validation mode	2024-08-16 13:34:01 +00:00
Christian Schwarz	4205300105	revert preceding two WIP parallel mode efforts Not worth the trouble.	2024-08-16 12:26:30 +02:00
Christian Schwarz	edd06c96fa	more WIP to try to get parallel mode	2024-08-16 12:26:30 +02:00
Christian Schwarz	58b857dff2	WIP: parallel mode	2024-08-16 12:26:30 +02:00
Christian Schwarz	26206e8d8a	option for concurrent IO	2024-08-16 12:26:30 +02:00