Merge branch 'yuchen/direct-io-aligned-alloc' into yuchen/direct-io-aligned-alloc-usage-wip

add with_capacity_aligned_zeroed and leak
Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2026-05-31 03:50:37 +00:00 · 2024-10-07 13:08:55 -04:00 · 2024-10-07 13:06:00 -04:00 · 2024-10-01 16:47:36 -04:00 · 2024-10-01 12:31:55 +00:00 · 2024-10-01 12:16:42 +00:00
86 changed files with 2988 additions and 1817 deletions
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps
 on:
  workflow_call:
    # no inputs needed
-    
+
 defaults:
  run:
    shell: bash -euxo pipefail {0}

 jobs:
  setup-databases:
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
    strategy:
      fail-fast: false
      matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] 
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
        database: [ clickbench, tpch, userexample ]
-  
+
    env:
      LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
      PLATFORM: ${{ matrix.platform }}
@@ -23,7 +27,10 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -32,13 +39,13 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neon)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          aws-rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
            ;;
          aws-aurora-serverless-v2-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} 
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}"
@@ -46,10 +53,17 @@ jobs:
            ;;
        esac

-        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

    - uses: actions/checkout@v4

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -57,23 +71,23 @@ jobs:
        path: /tmp/neon/
        prefix: latest

-    # we create a table that has one row for each database that we want to restore with the status whether the restore is done    
+    # we create a table that has one row for each database that we want to restore with the status whether the restore is done
    - name: Create benchmark_restore_status table if it does not exist
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
        DATABASE_NAME: ${{ matrix.database }}
-      # to avoid a race condition of multiple jobs trying to create the table at the same time, 
+      # to avoid a race condition of multiple jobs trying to create the table at the same time,
      # we use an advisory lock
      run: |
        ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
-        SELECT pg_advisory_lock(4711);  
+        SELECT pg_advisory_lock(4711);
        CREATE TABLE IF NOT EXISTS benchmark_restore_status (
        databasename text primary key,
        restore_done boolean
        );
        SELECT pg_advisory_unlock(4711);
        "
-    
+
    - name: Check if restore is already done
      id: check-restore-done
      env:
@@ -107,7 +121,7 @@ jobs:
        DATABASE_NAME: ${{ matrix.database }}
      run: |
        mkdir -p /tmp/dumps
-        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ 
+        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/

    - name: Replace database name in connection string
      if: steps.check-restore-done.outputs.skip != 'true'
@@ -126,17 +140,17 @@ jobs:
        else
          new_connstr="${base_connstr}/${DATABASE_NAME}"
        fi
-        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT  
+        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT

    - name: Restore dump
      if: steps.check-restore-done.outputs.skip != 'true'
      env:
        DATABASE_NAME: ${{ matrix.database }}
        DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
-        # the following works only with larger computes: 
+        # the following works only with larger computes:
        # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
        # we add the || true because:
-        # the dumps were created with Neon and contain neon extensions that are not 
+        # the dumps were created with Neon and contain neon extensions that are not
        # available in RDS, so we will always report an error, but we can ignore it
      run: |
        ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -236,9 +236,7 @@ jobs:

          # run pageserver tests with different settings
          for io_engine in std-fs tokio-epoll-uring ; do
-            for io_buffer_alignment in 0 1 512 ; do
-              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-            done
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
          done

          # Run separate tests for real S3
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -12,7 +12,6 @@ on:
    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron:   '0 3 * * *' # run once a day, timezone is utc
-
  workflow_dispatch: # adds ability to run this manually
    inputs:
      region_id:
@@ -59,7 +58,7 @@ jobs:
    permissions:
      contents: write
      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials
    strategy:
      fail-fast: false
      matrix:
@@ -68,12 +67,10 @@ jobs:
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -86,7 +83,10 @@ jobs:

    runs-on: ${{ matrix.RUNNER }}
    container:
-      image: ${{ matrix.IMAGE }}
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -164,6 +164,10 @@ jobs:

  replication-tests:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 16
@@ -174,12 +178,21 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
    - uses: actions/checkout@v4

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -267,7 +280,7 @@ jobs:
        region_id_default=${{ env.DEFAULT_REGION_ID }}
        runner_default='["self-hosted", "us-east-2", "x64"]'
        runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
+        image_default="neondatabase/build-tools:pinned"
        matrix='{
          "pg_version" : [
            16
@@ -344,7 +357,7 @@ jobs:
    permissions:
      contents: write
      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials

    strategy:
      fail-fast: false
@@ -371,7 +384,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials # necessary on Azure runners
+    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: eu-central-1
@@ -492,17 +505,15 @@ jobs:
    permissions:
      contents: write
      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+      id-token: write # aws-actions/configure-aws-credentials
    strategy:
      fail-fast: false
      matrix:
        include:
          - PLATFORM: "neonvm-captest-pgvector"
            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - PLATFORM: "azure-captest-pgvector"
            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -511,13 +522,16 @@ jobs:
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
-      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
+
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

    runs-on: ${{ matrix.RUNNER }}
    container:
-      image: ${{ matrix.IMAGE }}
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -527,17 +541,26 @@ jobs:
    # instead of using Neon artifacts containing pgbench
    - name: Install postgresql-16 where pytest expects it
      run: |
+        # Just to make it easier to test things locally on macOS (with arm64)
+        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
+
        cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
+        dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
+
        mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
-        ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib 
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
+        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
+
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
+        export LD_LIBRARY_PATH
+        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
+
        /tmp/neon/pg_install/v16/bin/pgbench --version
        /tmp/neon/pg_install/v16/bin/psql --version

@@ -559,7 +582,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
+    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: eu-central-1
@@ -620,6 +643,10 @@ jobs:
    # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
    # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
    needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]

    strategy:
@@ -638,12 +665,22 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
    - uses: actions/checkout@v4

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -714,6 +751,10 @@ jobs:
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
    needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]

    strategy:
@@ -731,12 +772,22 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
    - uses: actions/checkout@v4

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -806,6 +857,10 @@ jobs:

  user-examples-compare:
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
    needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]

    strategy:
@@ -822,12 +877,22 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
    - uses: actions/checkout@v4

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1190,10 +1190,9 @@ jobs:

              files_to_promote+=("s3://${BUCKET}/${s3_key}")

-              # TODO Add v17
-              for pg_version in v14 v15 v16; do
+              for pg_version in v14 v15 v16 v17; do
                # We run less tests for debug builds, so we don't need to promote them
-                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
+                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then
                  continue
                fi

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1321,6 +1321,7 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "futures",
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
@@ -4296,6 +4297,7 @@ dependencies = [
 "camino-tempfile",
 "chrono",
 "clap",
+ "compute_api",
 "consumption_metrics",
 "dashmap",
 "ecdsa 0.16.9",
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home
 SHELL ["/bin/bash", "-c"]

 # System deps
+#
+# 'gdb' is included so that we get backtraces of core dumps produced in
+# regression tests
 RUN set -e \
    && apt update \
    && apt install -y \
@@ -24,6 +27,7 @@ RUN set -e \
        cmake \
        curl \
        flex \
+        gdb \
        git \
        gnupg \
        gzip \
--- a/compute/vm-image-spec.yaml
+++ b/compute/vm-image-spec.yaml
@@ -11,6 +11,10 @@ commands:
    user: root
    sysvInitAction: sysinit
    shell: 'chmod 711 /neonvm/bin/resize-swap'
+  - name: chmod-set-disk-quota
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
@@ -30,11 +34,12 @@ commands:
 shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
-  - filename: compute_ctl-resize-swap
+  - filename: compute_ctl-sudoers
    content: |
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
-      # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
+      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
+      # regardless of hostname (ALL)
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
@@ -100,7 +105,7 @@ merge: |
      && apt install --no-install-recommends -y \
             sudo \
      && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-  COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
+  COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers

  COPY cgconfig.conf /etc/cgconfig.conf

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
+use compute_tools::disk_quota::set_disk_quota;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
+    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");

    Ok(ProcessCliResult {
        connstr,
@@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
        spec_json,
        spec_path,
        resize_swap_on_bind,
+        set_disk_quota_for_fs,
    })
 }

@@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> {
    spec_json: Option<&'clap String>,
    spec_path: Option<&'clap String>,
    resize_swap_on_bind: bool,
+    set_disk_quota_for_fs: Option<&'clap String>,
 }

 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -293,6 +297,7 @@ fn wait_spec(
        pgbin,
        ext_remote_storage,
        resize_swap_on_bind,
+        set_disk_quota_for_fs,
        http_port,
        ..
    }: ProcessCliResult,
@@ -373,6 +378,7 @@ fn wait_spec(
        compute,
        http_port,
        resize_swap_on_bind,
+        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
    })
 }

@@ -381,6 +387,7 @@ struct WaitSpecResult {
    // passed through from ProcessCliResult
    http_port: u16,
    resize_swap_on_bind: bool,
+    set_disk_quota_for_fs: Option<String>,
 }

 fn start_postgres(
@@ -390,6 +397,7 @@ fn start_postgres(
        compute,
        http_port,
        resize_swap_on_bind,
+        set_disk_quota_for_fs,
    }: WaitSpecResult,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
@@ -403,6 +411,7 @@ fn start_postgres(
    );
    // before we release the mutex, fetch the swap size (if any) for later.
    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
+    let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
    drop(state);

    // Launch remaining service threads
@@ -422,8 +431,8 @@ fn start_postgres(
        // OOM-killed during startup because swap wasn't available yet.
        match resize_swap(size_bytes) {
            Ok(()) => {
-                let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_gib, "resized swap");
+                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%size_bytes, %size_mib, "resized swap");
            }
            Err(err) => {
                let err = err.context("failed to resize swap");
@@ -432,10 +441,29 @@ fn start_postgres(
                // Mark compute startup as failed; don't try to start postgres, and report this
                // error to the control plane when it next asks.
                prestartup_failed = true;
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{err:?}"));
-                state.status = ComputeStatus::Failed;
-                compute.state_changed.notify_all();
+                compute.set_failed_status(err);
+                delay_exit = true;
+            }
+        }
+    }
+
+    // Set disk quota if the compute spec says so
+    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
+        (disk_quota_bytes, set_disk_quota_for_fs)
+    {
+        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
+            Ok(()) => {
+                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%disk_quota_bytes, %size_mib, "set disk quota");
+            }
+            Err(err) => {
+                let err = err.context("failed to set disk quota");
+                error!("{err:#}");
+
+                // Mark compute startup as failed; don't try to start postgres, and report this
+                // error to the control plane when it next asks.
+                prestartup_failed = true;
+                compute.set_failed_status(err);
                delay_exit = true;
            }
        }
@@ -450,16 +478,7 @@ fn start_postgres(
            Ok(pg) => Some(pg),
            Err(err) => {
                error!("could not start the compute node: {:#}", err);
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{:?}", err));
-                state.status = ComputeStatus::Failed;
-                // Notify others that Postgres failed to start. In case of configuring the
-                // empty compute, it's likely that API handler is still waiting for compute
-                // state change. With this we will notify it that compute is in Failed state,
-                // so control plane will know about it earlier and record proper error instead
-                // of timeout.
-                compute.state_changed.notify_all();
-                drop(state); // unlock
+                compute.set_failed_status(err);
                delay_exit = true;
                None
            }
@@ -750,6 +769,11 @@ fn cli() -> clap::Command {
                .long("resize-swap-on-bind")
                .action(clap::ArgAction::SetTrue),
        )
+        .arg(
+            Arg::new("set-disk-quota-for-fs")
+                .long("set-disk-quota-for-fs")
+                .value_name("SET_DISK_QUOTA_FOR_FS")
+        )
 }

 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32;
 use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
 use std::thread;
+use std::time::Duration;
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -305,6 +306,13 @@ impl ComputeNode {
        self.state_changed.notify_all();
    }

+    pub fn set_failed_status(&self, err: anyhow::Error) {
+        let mut state = self.state.lock().unwrap();
+        state.error = Some(format!("{err:?}"));
+        state.status = ComputeStatus::Failed;
+        self.state_changed.notify_all();
+    }
+
    pub fn get_status(&self) -> ComputeStatus {
        self.state.lock().unwrap().status
    }
@@ -710,7 +718,7 @@ impl ComputeNode {
        info!("running initdb");
        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
        Command::new(initdb_bin)
-            .args(["-D", pgdata])
+            .args(["--pgdata", pgdata])
            .output()
            .expect("cannot start initdb process");

@@ -1123,6 +1131,9 @@ impl ComputeNode {
    //
    // Use that as a default location and pattern, except macos where core dumps are written
    // to /cores/ directory by default.
+    //
+    // With default Linux settings, the core dump file is called just "core", so check for
+    // that too.
    pub fn check_for_core_dumps(&self) -> Result<()> {
        let core_dump_dir = match std::env::consts::OS {
            "macos" => Path::new("/cores/"),
@@ -1134,8 +1145,17 @@ impl ComputeNode {
        let files = fs::read_dir(core_dump_dir)?;
        let cores = files.filter_map(|entry| {
            let entry = entry.ok()?;
-            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
-            Some(entry.path())
+
+            let is_core_dump = match entry.file_name().to_str()? {
+                n if n.starts_with("core.") => true,
+                "core" => true,
+                _ => false,
+            };
+            if is_core_dump {
+                Some(entry.path())
+            } else {
+                None
+            }
        });

        // Print backtrace for each core dump
@@ -1386,6 +1406,36 @@ LIMIT 100",
        }
        Ok(remote_ext_metrics)
    }
+
+    /// Waits until current thread receives a state changed notification and
+    /// the pageserver connection strings has changed.
+    ///
+    /// The operation will time out after a specified duration.
+    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
+        let state = self.state.lock().unwrap();
+        let old_pageserver_connstr = state
+            .pspec
+            .as_ref()
+            .expect("spec must be set")
+            .pageserver_connstr
+            .clone();
+        let mut unchanged = true;
+        let _ = self
+            .state_changed
+            .wait_timeout_while(state, duration, |s| {
+                let pageserver_connstr = &s
+                    .pspec
+                    .as_ref()
+                    .expect("spec must be set")
+                    .pageserver_connstr;
+                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                unchanged
+            })
+            .unwrap();
+        if !unchanged {
+            info!("Pageserver config changed");
+        }
+    }
 }

 pub fn forward_termination_signal() {
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -11,9 +11,17 @@ use crate::compute::ComputeNode;
 fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
-        let state = compute.state.lock().unwrap();
-        let mut state = compute.state_changed.wait(state).unwrap();
+        let mut state = compute.state.lock().unwrap();

+        // We have to re-check the status after re-acquiring the lock because it could be that
+        // the status has changed while we were waiting for the lock, and we might not need to
+        // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
+        // we are waiting for a condition variable that will never be signaled.
+        if state.status != ComputeStatus::ConfigurationPending {
+            state = compute.state_changed.wait(state).unwrap();
+        }
+
+        // Re-check the status after waking up
        if state.status == ComputeStatus::ConfigurationPending {
            info!("got configuration request");
            state.status = ComputeStatus::Configuration;
--- a/compute_tools/src/disk_quota.rs
+++ b/compute_tools/src/disk_quota.rs
@@ -0,0 +1,25 @@
+use anyhow::Context;
+
+pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
+
+/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
+/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
+pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
+    let size_kb = size_bytes / 1024;
+    // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
+    let child_result = std::process::Command::new("/usr/bin/sudo")
+        .arg(DISK_QUOTA_BIN)
+        .arg(size_kb.to_string())
+        .arg(fs_mountpoint)
+        .spawn();
+
+    child_result
+        .context("spawn() failed")
+        .and_then(|mut child| child.wait().context("wait() failed"))
+        .and_then(|status| match status.success() {
+            true => Ok(()),
+            false => Err(anyhow::anyhow!("process exited with {status}")),
+        })
+        // wrap any prior error with the overall context that we couldn't run the command
+        .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
+}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -10,6 +10,7 @@ pub mod http;
 pub mod logger;
 pub mod catalog;
 pub mod compute;
+pub mod disk_quota;
 pub mod extension_server;
 pub mod lsn_lease;
 mod migration;
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
            .max(valid_duration / 2);

        info!(
-            "Succeeded, sleeping for {} seconds",
+            "Request succeeded, sleeping for {} seconds",
            sleep_duration.as_secs()
        );
-        thread::sleep(sleep_duration);
+        compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
    }
 }

@@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry(
                .map(|connstr| {
                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
                    if let Some(storage_auth_token) = &spec.storage_auth_token {
-                        info!("Got storage auth token from spec file");
                        config.password(storage_auth_token.clone());
-                    } else {
-                        info!("Storage auth token not set");
                    }
                    config
                })
@@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry(
                bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
            }
            Err(e) => {
-                warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
+                warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");

-                thread::sleep(Duration::from_millis(retry_period_ms as u64));
+                compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
+                    retry_period_ms as u64,
+                ));
                retry_period_ms *= 1.5;
                retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
            }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,6 +9,7 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
--- a/control_plane/src/branch_mappings.rs
+++ b/control_plane/src/branch_mappings.rs
@@ -0,0 +1,94 @@
+//! Branch mappings for convenience
+
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+
+use anyhow::{bail, Context};
+use serde::{Deserialize, Serialize};
+
+use utils::id::{TenantId, TenantTimelineId, TimelineId};
+
+/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct BranchMappings {
+    /// Default tenant ID to use with the 'neon_local' command line utility, when
+    /// --tenant_id is not explicitly specified. This comes from the branches.
+    pub default_tenant_id: Option<TenantId>,
+
+    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
+    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
+    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
+}
+
+impl BranchMappings {
+    pub fn register_branch_mapping(
+        &mut self,
+        branch_name: String,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        let existing_values = self.mappings.entry(branch_name.clone()).or_default();
+
+        let existing_ids = existing_values
+            .iter()
+            .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
+
+        if let Some((_, old_timeline_id)) = existing_ids {
+            if old_timeline_id == &timeline_id {
+                Ok(())
+            } else {
+                bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
+            }
+        } else {
+            existing_values.push((tenant_id, timeline_id));
+            Ok(())
+        }
+    }
+
+    pub fn get_branch_timeline_id(
+        &self,
+        branch_name: &str,
+        tenant_id: TenantId,
+    ) -> Option<TimelineId> {
+        // If it looks like a timeline ID, return it as it is
+        if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
+            return Some(timeline_id);
+        }
+
+        self.mappings
+            .get(branch_name)?
+            .iter()
+            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
+            .map(|&(_, timeline_id)| timeline_id)
+            .map(TimelineId::from)
+    }
+
+    pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
+        self.mappings
+            .iter()
+            .flat_map(|(name, tenant_timelines)| {
+                tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
+                    (TenantTimelineId::new(tenant_id, timeline_id), name.clone())
+                })
+            })
+            .collect()
+    }
+
+    pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
+        let content = &toml::to_string_pretty(self)?;
+        fs::write(path, content).with_context(|| {
+            format!(
+                "Failed to write branch information into path '{}'",
+                path.display()
+            )
+        })
+    }
+
+    pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
+        let branches_file_contents = fs::read_to_string(path)?;
+        Ok(toml::from_str(branches_file_contents.as_str())?)
+    }
+}
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -561,6 +561,7 @@ impl Endpoint {
            operation_uuid: None,
            features: self.features.clone(),
            swap_size_bytes: None,
+            disk_quota_bytes: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -113,7 +113,7 @@ impl SafekeeperNode {

    pub async fn start(
        &self,
-        extra_opts: Vec<String>,
+        extra_opts: &[String],
        retry_timeout: &Duration,
    ) -> anyhow::Result<()> {
        print!(
@@ -196,7 +196,7 @@ impl SafekeeperNode {
            ]);
        }

-        args.extend(extra_opts);
+        args.extend_from_slice(extra_opts);

        background_process::start_process(
            &format!("safekeeper-{id}"),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -347,7 +347,7 @@ impl StorageController {

            if !tokio::fs::try_exists(&pg_data_path).await? {
                let initdb_args = [
-                    "-D",
+                    "--pgdata",
                    pg_data_path.as_ref(),
                    "--username",
                    &username(),
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -50,6 +50,16 @@ pub struct ComputeSpec {
    #[serde(default)]
    pub swap_size_bytes: Option<u64>,

+    /// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
+    /// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
+    /// spec is first received.
+    ///
+    /// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
+    /// spec generation doesn't need to be aware of the actual compute it's running on, while
+    /// guaranteeing gradual rollout of disk quota.
+    #[serde(default)]
+    pub disk_quota_bytes: Option<u64>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -268,6 +278,22 @@ pub struct GenericOption {
 /// declare a `trait` on it.
 pub type GenericOptions = Option<Vec<GenericOption>>;

+/// Configured the local-proxy application with the relevant JWKS and roles it should
+/// use for authorizing connect requests using JWT.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct LocalProxySpec {
+    pub jwks: Vec<JwksSettings>,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct JwksSettings {
+    pub id: String,
+    pub role_names: Vec<String>,
+    pub jwks_url: String,
+    pub provider_name: String,
+    pub jwt_audience: Option<String>,
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,7 +104,7 @@ pub struct ConfigToml {
    pub image_compression: ImageCompressionAlgorithm,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
+    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    pub io_buffer_alignment: usize,
 }

@@ -381,7 +381,7 @@ impl Default for ConfigToml {
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
-            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
+            virtual_file_io_mode: None,

            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::path::PathBuf;
-
    #[derive(
        Copy,
        Clone,
@@ -994,50 +992,49 @@ pub mod virtual_file {
    }

    /// Direct IO modes for a pageserver.
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-    pub enum DirectIoMode {
-        /// Direct IO disabled (uses usual buffered IO).
-        #[default]
-        Disabled,
-        /// Direct IO disabled (performs checks and perf simulations).
-        Evaluate {
-            /// Alignment check level
-            alignment_check: DirectIoAlignmentCheckLevel,
-            /// Latency padded for performance simulation.
-            latency_padding: DirectIoLatencyPadding,
-        },
-        /// Direct IO enabled.
-        Enabled {
-            /// Actions to perform on alignment error.
-            on_alignment_error: DirectIoOnAlignmentErrorAction,
-        },
+    #[derive(
+        Copy,
+        Clone,
+        PartialEq,
+        Eq,
+        Hash,
+        strum_macros::EnumString,
+        strum_macros::Display,
+        serde_with::DeserializeFromStr,
+        serde_with::SerializeDisplay,
+        Debug,
+    )]
+    #[strum(serialize_all = "kebab-case")]
+    #[repr(u8)]
+    pub enum IoMode {
+        /// Uses buffered IO.
+        Buffered,
+        /// Uses direct IO, error out if the operation fails.
+        #[cfg(target_os = "linux")]
+        Direct,
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoAlignmentCheckLevel {
-        #[default]
-        Error,
-        Log,
-        None,
+    impl IoMode {
+        pub const fn preferred() -> Self {
+            if cfg!(target_os = "linux") {
+                Self::Direct
+            } else {
+                Self::Buffered
+            }
+        }
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoOnAlignmentErrorAction {
-        Error,
-        #[default]
-        FallbackToBuffered,
-    }
+    impl TryFrom<u8> for IoMode {
+        type Error = u8;

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "type", rename_all = "kebab-case")]
-    pub enum DirectIoLatencyPadding {
-        /// Pad virtual file operations with IO to a fake file.
-        FakeFileRW { path: PathBuf },
-        #[default]
-        None,
+        fn try_from(value: u8) -> Result<Self, Self::Error> {
+            Ok(match value {
+                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
+                #[cfg(target_os = "linux")]
+                v if v == (IoMode::Direct as u8) => IoMode::Direct,
+                x => return Err(x),
+            })
+        }
    }
 }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String {
 }

 fn log_query_error(query: &str, e: &QueryError) {
+    // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
    match e {
        QueryError::Disconnected(ConnectionError::Io(io_error)) => {
            if is_expected_io_error(io_error) {
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -93,9 +93,9 @@ impl Conf {
        );
        let output = self
            .new_pg_command("initdb")?
-            .arg("-D")
+            .arg("--pgdata")
            .arg(&self.datadir)
-            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
+            .args(["--username", "postgres", "--no-instructions", "--no-sync"])
            .output()?;
        debug!("initdb output: {:?}", output);
        ensure!(
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,12 +164,10 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    page_cache::init(conf.page_cache_size);
+
+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(16384, virtual_file::io_engine_for_bench(), align);
+    page_cache::init(conf.page_cache_size, align);

    {
        let mut group = c.benchmark_group("ingest-small-values");
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -550,6 +550,19 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

+    /// Configs io mode at runtime.
+    pub async fn put_io_mode(
+        &self,
+        mode: &pageserver_api::models::virtual_file::IoMode,
+    ) -> Result<()> {
+        let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, mode)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
@@ -736,4 +749,22 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
+
+    pub async fn timeline_init_lsn_lease(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease",
+            self.mgmt_api_endpoint,
+        );
+
+        self.request(Method::POST, &uri, LsnLeaseRequest { lsn })
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -151,13 +151,10 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    pageserver::page_cache::init(100);
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    pageserver::page_cache::init(100, align);

    let mut total_delta_layers = 0usize;
    let mut total_image_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,8 +59,9 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
-    page_cache::init(100);
+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    page_cache::init(100, align);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
@@ -190,12 +191,10 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            );
-            pageserver::page_cache::init(100);
+            let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+            pageserver::page_cache::init(100, align);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -205,12 +205,9 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    page_cache::init(100);
+    let align = DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    page_cache::init(100, align);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -63,6 +63,10 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_alignment: Option<usize>,

+    /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
+    #[clap(long)]
+    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
+
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -133,6 +137,10 @@ async fn main_impl(
        mgmt_api_client.put_io_alignment(align).await?;
    }

+    if let Some(mode) = &args.set_io_mode {
+        mgmt_api_client.put_io_mode(mode).await?;
+    }
+
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::config::PageserverIdentity;
-use pageserver::control_plane_client::ControlPlaneClient;
+use pageserver::controller_upcall_client::ControllerUpcallClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
@@ -125,7 +125,7 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
+    info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings");
    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    // The tenants directory contains all the pageserver local disk state.
@@ -173,7 +173,7 @@ fn main() -> anyhow::Result<()> {
        conf.virtual_file_io_engine,
        conf.io_buffer_alignment,
    );
-    page_cache::init(conf.page_cache_size);
+    page_cache::init(conf.page_cache_size, conf.io_buffer_alignment);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

@@ -396,7 +396,7 @@ fn start_pageserver(
    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
        remote_storage.clone(),
-        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        ControllerUpcallClient::new(conf, &shutdown_pageserver),
        conf,
    );
    if let Some(deletion_workers) = deletion_workers {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,7 +174,7 @@ pub struct PageServerConf {
    pub l0_flush: crate::l0_flush::L0FlushConfig,

    /// Direct IO settings
-    pub virtual_file_direct_io: virtual_file::DirectIoMode,
+    pub virtual_file_io_mode: virtual_file::IoMode,

    pub io_buffer_alignment: usize,
 }
@@ -325,7 +325,7 @@ impl PageServerConf {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
-            virtual_file_direct_io,
+            virtual_file_io_mode,
            concurrent_tenant_warmup,
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
@@ -368,7 +368,6 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
-            virtual_file_direct_io,
            io_buffer_alignment,

            // ------------------------------------------------------------
@@ -408,6 +407,7 @@ impl PageServerConf {
            l0_flush: l0_flush
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
+            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
 use pageserver_api::config::NodeMetadata;

-/// The Pageserver's client for using the control plane API: this is a small subset
-/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub struct ControlPlaneClient {
+/// The Pageserver's client for using the storage controller upcall API: this is a small API
+/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
+///
+/// The server presenting this API may either be the storage controller or some other
+/// service (such as the Neon control plane) providing a store of generation numbers.
+pub struct ControllerUpcallClient {
    http_client: reqwest::Client,
    base_url: Url,
    node_id: NodeId,
@@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi {
    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
 }

-impl ControlPlaneClient {
+impl ControllerUpcallClient {
    /// A None return value indicates that the input `conf` object does not have control
    /// plane API enabled.
    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
@@ -114,7 +117,7 @@ impl ControlPlaneClient {
    }
 }

-impl ControlPlaneGenerationsApi for ControlPlaneClient {
+impl ControlPlaneGenerationsApi for ControllerUpcallClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn re_attach(
        &self,
@@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            .join("validate")
            .expect("Failed to build validate path");

-        let request = ValidateRequest {
-            tenants: tenants
-                .into_iter()
-                .map(|(id, gen)| ValidateRequestTenant {
-                    id,
-                    gen: gen
-                        .into()
-                        .expect("Generation should always be valid for a Tenant doing deletions"),
-                })
-                .collect(),
-        };
+        // When sending validate requests, break them up into chunks so that we
+        // avoid possible edge cases of generating any HTTP requests that
+        // require database I/O across many thousands of tenants.
+        let mut result: HashMap<TenantShardId, bool> = HashMap::with_capacity(tenants.len());
+        for tenant_chunk in (tenants).chunks(128) {
+            let request = ValidateRequest {
+                tenants: tenant_chunk
+                    .iter()
+                    .map(|(id, generation)| ValidateRequestTenant {
+                        id: *id,
+                        gen: (*generation).into().expect(
+                            "Generation should always be valid for a Tenant doing deletions",
+                        ),
+                    })
+                    .collect(),
+            };

-        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
-        if self.cancel.is_cancelled() {
-            return Err(RetryForeverError::ShuttingDown);
+            failpoint_support::sleep_millis_async!(
+                "control-plane-client-validate-sleep",
+                &self.cancel
+            );
+            if self.cancel.is_cancelled() {
+                return Err(RetryForeverError::ShuttingDown);
+            }
+
+            let response: ValidateResponse =
+                self.retry_http_forever(&re_attach_path, request).await?;
+            for rt in response.tenants {
+                result.insert(rt.id, rt.valid);
+            }
        }

-        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
-
-        Ok(response
-            .tenants
-            .into_iter()
-            .map(|rt| (rt.id, rt.valid))
-            .collect())
+        Ok(result.into_iter().collect())
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -6,7 +6,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;

-use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::controller_upcall_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
@@ -622,7 +622,7 @@ impl DeletionQueue {
    /// If remote_storage is None, then the returned workers will also be None.
    pub fn new<C>(
        remote_storage: GenericRemoteStorage,
-        control_plane_client: Option<C>,
+        controller_upcall_client: Option<C>,
        conf: &'static PageServerConf,
    ) -> (Self, Option<DeletionQueueWorkers<C>>)
    where
@@ -662,7 +662,7 @@ impl DeletionQueue {
                    conf,
                    backend_rx,
                    executor_tx,
-                    control_plane_client,
+                    controller_upcall_client,
                    lsn_table.clone(),
                    cancel.clone(),
                ),
@@ -704,7 +704,7 @@ mod test {
    use tokio::task::JoinHandle;

    use crate::{
-        control_plane_client::RetryForeverError,
+        controller_upcall_client::RetryForeverError,
        repository::Key,
        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
    };
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -25,8 +25,8 @@ use tracing::info;
 use tracing::warn;

 use crate::config::PageServerConf;
-use crate::control_plane_client::ControlPlaneGenerationsApi;
-use crate::control_plane_client::RetryForeverError;
+use crate::controller_upcall_client::ControlPlaneGenerationsApi;
+use crate::controller_upcall_client::RetryForeverError;
 use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;

@@ -61,7 +61,7 @@ where
    tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    // Client for calling into control plane API for validation of deletes
-    control_plane_client: Option<C>,
+    controller_upcall_client: Option<C>,

    // DeletionLists which are waiting generation validation.  Not safe to
    // execute until [`validate`] has processed them.
@@ -94,7 +94,7 @@ where
        conf: &'static PageServerConf,
        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-        control_plane_client: Option<C>,
+        controller_upcall_client: Option<C>,
        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
        cancel: CancellationToken,
    ) -> Self {
@@ -102,7 +102,7 @@ where
            conf,
            rx,
            tx,
-            control_plane_client,
+            controller_upcall_client,
            lsn_table,
            pending_lists: Vec::new(),
            validated_lists: Vec::new(),
@@ -145,8 +145,8 @@ where
            return Ok(());
        }

-        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
-            match control_plane_client
+        let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
+            match controller_upcall_client
                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
                .await
            {
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
@@ -824,7 +825,7 @@ async fn get_lsn_by_timestamp_handler(

    let lease = if with_lease {
        timeline
-            .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
+            .init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
            .inspect_err(|_| {
                warn!("fail to grant a lease to {}", lsn);
            })
@@ -1692,9 +1693,18 @@ async fn lsn_lease_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
-    let result = timeline
-        .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
-        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
+
+    let result = async {
+        timeline
+            .init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
+            .map_err(|e| {
+                ApiError::InternalServerError(
+                    e.context(format!("invalid lsn lease request at {lsn}")),
+                )
+            })
+    }
+    .instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await?;

    json_response(StatusCode::OK, result)
 }
@@ -2372,6 +2382,16 @@ async fn put_io_alignment_handler(
    json_response(StatusCode::OK, ())
 }

+async fn put_io_mode_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let mode: IoMode = json_request(&mut r).await?;
+    crate::virtual_file::set_io_mode(mode);
+    json_response(StatusCode::OK, ())
+}
+
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -3062,6 +3082,7 @@ pub fn make_router(
        .put("/v1/io_alignment", |r| {
            api_handler(r, put_io_alignment_handler)
        })
+        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -6,7 +6,7 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-pub mod control_plane_client;
+pub mod controller_upcall_client;
 pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -8,6 +8,8 @@ use metrics::{
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
+use postgres_backend::{is_expected_io_error, QueryError};
+use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use tracing::warn;
@@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
 pub(crate) struct BasebackupQueryTime {
    ok: Histogram,
    error: Histogram,
+    client_error: Histogram,
 }

 pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
@@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
    BasebackupQueryTime {
        ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
        error: vec.get_metric_with_label_values(&["error"]).unwrap(),
+        client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
    }
 });

@@ -1557,7 +1561,7 @@ impl BasebackupQueryTime {
 }

 impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
-    pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
+    pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
        let elapsed = self.start.elapsed();
        let ex_throttled = self
            .ctx
@@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
                elapsed
            }
        };
-        let metric = if res.is_ok() {
-            &self.parent.ok
-        } else {
-            &self.parent.error
+        // If you want to change categorize of a specific error, also change it in `log_query_error`.
+        let metric = match res {
+            Ok(_) => &self.parent.ok,
+            Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
+                if is_expected_io_error(io_error) =>
+            {
+                &self.parent.client_error
+            }
+            Err(_) => &self.parent.error,
        };
        metric.observe(ex_throttled.as_secs_f64());
    }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -82,6 +82,7 @@ use once_cell::sync::OnceCell;
 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    virtual_file::{self, dio::IoBufferMut},
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -90,8 +91,8 @@ const TEST_PAGE_CACHE_SIZE: usize = 50;
 ///
 /// Initialize the page cache. This must be called once at page server startup.
 ///
-pub fn init(size: usize) {
-    if PAGE_CACHE.set(PageCache::new(size)).is_err() {
+pub fn init(size: usize, align: usize) {
+    if PAGE_CACHE.set(PageCache::new(size, align)).is_err() {
        panic!("page cache already initialized");
    }
 }
@@ -106,7 +107,12 @@ pub fn get() -> &'static PageCache {
    // page cache is usable in unit tests.
    //
    if cfg!(test) {
-        PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
+        PAGE_CACHE.get_or_init(|| {
+            PageCache::new(
+                TEST_PAGE_CACHE_SIZE,
+                virtual_file::get_io_buffer_alignment(),
+            )
+        })
    } else {
        PAGE_CACHE.get().expect("page cache not initialized")
    }
@@ -637,13 +643,11 @@ impl PageCache {
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
+    fn new(num_pages: usize, align: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
+        let page_buffer =
+            IoBufferMut::with_capacity_aligned_zeroed(num_pages * PAGE_SZ, align).leak();

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -273,10 +273,20 @@ async fn page_service_conn_main(
                info!("Postgres client disconnected ({io_error})");
                Ok(())
            } else {
-                Err(io_error).context("Postgres connection error")
+                let tenant_id = conn_handler.timeline_handles.tenant_id();
+                Err(io_error).context(format!(
+                    "Postgres connection error for tenant_id={:?} client at peer_addr={}",
+                    tenant_id, peer_addr
+                ))
            }
        }
-        other => other.context("Postgres query error"),
+        other => {
+            let tenant_id = conn_handler.timeline_handles.tenant_id();
+            other.context(format!(
+                "Postgres query error for tenant_id={:?} client peer_addr={}",
+                tenant_id, peer_addr
+            ))
+        }
    }
 }

@@ -340,6 +350,10 @@ impl TimelineHandles {
                }
            })
    }
+
+    fn tenant_id(&self) -> Option<TenantId> {
+        self.wrapper.tenant_id.get().copied()
+    }
 }

 pub(crate) struct TenantManagerWrapper {
@@ -819,7 +833,7 @@ impl PageServerHandler {
        set_tracing_field_shard_id(&timeline);

        let lease = timeline
-            .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
+            .renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
            .inspect_err(|e| {
                warn!("{e}");
            })
@@ -997,7 +1011,6 @@ impl PageServerHandler {
        )
        .await?;

-        tracing::info!("get_rel_page_at_lsn: {lsn}");
        let page = timeline
            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
            .await?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::TopTenantShardItem;
@@ -182,27 +183,54 @@ pub struct TenantSharedResources {
 pub(super) struct AttachedTenantConf {
    tenant_conf: TenantConfOpt,
    location: AttachedLocationConfig,
+    /// The deadline before which we are blocked from GC so that
+    /// leases have a chance to be renewed.
+    lsn_lease_deadline: Option<tokio::time::Instant>,
 }

 impl AttachedTenantConf {
    fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
+        // Sets a deadline before which we cannot proceed to GC due to lsn lease.
+        //
+        // We do this as the leases mapping are not persisted to disk. By delaying GC by lease
+        // length, we guarantee that all the leases we granted before will have a chance to renew
+        // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
+        let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
+            Some(
+                tokio::time::Instant::now()
+                    + tenant_conf
+                        .lsn_lease_length
+                        .unwrap_or(LsnLease::DEFAULT_LENGTH),
+            )
+        } else {
+            // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
+            // because we don't do GC in these modes.
+            None
+        };
+
        Self {
            tenant_conf,
            location,
+            lsn_lease_deadline,
        }
    }

    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
        match &location_conf.mode {
-            LocationMode::Attached(attach_conf) => Ok(Self {
-                tenant_conf: location_conf.tenant_conf,
-                location: *attach_conf,
-            }),
+            LocationMode::Attached(attach_conf) => {
+                Ok(Self::new(location_conf.tenant_conf, *attach_conf))
+            }
            LocationMode::Secondary(_) => {
                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
            }
        }
    }
+
+    fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
+        self.lsn_lease_deadline
+            .map(|d| tokio::time::Instant::now() < d)
+            .unwrap_or(false)
+    }
 }
 struct TimelinePreload {
    timeline_id: TimelineId,
@@ -1822,6 +1850,11 @@ impl Tenant {
                info!("Skipping GC in location state {:?}", conf.location);
                return Ok(GcResult::default());
            }
+
+            if conf.is_gc_blocked_by_lsn_lease_deadline() {
+                info!("Skipping GC because lsn lease deadline is not reached");
+                return Ok(GcResult::default());
+            }
        }

        let _guard = match self.gc_block.start().await {
@@ -2630,6 +2663,8 @@ impl Tenant {
            Arc::new(AttachedTenantConf {
                tenant_conf: new_tenant_conf.clone(),
                location: inner.location,
+                // Attached location is not changed, no need to update lsn lease deadline.
+                lsn_lease_deadline: inner.lsn_lease_deadline,
            })
        });

@@ -3887,9 +3922,9 @@ async fn run_initdb(
    let _permit = INIT_DB_SEMAPHORE.acquire().await;

    let initdb_command = tokio::process::Command::new(&initdb_bin_path)
-        .args(["-D", initdb_target_dir.as_ref()])
-        .args(["-U", &conf.superuser])
-        .args(["-E", "utf8"])
+        .args(["--pgdata", initdb_target_dir.as_ref()])
+        .args(["--username", &conf.superuser])
+        .args(["--encoding", "utf8"])
        .arg("--no-instructions")
        .arg("--no-sync")
        .env_clear()
@@ -4461,13 +4496,17 @@ mod tests {
        tline.freeze_and_flush().await.map_err(|e| e.into())
    }

-    #[tokio::test]
+    #[tokio::test(start_paused = true)]
    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
        let (tenant, ctx) =
            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
                .await?
                .load()
                .await;
+        // Advance to the lsn lease deadline so that GC is not blocked by
+        // initial transition into AttachedSingle.
+        tokio::time::advance(tenant.get_lsn_lease_length()).await;
+        tokio::time::resume();
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -7244,9 +7283,17 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
+    #[tokio::test(start_paused = true)]
    async fn test_lsn_lease() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
+            .await
+            .unwrap()
+            .load()
+            .await;
+        // Advance to the lsn lease deadline so that GC is not blocked by
+        // initial transition into AttachedSingle.
+        tokio::time::advance(tenant.get_lsn_lease_length()).await;
+        tokio::time::resume();
        let key = Key::from_hex("010000000033333333444444445500000000").unwrap();

        let end_lsn = Lsn(0x100);
@@ -7274,24 +7321,33 @@ mod tests {

        let leased_lsns = [0x30, 0x50, 0x70];
        let mut leases = Vec::new();
-        let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
-            leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
-            Ok(())
+        leased_lsns.iter().for_each(|n| {
+            leases.push(
+                timeline
+                    .init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
+                    .expect("lease request should succeed"),
+            );
        });

-        // Renewing with shorter lease should not change the lease.
-        let updated_lease_0 =
-            timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
-        assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
+        let updated_lease_0 = timeline
+            .renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
+            .expect("lease renewal should succeed");
+        assert_eq!(
+            updated_lease_0.valid_until, leases[0].valid_until,
+            " Renewing with shorter lease should not change the lease."
+        );

-        // Renewing with a long lease should renew lease with later expiration time.
-        let updated_lease_1 = timeline.make_lsn_lease(
-            Lsn(leased_lsns[1]),
-            timeline.get_lsn_lease_length() * 2,
-            &ctx,
-        )?;
-
-        assert!(updated_lease_1.valid_until > leases[1].valid_until);
+        let updated_lease_1 = timeline
+            .renew_lsn_lease(
+                Lsn(leased_lsns[1]),
+                timeline.get_lsn_lease_length() * 2,
+                &ctx,
+            )
+            .expect("lease renewal should succeed");
+        assert!(
+            updated_lease_1.valid_until > leases[1].valid_until,
+            "Renewing with a long lease should renew lease with later expiration time."
+        );

        // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
        info!(
@@ -7308,7 +7364,8 @@ mod tests {
                &CancellationToken::new(),
                &ctx,
            )
-            .await?;
+            .await
+            .unwrap();

        // Keeping everything <= Lsn(0x80) b/c leases:
        // 0/10: initdb layer
@@ -7322,13 +7379,16 @@ mod tests {
        // Make lease on a already GC-ed LSN.
        // 0/80 does not have a valid lease + is below latest_gc_cutoff
        assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
-        let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
-        assert!(res.is_err());
+        timeline
+            .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
+            .expect_err("lease request on GC-ed LSN should fail");

        // Should still be able to renew a currently valid lease
        // Assumption: original lease to is still valid for 0/50.
-        let _ =
-            timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
+        // (use `Timeline::init_lsn_lease` for testing so it always does validation)
+        timeline
+            .init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
+            .expect("lease renewal with validation should succeed");

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -84,7 +84,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = &self.buffered_writer.as_inner().as_inner().path;
+        let path = self.buffered_writer.as_inner().as_inner().path();
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
@@ -356,7 +356,7 @@ mod tests {
        }

        let file_contents =
-            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
+            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
        assert_eq!(file_contents, &content[0..cap]);

        let buffer_contents = file.buffered_writer.inspect_buffer();
@@ -392,7 +392,7 @@ mod tests {
            .buffered_writer
            .as_inner()
            .as_inner()
-            .path
+            .path()
            .metadata()
            .unwrap();
        assert_eq!(
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,29 +1,12 @@
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;

-use super::remote_timeline_client::index::GcBlockingReason;
-use tokio::time::Instant;
 use utils::id::TimelineId;

-type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+use super::remote_timeline_client::index::GcBlockingReason;

-#[derive(Default)]
-struct Storage {
-    timelines_blocked: TimelinesBlocked,
-    /// The deadline before which we are blocked from GC so that
-    /// leases have a chance to be renewed.
-    lsn_lease_deadline: Option<Instant>,
-}
+type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;

-impl Storage {
-    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
-        self.lsn_lease_deadline
-            .map(|d| Instant::now() < d)
-            .unwrap_or(false)
-    }
-}
-
-/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
-/// blocking.
+/// GcBlock provides persistent (per-timeline) gc blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
    /// The timelines which have current reasons to block gc.
@@ -66,17 +49,6 @@ impl GcBlock {
        }
    }

-    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
-    ///
-    /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
-    /// length, we guarantee that all the leases we granted before will have a chance to renew
-    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
-    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
-        let deadline = Instant::now() + lsn_lease_length;
-        let mut g = self.reasons.lock().unwrap();
-        g.lsn_lease_deadline = Some(deadline);
-    }
-
    /// Describe the current gc blocking reasons.
    ///
    /// TODO: make this json serializable.
@@ -102,7 +74,7 @@ impl GcBlock {
    ) -> anyhow::Result<bool> {
        let (added, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
+            let set = g.entry(timeline.timeline_id).or_default();
            let added = set.insert(reason);

            // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +105,7 @@ impl GcBlock {

        let (remaining_blocks, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            match g.timelines_blocked.entry(timeline.timeline_id) {
+            match g.entry(timeline.timeline_id) {
                Entry::Occupied(mut oe) => {
                    let set = oe.get_mut();
                    set.remove(reason);
@@ -147,7 +119,7 @@ impl GcBlock {
                }
            }

-            let remaining_blocks = g.timelines_blocked.len();
+            let remaining_blocks = g.len();

            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
            let uploaded = timeline
@@ -172,11 +144,11 @@ impl GcBlock {
    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
        let unblocked = {
            let mut g = self.reasons.lock().unwrap();
-            if g.timelines_blocked.is_empty() {
+            if g.is_empty() {
                return;
            }

-            g.timelines_blocked.remove(&timeline.timeline_id);
+            g.remove(&timeline.timeline_id);

            BlockingReasons::clean_and_summarize(g).is_none()
        };
@@ -187,11 +159,10 @@ impl GcBlock {
    }

    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
+    pub(crate) fn set_scanned(&self, scanned: Storage) {
        let mut g = self.reasons.lock().unwrap();
-        assert!(g.timelines_blocked.is_empty());
-        g.timelines_blocked
-            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.is_empty());
+        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));

        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
            tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +176,6 @@ pub(super) struct Guard<'a> {

 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
-    tenant_blocked_by_lsn_lease_deadline: bool,
    timelines: usize,
    reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
-            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
+            "{} timelines block for {:?}",
+            self.timelines, self.reasons
        )
    }
 }
@@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
        let mut reasons = enumset::EnumSet::empty();
-        g.timelines_blocked.retain(|_key, value| {
+        g.retain(|_key, value| {
            reasons = reasons.union(*value);
            !value.is_empty()
        });
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
+        if !g.is_empty() {
            Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                reasons,
            })
        } else {
@@ -240,17 +208,14 @@ impl BlockingReasons {
    }

    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
+        if g.is_empty() {
            None
        } else {
            let reasons = g
-                .timelines_blocked
                .values()
                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
            Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                reasons,
            })
        }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -1470,52 +1470,4 @@ mod tests {
            LayerVisibilityHint::Visible
        ));
    }
-
-    /// Exercise edge case of querying at exactly the LSN of an image layer
-    #[test]
-    fn layer_search_at_image_lsn() {
-        let tenant_id = TenantId::generate();
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let timeline_id = TimelineId::generate();
-
-        let last_record_lsn = Lsn::from_hex("00000000DEADBEEF").unwrap();
-
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-
-        let image_layer = PersistentLayerDesc {
-            key_range: Key::from_i128(0)..Key::from_i128(i128::MAX),
-            lsn_range: PersistentLayerDesc::image_layer_lsn_range(last_record_lsn),
-            tenant_shard_id,
-            timeline_id,
-            is_delta: false,
-            file_size: 123,
-        };
-
-        let delta_layer = PersistentLayerDesc {
-            key_range: Key::from_i128(0)..Key::from_i128(i128::MAX),
-            lsn_range: Lsn(0)..Lsn(0xdead0000),
-            tenant_shard_id,
-            timeline_id,
-            is_delta: true,
-            file_size: 123,
-        };
-
-        updates.insert_historic(image_layer.clone());
-        updates.insert_historic(delta_layer);
-
-        updates.flush();
-
-        // FIXME: according to the search() docstring, it searches for layers with start LSNs _less then_
-        // `end_lsn` -- i.e. it's correct that if you ask for exactly the LSN of an image layer, it shouldn't hit
-        // it.  However, the way that page_service calls it is to take the last_record_lsn of a Timeline
-        // and pass that directly into LayerMap::search().
-
-        let searched = layer_map
-            .search(Key::from_i128(12345), last_record_lsn)
-            .unwrap();
-
-        // We searched at the LSN of the image layer: we should hit it
-        assert_eq!(searched.layer.as_ref(), &image_layer);
-    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::{
-    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+use crate::controller_upcall_client::{
+    ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
@@ -122,7 +122,7 @@ pub(crate) enum ShardSelector {
    Known(ShardIndex),
 }

-/// A convenience for use with the re_attach ControlPlaneClient function: rather
+/// A convenience for use with the re_attach ControllerUpcallClient function: rather
 /// than the serializable struct, we build this enum that encapsulates
 /// the invariant that attached tenants always have generations.
 ///
@@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
        + TEMP_FILE_SUFFIX;
    let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
    fs::rename(path.as_ref(), &tmp_path).await?;
-    fs::File::open(parent).await?.sync_all().await?;
+    fs::File::open(parent)
+        .await?
+        .sync_all()
+        .await
+        .maybe_fatal_err("safe_rename_tenant_dir")?;
    Ok(tmp_path)
 }

@@ -341,7 +345,7 @@ async fn init_load_generations(
            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
        );
        emergency_generations(tenant_confs)
-    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
+    } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
        info!("Calling control plane API to re-attach tenants");
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
        match client.re_attach(conf).await {
@@ -949,12 +953,6 @@ impl TenantManager {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                    match attach_conf.generation.cmp(&tenant.generation) {
                        Ordering::Equal => {
-                            if attach_conf.attach_mode == AttachmentMode::Single {
-                                tenant
-                                    .gc_block
-                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
-                            }
-
                            // A transition from Attached to Attached in the same generation, we may
                            // take our fast path and just provide the updated configuration
                            // to the tenant.
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -178,6 +178,7 @@ async fn download_object<'a>(
                destination_file
                    .flush()
                    .await
+                    .maybe_fatal_err("download_object sync_all")
                    .with_context(|| format!("flush source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;

@@ -185,6 +186,7 @@ async fn download_object<'a>(
                destination_file
                    .sync_all()
                    .await
+                    .maybe_fatal_err("download_object sync_all")
                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;

@@ -232,6 +234,7 @@ async fn download_object<'a>(
                destination_file
                    .sync_all()
                    .await
+                    .maybe_fatal_err("download_object sync_all")
                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -433,7 +433,6 @@ impl ReadableLayer {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        tracing::info!("get_values_reconstruct_data: {:?}", self.id());
        match self {
            ReadableLayer::PersistentLayer(layer) => {
                layer
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -40,15 +40,15 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadCoalesceMode, VectoredReadPlanner,
+    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -572,7 +572,7 @@ impl DeltaLayerWriterInner {
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path,
+            file.path(),
            metadata.len()
        );

@@ -589,7 +589,9 @@ impl DeltaLayerWriterInner {
        );

        // fsync the file
-        file.sync_all().await?;
+        file.sync_all()
+            .await
+            .maybe_fatal_err("delta_layer sync_all")?;

        trace!("created delta layer {}", self.path);

@@ -788,7 +790,7 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;

@@ -989,7 +991,8 @@ impl DeltaLayerInner {
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(BytesMut::with_capacity(buf_size));
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1008,7 +1011,7 @@ impl DeltaLayerInner {
                            blob_meta.key,
                            PageReconstructError::Other(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
@@ -1016,7 +1019,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(buf_size));
+                    buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

                    continue;
                }
@@ -1034,7 +1037,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to decompress blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1052,7 +1055,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1133,7 +1136,7 @@ impl DeltaLayerInner {
        ctx: &RequestContext,
    ) -> anyhow::Result<usize> {
        use crate::tenant::vectored_blob_io::{
-            BlobMeta, VectoredReadBuilder, VectoredReadExtended,
+            BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended,
        };
        use futures::stream::TryStreamExt;

@@ -1183,15 +1186,15 @@ impl DeltaLayerInner {

        let mut prev: Option<(Key, Lsn, BlobRef)> = None;

-        let mut read_builder: Option<VectoredReadBuilder> = None;
-        let read_mode = VectoredReadCoalesceMode::get();
+        let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;

        let max_read_size = self
            .max_vectored_read_bytes
            .map(|x| x.0.get())
            .unwrap_or(8192);

-        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buffer = Some(IoBufferMut::with_capacity_aligned(max_read_size, align));

        // FIXME: buffering of DeltaLayerWriter
        let mut per_blob_copy = Vec::new();
@@ -1228,12 +1231,12 @@ impl DeltaLayerInner {
                {
                    None
                } else {
-                    read_builder.replace(VectoredReadBuilder::new(
+                    read_builder.replace(ChunkedVectoredReadBuilder::new(
                        offsets.start.pos(),
                        offsets.end.pos(),
                        meta,
                        max_read_size,
-                        read_mode,
+                        align,
                    ))
                }
            } else {
@@ -1550,12 +1553,12 @@ impl<'a> DeltaLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let align = virtual_file::get_io_buffer_alignment();
+        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let blob_read = meta.read(&view).await?;
            let value = Value::des(&blob_read)?;
@@ -1930,7 +1933,9 @@ pub(crate) mod test {
                &vectored_reads,
                constants::MAX_VECTORED_READ_BYTES,
            );
-            let mut buf = Some(BytesMut::with_capacity(buf_size));
+
+            let align = virtual_file::get_io_buffer_alignment();
+            let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

            for read in vectored_reads {
                let blobs_buf = vectored_blob_reader
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -40,11 +40,12 @@ use crate::tenant::vectored_blob_io::{
    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
@@ -388,7 +389,7 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
@@ -542,14 +543,15 @@ impl ImageLayerInner {
            .await?;

        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let align = virtual_file::get_io_buffer_alignment();
        let mut key_count = 0;
        for read in plan.into_iter() {
            let buf_size = read.size();

-            let buf = BytesMut::with_capacity(buf_size);
+            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-            let frozen_buf = blobs_buf.buf.freeze();
-            let view = BufView::new_bytes(frozen_buf);
+
+            let view = BufView::new_slice(&blobs_buf.buf);

            for meta in blobs_buf.blobs.iter() {
                let img_buf = meta.read(&view).await?;
@@ -597,13 +599,13 @@ impl ImageLayerInner {
                );
            }

-            let buf = BytesMut::with_capacity(buf_size);
+            let align = virtual_file::get_io_buffer_alignment();
+            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

            match res {
                Ok(blobs_buf) => {
-                    let frozen_buf = blobs_buf.buf.freeze();
-                    let view = BufView::new_bytes(frozen_buf);
+                    let view = BufView::new_slice(&blobs_buf.buf);
                    for meta in blobs_buf.blobs.iter() {
                        let img_buf = meta.read(&view).await;

@@ -614,7 +616,7 @@ impl ImageLayerInner {
                                    meta.meta.key,
                                    PageReconstructError::Other(anyhow!(e).context(format!(
                                        "Failed to decompress blob from virtual file {}",
-                                        self.file.path,
+                                        self.file.path(),
                                    ))),
                                );

@@ -635,7 +637,7 @@ impl ImageLayerInner {
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
@@ -889,7 +891,9 @@ impl ImageLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.

        // fsync the file
-        file.sync_all().await?;
+        file.sync_all()
+            .await
+            .maybe_fatal_err("image_layer sync_all")?;

        trace!("created image layer {}", self.path);

@@ -1037,12 +1041,12 @@ impl<'a> ImageLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let align = virtual_file::get_io_buffer_alignment();
+        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            next_batch.push_back((
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);

        let mut first = true;
-        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,6 +66,7 @@ use std::{
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
+        config::AttachmentMode,
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
@@ -1324,16 +1325,38 @@ impl Timeline {
        Ok(())
    }

-    /// Obtains a temporary lease blocking garbage collection for the given LSN.
-    ///
-    /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also
-    /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
-    /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
-    /// the requesting lease.
-    pub(crate) fn make_lsn_lease(
+    /// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`.
+    pub(crate) fn init_lsn_lease(
        &self,
        lsn: Lsn,
        length: Duration,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        self.make_lsn_lease(lsn, length, true, ctx)
+    }
+
+    /// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period.
+    pub(crate) fn renew_lsn_lease(
+        &self,
+        lsn: Lsn,
+        length: Duration,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        self.make_lsn_lease(lsn, length, false, ctx)
+    }
+
+    /// Obtains a temporary lease blocking garbage collection for the given LSN.
+    ///
+    /// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error
+    /// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present.
+    ///
+    /// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease.
+    /// The returned lease is therefore the maximum between the existing lease and the requesting lease.
+    fn make_lsn_lease(
+        &self,
+        lsn: Lsn,
+        length: Duration,
+        init: bool,
        _ctx: &RequestContext,
    ) -> anyhow::Result<LsnLease> {
        let lease = {
@@ -1347,8 +1370,8 @@ impl Timeline {

            let entry = gc_info.leases.entry(lsn);

-            let lease = {
-                if let Entry::Occupied(mut occupied) = entry {
+            match entry {
+                Entry::Occupied(mut occupied) => {
                    let existing_lease = occupied.get_mut();
                    if valid_until > existing_lease.valid_until {
                        existing_lease.valid_until = valid_until;
@@ -1360,20 +1383,28 @@ impl Timeline {
                    }

                    existing_lease.clone()
-                } else {
-                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff)
-                    let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
-                    if lsn < *latest_gc_cutoff_lsn {
-                        bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                }
+                Entry::Vacant(vacant) => {
+                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
+                    // not blocked by the lsn lease deadline.
+                    let validate = {
+                        let conf = self.tenant_conf.load();
+                        conf.location.attach_mode == AttachmentMode::Single
+                            && !conf.is_gc_blocked_by_lsn_lease_deadline()
+                    };
+
+                    if init || validate {
+                        let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
+                        if lsn < *latest_gc_cutoff_lsn {
+                            bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                        }
                    }

                    let dt: DateTime<Utc> = valid_until.into();
                    info!("lease created, valid until {}", dt);
-                    entry.or_insert(LsnLease { valid_until }).clone()
+                    vacant.insert(LsnLease { valid_until }).clone()
                }
-            };
-
-            lease
+            }
        };

        Ok(lease)
@@ -1950,8 +1981,6 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
    }

-    // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
-    #[allow(unused)]
    pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -3856,21 +3885,21 @@ impl Timeline {
            )));
        }

-        // let distance = lsn.0 - partition_lsn.0;
-        // if *partition_lsn != Lsn(0)
-        //     && distance <= self.repartition_threshold
-        //     && !flags.contains(CompactFlags::ForceRepartition)
-        // {
-        //     debug!(
-        //         distance,
-        //         threshold = self.repartition_threshold,
-        //         "no repartitioning needed"
-        //     );
-        //     return Ok((
-        //         (dense_partition.clone(), sparse_partition.clone()),
-        //         *partition_lsn,
-        //     ));
-        // }
+        let distance = lsn.0 - partition_lsn.0;
+        if *partition_lsn != Lsn(0)
+            && distance <= self.repartition_threshold
+            && !flags.contains(CompactFlags::ForceRepartition)
+        {
+            debug!(
+                distance,
+                threshold = self.repartition_threshold,
+                "no repartitioning needed"
+            );
+            return Ok((
+                (dense_partition.clone(), sparse_partition.clone()),
+                *partition_lsn,
+            ));
+        }

        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
@@ -5779,7 +5808,6 @@ impl<'a> TimelineWriter<'a> {
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
-        tracing::info!("finish_write @ {new_lsn}");
        self.tl.finish_write(new_lsn);
    }

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -364,10 +364,6 @@ impl Timeline {
                // 3. Create new image layers for partitions that have been modified
                // "enough". Skip image layer creation if L0 compaction cannot keep up.
                if fully_compacted {
-                    tracing::info!(
-                        "create_image_layers @ {lsn} (latest {})",
-                        self.get_last_record_lsn()
-                    );
                    let image_layers = self
                        .create_image_layers(
                            &partitioning,
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -18,7 +18,7 @@
 use std::collections::BTreeMap;
 use std::ops::Deref;

-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -27,6 +27,7 @@ use utils::vec_map::VecMap;

 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::{self, VirtualFile};

 /// Metadata bundled with the start and end offset of a blob.
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob {
 /// Return type of [`VectoredBlobReader::read_blobs`]
 pub struct VectoredBlobsBuf {
    /// Buffer for all blobs in this read
-    pub buf: BytesMut,
+    pub buf: IoBufferMut,
    /// Offsets into the buffer and metadata for all blobs in this read
    pub blobs: Vec<VectoredBlob>,
 }
@@ -185,171 +186,7 @@ pub(crate) enum VectoredReadExtended {
    No,
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum VectoredReadCoalesceMode {
-    /// Only coalesce exactly adjacent reads.
-    AdjacentOnly,
-    /// In addition to adjacent reads, also consider reads whose corresponding
-    /// `end` and `start` offsets reside at the same chunk.
-    Chunked(usize),
-}
-
-impl VectoredReadCoalesceMode {
-    /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
-    /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
-    pub(crate) fn get() -> Self {
-        let align = virtual_file::get_io_buffer_alignment_raw();
-        if align == 0 {
-            VectoredReadCoalesceMode::AdjacentOnly
-        } else {
-            VectoredReadCoalesceMode::Chunked(align)
-        }
-    }
-}
-
-pub(crate) enum VectoredReadBuilder {
-    Adjacent(AdjacentVectoredReadBuilder),
-    Chunked(ChunkedVectoredReadBuilder),
-}
-
-impl VectoredReadBuilder {
-    fn new_impl(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        match mode {
-            VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
-                AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
-            ),
-            VectoredReadCoalesceMode::Chunked(chunk_size) => {
-                Self::Chunked(ChunkedVectoredReadBuilder::new(
-                    start_offset,
-                    end_offset,
-                    meta,
-                    max_read_size,
-                    chunk_size,
-                ))
-            }
-        }
-    }
-
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: usize,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
-    }
-
-    pub(crate) fn new_streaming(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        mode: VectoredReadCoalesceMode,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, None, mode)
-    }
-
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
-            VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
-        }
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.build(),
-            VectoredReadBuilder::Chunked(builder) => builder.build(),
-        }
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        match self {
-            VectoredReadBuilder::Adjacent(builder) => builder.size(),
-            VectoredReadBuilder::Chunked(builder) => builder.size(),
-        }
-    }
-}
-
-pub(crate) struct AdjacentVectoredReadBuilder {
-    /// Start offset of the read.
-    start: u64,
-    // End offset of the read.
-    end: u64,
-    /// Start offset and metadata for each blob in this read
-    blobs_at: VecMap<u64, BlobMeta>,
-    max_read_size: Option<usize>,
-}
-
-impl AdjacentVectoredReadBuilder {
-    /// Start building a new vectored read.
-    ///
-    /// Note that by design, this does not check against reading more than `max_read_size` to
-    /// support reading larger blobs than the configuration value. The builder will be single use
-    /// however after that.
-    pub(crate) fn new(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        max_read_size: Option<usize>,
-    ) -> Self {
-        let mut blobs_at = VecMap::default();
-        blobs_at
-            .append(start_offset, meta)
-            .expect("First insertion always succeeds");
-
-        Self {
-            start: start_offset,
-            end: end_offset,
-            blobs_at,
-            max_read_size,
-        }
-    }
-    /// Attempt to extend the current read with a new blob if the start
-    /// offset matches with the current end of the vectored read
-    /// and the resuting size is below the max read size
-    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
-        tracing::trace!(start, end, "trying to extend");
-        let size = (end - start) as usize;
-        let not_limited_by_max_read_size = {
-            if let Some(max_read_size) = self.max_read_size {
-                self.size() + size <= max_read_size
-            } else {
-                true
-            }
-        };
-
-        if self.end == start && not_limited_by_max_read_size {
-            self.end = end;
-            self.blobs_at
-                .append(start, meta)
-                .expect("LSNs are ordered within vectored reads");
-
-            return VectoredReadExtended::Yes;
-        }
-
-        VectoredReadExtended::No
-    }
-
-    pub(crate) fn size(&self) -> usize {
-        (self.end - self.start) as usize
-    }
-
-    pub(crate) fn build(self) -> VectoredRead {
-        VectoredRead {
-            start: self.start,
-            end: self.end,
-            blobs_at: self.blobs_at,
-        }
-    }
-}
-
+/// A vectored read builder that tries to coalesce all reads that fits in a chunk.
 pub(crate) struct ChunkedVectoredReadBuilder {
    /// Start block number
    start_blk_no: usize,
@@ -373,7 +210,7 @@ impl ChunkedVectoredReadBuilder {
    /// Note that by design, this does not check against reading more than `max_read_size` to
    /// support reading larger blobs than the configuration value. The builder will be single use
    /// however after that.
-    pub(crate) fn new(
+    fn new_impl(
        start_offset: u64,
        end_offset: u64,
        meta: BlobMeta,
@@ -396,6 +233,25 @@ impl ChunkedVectoredReadBuilder {
        }
    }

+    pub(crate) fn new(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: usize,
+        align: usize,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
+    }
+
+    pub(crate) fn new_streaming(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        align: usize,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, None, align)
+    }
+
    /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
    ///
    /// The resulting size also must be below the max read size.
@@ -474,17 +330,17 @@ pub struct VectoredReadPlanner {

    max_read_size: usize,

-    mode: VectoredReadCoalesceMode,
+    align: usize,
 }

 impl VectoredReadPlanner {
    pub fn new(max_read_size: usize) -> Self {
-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
        Self {
            blobs: BTreeMap::new(),
            prev: None,
            max_read_size,
-            mode,
+            align,
        }
    }

@@ -545,7 +401,7 @@ impl VectoredReadPlanner {
    }

    pub fn finish(self) -> Vec<VectoredRead> {
-        let mut current_read_builder: Option<VectoredReadBuilder> = None;
+        let mut current_read_builder: Option<ChunkedVectoredReadBuilder> = None;
        let mut reads = Vec::new();

        for (key, blobs_for_key) in self.blobs {
@@ -558,12 +414,12 @@ impl VectoredReadPlanner {
                };

                if extended == VectoredReadExtended::No {
-                    let next_read_builder = VectoredReadBuilder::new(
+                    let next_read_builder = ChunkedVectoredReadBuilder::new(
                        start_offset,
                        end_offset,
                        BlobMeta { key, lsn },
                        self.max_read_size,
-                        self.mode,
+                        self.align,
                    );

                    let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -605,7 +461,7 @@ impl<'a> VectoredBlobReader<'a> {
    pub async fn read_blobs(
        &self,
        read: &VectoredRead,
-        buf: BytesMut,
+        buf: IoBufferMut,
        ctx: &RequestContext,
    ) -> Result<VectoredBlobsBuf, std::io::Error> {
        assert!(read.size() > 0);
@@ -688,7 +544,7 @@ impl<'a> VectoredBlobReader<'a> {
 /// `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
-    read_builder: Option<VectoredReadBuilder>,
+    read_builder: Option<ChunkedVectoredReadBuilder>,
    // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
    prev: Option<(Key, Lsn, u64)>,
    /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
@@ -699,21 +555,21 @@ pub struct StreamingVectoredReadPlanner {
    /// Size of the current batch
    cnt: usize,

-    mode: VectoredReadCoalesceMode,
+    align: usize,
 }

 impl StreamingVectoredReadPlanner {
    pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
        assert!(max_cnt > 0);
        assert!(max_read_size > 0);
-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
        Self {
            read_builder: None,
            prev: None,
            max_cnt,
            max_read_size,
            cnt: 0,
-            mode,
+            align,
        }
    }

@@ -762,11 +618,11 @@ impl StreamingVectoredReadPlanner {
            }
            None => {
                self.read_builder = {
-                    Some(VectoredReadBuilder::new_streaming(
+                    Some(ChunkedVectoredReadBuilder::new_streaming(
                        start_offset,
                        end_offset,
                        BlobMeta { key, lsn },
-                        self.mode,
+                        self.align,
                    ))
                };
            }
@@ -1090,9 +946,10 @@ mod tests {

        // Multiply by two (compressed data might need more space), and add a few bytes for the header
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
-        let mut buf = BytesMut::with_capacity(reserved_bytes);
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buf = IoBufferMut::with_capacity_aligned(reserved_bytes, align);

-        let mode = VectoredReadCoalesceMode::get();
+        let align = virtual_file::get_io_buffer_alignment();
        let vectored_blob_reader = VectoredBlobReader::new(&file);
        let meta = BlobMeta {
            key: Key::MIN,
@@ -1104,7 +961,8 @@ mod tests {
            if idx + 1 == offsets.len() {
                continue;
            }
-            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
+            let read_builder =
+                ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
            let read = read_builder.build();
            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
            assert_eq!(result.blobs.len(), 1);
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+#[cfg(target_os = "linux")]
+use std::os::unix::fs::OpenOptionsExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;

@@ -38,10 +40,11 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::DirectIoMode;
+pub(crate) use api::IoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
+pub(crate) mod dio;

 pub(crate) mod owned_buffers_io {
    //! Abstractions for IO with owned buffers.
@@ -53,6 +56,7 @@ pub(crate) mod owned_buffers_io {
    //! but for the time being we're proving out the primitives in the neon.git repo
    //! for faster iteration.

+    pub(crate) mod io_buf_aligned;
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
@@ -61,6 +65,176 @@ pub(crate) mod owned_buffers_io {
    }
 }

+#[derive(Debug)]
+pub enum VirtualFile {
+    Buffered(VirtualFileInner),
+    Direct(VirtualFileInner),
+}
+
+impl VirtualFile {
+    fn inner(&self) -> &VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+
+    fn inner_mut(&mut self) -> &mut VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+
+    fn into_inner(self) -> VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+    /// Open a file in read-only mode. Like File::open.
+    pub async fn open<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::open(path, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    /// Open a file in read-only mode. Like File::open.
+    ///
+    /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`.
+    pub async fn open_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
+    }
+
+    pub async fn create<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::create(path, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    pub async fn create_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        VirtualFile::open_with_options_v2(
+            path.as_ref(),
+            OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
+        )
+        .await
+    }
+
+    pub async fn open_with_options<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &OpenOptions,
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`.
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let file = match get_io_mode() {
+            IoMode::Buffered => {
+                let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+                Self::Buffered(file)
+            }
+            #[cfg(target_os = "linux")]
+            IoMode::Direct => {
+                let file = VirtualFileInner::open_with_options(
+                    path,
+                    open_options.custom_flags(nix::libc::O_DIRECT),
+                    ctx,
+                )
+                .await?;
+                Self::Direct(file)
+            }
+        };
+        Ok(file)
+    }
+
+    pub fn path(&self) -> &Utf8Path {
+        self.inner().path.as_path()
+    }
+
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
+        content: B,
+    ) -> std::io::Result<()> {
+        VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
+    }
+
+    pub async fn sync_all(&self) -> Result<(), Error> {
+        self.inner().sync_all().await
+    }
+
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        self.inner().sync_data().await
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        self.inner().metadata().await
+    }
+
+    pub fn remove(self) {
+        self.into_inner().remove();
+    }
+
+    pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+        self.inner_mut().seek(pos).await
+    }
+
+    pub async fn read_exact_at<Buf>(
+        &self,
+        slice: Slice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<Slice<Buf>, Error>
+    where
+        Buf: IoBufMut + Send,
+    {
+        self.inner().read_exact_at(slice, offset, ctx).await
+    }
+
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        self.inner().read_exact_at_page(page, offset, ctx).await
+    }
+
+    pub async fn write_all_at<Buf: IoBuf + Send>(
+        &self,
+        buf: FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
+        self.inner().write_all_at(buf, offset, ctx).await
+    }
+
+    pub async fn write_all<Buf: IoBuf + Send>(
+        &mut self,
+        buf: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<usize, Error>) {
+        self.inner_mut().write_all(buf, ctx).await
+    }
+}
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -77,7 +251,7 @@ pub(crate) mod owned_buffers_io {
 /// 'tag' field is used to detect whether the handle still is valid or not.
 ///
 #[derive(Debug)]
-pub struct VirtualFile {
+pub struct VirtualFileInner {
    /// Lazy handle to the global file descriptor cache. The slot that this points to
    /// might contain our File, or it may be empty, or it may contain a File that
    /// belongs to a different VirtualFile.
@@ -350,12 +524,12 @@ macro_rules! with_file {
    }};
 }

-impl VirtualFile {
+impl VirtualFileInner {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

@@ -364,7 +538,7 @@ impl VirtualFile {
    pub async fn create<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(
            path.as_ref(),
            OpenOptions::new().write(true).create(true).truncate(true),
@@ -382,7 +556,7 @@ impl VirtualFile {
        path: P,
        open_options: &OpenOptions,
        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        let path_ref = path.as_ref();
        let path_str = path_ref.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -413,7 +587,7 @@ impl VirtualFile {
            open_options.open(path_ref.as_std_path()).await?
        });

-        // Strip all options other than read and write.
+        // Strip all options other than read and write (O_DIRECT).
        //
        // It would perhaps be nicer to check just for the read and write flags
        // explicitly, but OpenOptions doesn't contain any functions to read flags,
@@ -423,7 +597,7 @@ impl VirtualFile {
        reopen_options.create_new(false);
        reopen_options.truncate(false);

-        let vfile = VirtualFile {
+        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            pos: 0,
            path: path_ref.to_path_buf(),
@@ -466,6 +640,7 @@ impl VirtualFile {
                &[]
            };
            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+                .maybe_fatal_err("crashsafe_overwrite")
        })
        .await
        .expect("blocking task is never aborted")
@@ -475,7 +650,7 @@ impl VirtualFile {
    pub async fn sync_all(&self) -> Result<(), Error> {
        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
            let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
-            res
+            res.maybe_fatal_err("sync_all")
        })
    }

@@ -483,7 +658,7 @@ impl VirtualFile {
    pub async fn sync_data(&self) -> Result<(), Error> {
        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
            let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
-            res
+            res.maybe_fatal_err("sync_data")
        })
    }

@@ -1033,6 +1208,21 @@ impl tokio_epoll_uring::IoFd for FileGuard {

 #[cfg(test)]
 impl VirtualFile {
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
+        self.inner().read_blk(blknum, ctx).await
+    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner_mut().read_to_end(buf, ctx).await
+    }
+}
+
+#[cfg(test)]
+impl VirtualFileInner {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
@@ -1066,7 +1256,7 @@ impl VirtualFile {
    }
 }

-impl Drop for VirtualFile {
+impl Drop for VirtualFileInner {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
        let handle = self.handle.get_mut();
@@ -1147,7 +1337,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize)
        panic!("virtual_file::init called twice");
    }
    if set_io_buffer_alignment(io_buffer_alignment).is_err() {
-        panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
+        panic!(
+            "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
+        );
    }
    io_engine::init(engine);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
@@ -1174,14 +1366,16 @@ fn get_open_files() -> &'static OpenFiles {

 static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);

-/// Returns true if `x` is zero or a power of two.
-fn is_zero_or_power_of_two(x: usize) -> bool {
-    (x == 0) || ((x & (x - 1)) == 0)
+/// Returns true if the alignment is a power of two and is greater or equal to 512.
+fn is_valid_io_buffer_alignment(align: usize) -> bool {
+    align.is_power_of_two() && align >= 512
 }

+/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
+/// not a power of two or less than 512 bytes.
 #[allow(unused)]
 pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
-    if is_zero_or_power_of_two(align) {
+    if is_valid_io_buffer_alignment(align) {
        IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
        Ok(())
    } else {
@@ -1189,19 +1383,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
    }
 }

-/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
+/// Gets the io buffer alignment.
 ///
-/// This function should be used to check the raw config value.
-pub(crate) fn get_io_buffer_alignment_raw() -> usize {
+/// This function should be used for getting the actual alignment value to use.
+pub(crate) fn get_io_buffer_alignment() -> usize {
    let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);

    if cfg!(test) {
        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
        if let Some(test_align) = utils::env::var(env_var_name) {
-            if is_zero_or_power_of_two(test_align) {
+            if is_valid_io_buffer_alignment(test_align) {
                test_align
            } else {
-                panic!("IO buffer alignment ({test_align}) is not a power of two");
+                panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
            }
        } else {
            align
@@ -1211,14 +1405,15 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
    }
 }

-/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
-///
-/// This function should be used for getting the actual alignment value to use.
-pub(crate) fn get_io_buffer_alignment() -> usize {
-    let align = get_io_buffer_alignment_raw();
-    align.max(1)
+static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
+
+pub(crate) fn set_io_mode(mode: IoMode) {
+    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
 }

+pub(crate) fn get_io_mode() -> IoMode {
+    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
+}
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
@@ -1527,7 +1722,7 @@ mod tests {
        // Open the file many times.
        let mut files = Vec::new();
        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(
+            let f = VirtualFileInner::open_with_options(
                &test_file_path,
                OpenOptions::new().read(true),
                &ctx,
@@ -1579,7 +1774,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1588,7 +1783,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1611,7 +1806,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();

--- a/pageserver/src/virtual_file/dio.rs
+++ b/pageserver/src/virtual_file/dio.rs
@@ -0,0 +1,410 @@
+#![allow(unused)]
+
+use core::slice;
+use std::{
+    alloc::{self, Layout},
+    cmp,
+    mem::{ManuallyDrop, MaybeUninit},
+    ops::{Deref, DerefMut},
+    ptr::{addr_of_mut, NonNull},
+};
+
+use bytes::buf::UninitSlice;
+
+struct IoBufferPtr(*mut u8);
+
+// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer.
+unsafe impl Send for IoBufferPtr {}
+
+/// An aligned buffer type used for I/O.
+pub struct IoBufferMut {
+    ptr: IoBufferPtr,
+    capacity: usize,
+    len: usize,
+    align: usize,
+}
+
+impl IoBufferMut {
+    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
+    ///
+    /// The buffer will be able to hold at most `capacity` elements and will never resize.
+    ///
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
+    /// * `align` must not be zero,
+    ///
+    /// * `align` must be a power of two,
+    ///
+    /// * `capacity`, when rounded up to the nearest multiple of `align`,
+    ///    must not overflow isize (i.e., the rounded value must be
+    ///    less than or equal to `isize::MAX`).
+    pub fn with_capacity_aligned(capacity: usize, align: usize) -> Self {
+        let layout = Layout::from_size_align(capacity, align).expect("Invalid layout");
+
+        // SAFETY:  Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
+        let ptr = unsafe {
+            let ptr = alloc::alloc(layout);
+            if ptr.is_null() {
+                alloc::handle_alloc_error(layout);
+            }
+            IoBufferPtr(ptr)
+        };
+
+        IoBufferMut {
+            ptr,
+            capacity,
+            len: 0,
+            align,
+        }
+    }
+
+
+    /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
+    pub fn with_capacity_aligned_zeroed(capacity: usize, align: usize) -> Self {
+        use bytes::BufMut;
+        let mut buf = Self::with_capacity_aligned(capacity, align);
+        buf.put_bytes(0, capacity);
+        buf.len = capacity;
+        buf
+    }
+
+    /// Returns the total number of bytes the buffer can hold.
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    /// Returns the alignment of the buffer.
+    #[inline]
+    pub fn align(&self) -> usize {
+        self.align
+    }
+
+    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Force the length of the buffer to `new_len`.
+    #[inline]
+    unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+        self.len = new_len;
+    }
+
+    #[inline]
+    fn as_ptr(&self) -> *const u8 {
+        self.ptr.0
+    }
+
+    #[inline]
+    fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr.0
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    fn as_slice(&self) -> &[u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+
+    /// Extracts a mutable slice of the entire buffer.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+
+    /// Drops the all the contents of the buffer, setting its length to `0`.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Reserves capacity for at least `additional` more bytes to be inserted
+    /// in the given `IoBufferMut`. The collection may reserve more space to
+    /// speculatively avoid frequent reallocations. After calling `reserve`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn reserve(&mut self, additional: usize) {
+        if additional > self.capacity() - self.len() {
+            self.reserve_inner(additional);
+        }
+    }
+
+    fn reserve_inner(&mut self, additional: usize) {
+        let Some(required_cap) = self.len().checked_add(additional) else {
+            capacity_overflow()
+        };
+
+        let old_capacity = self.capacity();
+        let align = self.align();
+        // This guarantees exponential growth. The doubling cannot overflow
+        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
+        let cap = cmp::max(old_capacity * 2, required_cap);
+
+        if !is_valid_alloc(cap) {
+            capacity_overflow()
+        }
+        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
+
+        let old_ptr = self.as_mut_ptr();
+
+        // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
+        // and we panics on null pointer.
+        let (ptr, cap) = unsafe {
+            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
+            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
+            if ptr.is_null() {
+                alloc::handle_alloc_error(new_layout);
+            }
+            (IoBufferPtr(ptr), cap)
+        };
+
+        self.ptr = ptr;
+        self.capacity = cap;
+    }
+
+
+    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
+    pub fn leak<'a>(self) -> &'a mut [u8] {
+        let mut buf = ManuallyDrop::new(self);
+        // SAFETY: leaking the buffer as intended.
+        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
+    }
+}
+
+fn capacity_overflow() -> ! {
+    panic!("capacity overflow")
+}
+
+// We need to guarantee the following:
+// * We don't ever allocate `> isize::MAX` byte-size objects.
+// * We don't overflow `usize::MAX` and actually allocate too little.
+//
+// On 64-bit we just need to check for overflow since trying to allocate
+// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
+// an extra guard for this in case we're running on a platform which can use
+// all 4GB in user-space, e.g., PAE or x32.
+#[inline]
+fn is_valid_alloc(alloc_size: usize) -> bool {
+    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
+}
+
+impl Drop for IoBufferMut {
+    fn drop(&mut self) {
+        // SAFETY: memory was allocated with std::alloc::alloc with the same layout.
+        unsafe {
+            alloc::dealloc(
+                self.as_mut_ptr(),
+                Layout::from_size_align_unchecked(self.capacity, self.align),
+            )
+        }
+    }
+}
+
+impl Deref for IoBufferMut {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl DerefMut for IoBufferMut {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.as_mut_slice()
+    }
+}
+
+/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized.
+unsafe impl bytes::BufMut for IoBufferMut {
+    #[inline]
+    fn remaining_mut(&self) -> usize {
+        // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`.
+        // Thus, it can have at most `self.capacity` bytes.
+        self.capacity() - self.len()
+    }
+
+    // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
+    #[inline]
+    unsafe fn advance_mut(&mut self, cnt: usize) {
+        let len: usize = self.len();
+        let remaining = self.remaining_mut();
+
+        if remaining < cnt {
+            panic_advance(cnt, remaining);
+        }
+
+        // Addition will not overflow since the sum is at most the capacity.
+        self.set_len(len + cnt);
+    }
+
+    #[inline]
+    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
+        let cap = self.capacity();
+        let len = self.len();
+
+        // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
+        // valid for `cap - len` bytes. The subtraction will not underflow since
+        // `len <= cap`.
+        unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) }
+    }
+}
+
+/// Panic with a nice error message.
+#[cold]
+fn panic_advance(idx: usize, len: usize) -> ! {
+    panic!(
+        "advance out of bounds: the len is {} but advancing by {}",
+        len, idx
+    );
+}
+
+/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer,
+/// and the location remains stable even if [`Self`] is moved.
+unsafe impl tokio_epoll_uring::IoBuf for IoBufferMut {
+    fn stable_ptr(&self) -> *const u8 {
+        self.as_ptr()
+    }
+
+    fn bytes_init(&self) -> usize {
+        self.len()
+    }
+
+    fn bytes_total(&self) -> usize {
+        self.capacity()
+    }
+}
+
+// SAFETY: See above.
+unsafe impl tokio_epoll_uring::IoBufMut for IoBufferMut {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, init_len: usize) {
+        if self.len() < init_len {
+            self.set_len(init_len);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_with_capacity_aligned() {
+        const ALIGN: usize = 4 * 1024;
+        let v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+
+        let v = IoBufferMut::with_capacity_aligned(ALIGN / 2, ALIGN);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN / 2);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    fn test_with_capacity_aligned_zeroed() {
+        const ALIGN: usize = 4 * 1024;
+        let v = IoBufferMut::with_capacity_aligned_zeroed(ALIGN, ALIGN);
+        assert_eq!(v.len(), ALIGN);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+        assert_eq!(&v[..], &[0; ALIGN])
+    }
+
+    #[test]
+    fn test_reserve() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
+        let capacity = v.capacity();
+        v.reserve(capacity);
+        assert_eq!(v.capacity(), capacity);
+        let data = [b'a'; ALIGN];
+        v.put(&data[..]);
+        v.reserve(capacity);
+        assert!(v.capacity() >= capacity * 2);
+        assert_eq!(&v[..], &data[..]);
+        let capacity = v.capacity();
+        v.clear();
+        v.reserve(capacity);
+        assert_eq!(capacity, v.capacity());
+    }
+
+    #[test]
+    fn test_bytes_put() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            for _ in 0..4 {
+                v.put(&x[..]);
+            }
+            assert_eq!(v.len(), ALIGN * 4);
+            assert_eq!(v.capacity(), ALIGN * 4);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bytes_put_panic() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        let x = [b'a'; ALIGN];
+        for _ in 0..5 {
+            v.put_slice(&x[..]);
+        }
+    }
+
+    #[test]
+    fn test_io_buf_put_slice() {
+        use tokio_epoll_uring::BoundedBufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            v.put_slice(&x[..]);
+            assert_eq!(v.len(), ALIGN);
+            assert_eq!(v.capacity(), ALIGN);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -0,0 +1,9 @@
+#![allow(unused)]
+
+use tokio_epoll_uring::IoBufMut;
+
+use crate::virtual_file::dio::IoBufferMut;
+
+pub(crate) trait IoBufAlignedMut: IoBufMut {}
+
+impl IoBufAlignedMut for IoBufferMut {}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -1,5 +1,6 @@
 //! See [`FullSlice`].

+use crate::virtual_file::dio::IoBufferMut;
 use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
@@ -76,3 +77,4 @@ macro_rules! impl_io_buf_ext {
 impl_io_buf_ext!(Bytes);
 impl_io_buf_ext!(BytesMut);
 impl_io_buf_ext!(Vec<u8>);
+impl_io_buf_ext!(IoBufferMut);
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1473,11 +1473,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
 {
 	NeonWALReadResult res;

-	res = NeonWALRead(sk->xlogreader,
-					  buf,
-					  startptr,
-					  count,
-					  walprop_pg_get_timeline_id());
+#if PG_MAJORVERSION_NUM >= 17
+	if (!sk->wp->config->syncSafekeepers)
+	{
+		Size	rbytes;
+		rbytes = WALReadFromBuffers(buf, startptr, count,
+									walprop_pg_get_timeline_id());
+
+		startptr += rbytes;
+		count -= rbytes;
+	}
+#endif
+
+	if (count == 0)
+	{
+		res = NEON_WALREAD_SUCCESS;
+	}
+	else
+	{
+		Assert(count > 0);
+
+		/* Now read the remaining WAL from the WAL file */
+		res = NeonWALRead(sk->xlogreader,
+						  buf,
+						  startptr,
+						  count,
+						  walprop_pg_get_timeline_id());
+	}

 	if (res == NEON_WALREAD_SUCCESS)
 	{
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -24,6 +24,7 @@ bytes = { workspace = true, features = ["serde"] }
 camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
+compute_api.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -80,6 +80,14 @@ pub(crate) trait TestBackend: Send + Sync + 'static {
    fn get_allowed_ips_and_secret(
        &self,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
+    fn dyn_clone(&self) -> Box<dyn TestBackend>;
+}
+
+#[cfg(test)]
+impl Clone for Box<dyn TestBackend> {
+    fn clone(&self) -> Self {
+        TestBackend::dyn_clone(&**self)
+    }
 }

 impl std::fmt::Display for Backend<'_, (), ()> {
@@ -585,6 +593,14 @@ mod tests {
            ))
        }

+        async fn get_endpoint_jwks(
+            &self,
+            _ctx: &RequestMonitoring,
+            _endpoint: crate::EndpointId,
+        ) -> anyhow::Result<Vec<super::jwt::AuthRule>> {
+            unimplemented!()
+        }
+
        async fn wake_compute(
            &self,
            _ctx: &RequestMonitoring,
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -12,7 +12,10 @@ use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;

-use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
+use crate::{
+    context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId,
+    RoleName,
+};

 // TODO(conrad): make these configurable.
 const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
@@ -27,7 +30,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
        &self,
        ctx: &RequestMonitoring,
        endpoint: EndpointId,
-        role_name: RoleName,
    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }

@@ -35,10 +37,11 @@ pub(crate) struct AuthRule {
    pub(crate) id: String,
    pub(crate) jwks_url: url::Url,
    pub(crate) audience: Option<String>,
+    pub(crate) role_names: Vec<RoleNameInt>,
 }

 #[derive(Default)]
-pub(crate) struct JwkCache {
+pub struct JwkCache {
    client: reqwest::Client,

    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
@@ -54,18 +57,28 @@ pub(crate) struct JwkCacheEntry {
 }

 impl JwkCacheEntry {
-    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
-        self.key_sets.values().find_map(|key_set| {
-            key_set
-                .find_key(key_id)
-                .map(|jwk| (jwk, key_set.audience.as_deref()))
-        })
+    fn find_jwk_and_audience(
+        &self,
+        key_id: &str,
+        role_name: &RoleName,
+    ) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
+        self.key_sets
+            .values()
+            // make sure our requested role has access to the key set
+            .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name))
+            // try and find the requested key-id in the key set
+            .find_map(|key_set| {
+                key_set
+                    .find_key(key_id)
+                    .map(|jwk| (jwk, key_set.audience.as_deref()))
+            })
    }
 }

 struct KeySet {
    jwks: jose_jwk::JwkSet,
    audience: Option<String>,
+    role_names: Vec<RoleNameInt>,
 }

 impl KeySet {
@@ -106,7 +119,6 @@ impl JwkCacheEntryLock {
        ctx: &RequestMonitoring,
        client: &reqwest::Client,
        endpoint: EndpointId,
-        role_name: RoleName,
        auth_rules: &F,
    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
        // double check that no one beat us to updating the cache.
@@ -119,11 +131,10 @@ impl JwkCacheEntryLock {
            }
        }

-        let rules = auth_rules
-            .fetch_auth_rules(ctx, endpoint, role_name)
-            .await?;
+        let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?;
        let mut key_sets =
            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
+
        // TODO(conrad): run concurrently
        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
        for rule in rules {
@@ -151,6 +162,7 @@ impl JwkCacheEntryLock {
                                KeySet {
                                    jwks,
                                    audience: rule.audience,
+                                    role_names: rule.role_names,
                                },
                            );
                        }
@@ -173,7 +185,6 @@ impl JwkCacheEntryLock {
        ctx: &RequestMonitoring,
        client: &reqwest::Client,
        endpoint: EndpointId,
-        role_name: RoleName,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
        let now = Instant::now();
@@ -183,9 +194,7 @@ impl JwkCacheEntryLock {
        let Some(cached) = guard else {
            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
        };

        let last_update = now.duration_since(cached.last_retrieved);
@@ -196,9 +205,7 @@ impl JwkCacheEntryLock {
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
@@ -212,7 +219,7 @@ impl JwkCacheEntryLock {
                let ctx = ctx.clone();
                tokio::spawn(async move {
                    if let Err(e) = entry
-                        .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
+                        .renew_jwks(permit, &ctx, &client, endpoint, &fetch)
                        .await
                    {
                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
@@ -232,7 +239,7 @@ impl JwkCacheEntryLock {
        jwt: &str,
        client: &reqwest::Client,
        endpoint: EndpointId,
-        role_name: RoleName,
+        role_name: &RoleName,
        fetch: &F,
    ) -> Result<(), anyhow::Error> {
        // JWT compact form is defined to be
@@ -254,30 +261,26 @@ impl JwkCacheEntryLock {
        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;

-        ensure!(header.typ == "JWT");
+        ensure!(
+            header.typ == "JWT",
+            "Provided authentication token is not a valid JWT encoding"
+        );
        let kid = header.key_id.context("missing key id")?;

        let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
+            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
            .await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
        let (jwk, expected_audience) = loop {
-            match guard.find_jwk_and_audience(kid) {
+            match guard.find_jwk_and_audience(kid, role_name) {
                Some(jwk) => break jwk,
                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);

                    let permit = self.acquire_permit().await;
                    guard = self
-                        .renew_jwks(
-                            permit,
-                            ctx,
-                            client,
-                            endpoint.clone(),
-                            role_name.clone(),
-                            fetch,
-                        )
+                        .renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
                        .await?;
                }
                _ => {
@@ -320,11 +323,14 @@ impl JwkCacheEntryLock {
        let now = SystemTime::now();

        if let Some(exp) = payload.expiration {
-            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
+            ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired");
        }

        if let Some(nbf) = payload.not_before {
-            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
+            ensure!(
+                nbf < now + CLOCK_SKEW_LEEWAY,
+                "JWT token is not yet ready to use"
+            );
        }

        Ok(())
@@ -336,7 +342,7 @@ impl JwkCache {
        &self,
        ctx: &RequestMonitoring,
        endpoint: EndpointId,
-        role_name: RoleName,
+        role_name: &RoleName,
        fetch: &F,
        jwt: &str,
    ) -> Result<(), anyhow::Error> {
@@ -572,7 +578,7 @@ mod tests {
        format!("{header}.{body}")
    }

-    fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
+    fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String {
        use p256::ecdsa::{Signature, SigningKey};

        let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
@@ -660,11 +666,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        let (ec1, jwk3) = new_ec_jwk("3".into());
        let (ec2, jwk4) = new_ec_jwk("4".into());

-        let jwt1 = new_rsa_jwt("1".into(), rs1);
-        let jwt2 = new_rsa_jwt("2".into(), rs2);
-        let jwt3 = new_ec_jwt("3".into(), ec1);
-        let jwt4 = new_ec_jwt("4".into(), ec2);
-
        let foo_jwks = jose_jwk::JwkSet {
            keys: vec![jwk1, jwk3],
        };
@@ -706,47 +707,98 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        let client = reqwest::Client::new();

        #[derive(Clone)]
-        struct Fetch(SocketAddr);
+        struct Fetch(SocketAddr, Vec<RoleNameInt>);

        impl FetchAuthRules for Fetch {
            async fn fetch_auth_rules(
                &self,
                _ctx: &RequestMonitoring,
                _endpoint: EndpointId,
-                _role_name: RoleName,
            ) -> anyhow::Result<Vec<AuthRule>> {
                Ok(vec![
                    AuthRule {
                        id: "foo".to_owned(),
                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
                        audience: None,
+                        role_names: self.1.clone(),
                    },
                    AuthRule {
                        id: "bar".to_owned(),
                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
                        audience: None,
+                        role_names: self.1.clone(),
                    },
                ])
            }
        }

-        let role_name = RoleName::from("user");
+        let role_name1 = RoleName::from("anonymous");
+        let role_name2 = RoleName::from("authenticated");
+
+        let fetch = Fetch(
+            addr,
+            vec![
+                RoleNameInt::from(&role_name1),
+                RoleNameInt::from(&role_name2),
+            ],
+        );
+
        let endpoint = EndpointId::from("ep");

        let jwk_cache = Arc::new(JwkCacheEntryLock::default());

-        for token in [jwt1, jwt2, jwt3, jwt4] {
-            jwk_cache
-                .check_jwt(
-                    &RequestMonitoring::test(),
-                    &token,
-                    &client,
-                    endpoint.clone(),
-                    role_name.clone(),
-                    &Fetch(addr),
-                )
-                .await
-                .unwrap();
+        let jwt1 = new_rsa_jwt("1".into(), rs1);
+        let jwt2 = new_rsa_jwt("2".into(), rs2);
+        let jwt3 = new_ec_jwt("3".into(), &ec1);
+        let jwt4 = new_ec_jwt("4".into(), &ec2);
+
+        // had the wrong kid, therefore will have the wrong ecdsa signature
+        let bad_jwt = new_ec_jwt("3".into(), &ec2);
+        // this role_name is not accepted
+        let bad_role_name = RoleName::from("cloud_admin");
+
+        let err = jwk_cache
+            .check_jwt(
+                &RequestMonitoring::test(),
+                &bad_jwt,
+                &client,
+                endpoint.clone(),
+                &role_name1,
+                &fetch,
+            )
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("signature error"));
+
+        let err = jwk_cache
+            .check_jwt(
+                &RequestMonitoring::test(),
+                &jwt1,
+                &client,
+                endpoint.clone(),
+                &bad_role_name,
+                &fetch,
+            )
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("jwk not found"));
+
+        let tokens = [jwt1, jwt2, jwt3, jwt4];
+        let role_names = [role_name1, role_name2];
+        for role in &role_names {
+            for token in &tokens {
+                jwk_cache
+                    .check_jwt(
+                        &RequestMonitoring::test(),
+                        token,
+                        &client,
+                        endpoint.clone(),
+                        role,
+                        &fetch,
+                    )
+                    .await
+                    .unwrap();
+            }
        }
    }
 }
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, net::SocketAddr};
+use std::net::SocketAddr;

 use anyhow::Context;
 use arc_swap::ArcSwapOption;
@@ -10,8 +10,8 @@ use crate::{
        NodeInfo,
    },
    context::RequestMonitoring,
-    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    EndpointId, RoleName,
+    intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag},
+    EndpointId,
 };

 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -48,26 +48,17 @@ impl LocalBackend {
 #[derive(Clone, Copy)]
 pub(crate) struct StaticAuthRules;

-pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
-
-#[derive(Debug, Clone)]
-pub struct JwksRoleSettings {
-    pub roles: HashMap<RoleName, EndpointJwksResponse>,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
-}
+pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::const_empty();

 impl FetchAuthRules for StaticAuthRules {
    async fn fetch_auth_rules(
        &self,
        _ctx: &RequestMonitoring,
        _endpoint: EndpointId,
-        role_name: RoleName,
    ) -> anyhow::Result<Vec<AuthRule>> {
        let mappings = JWKS_ROLE_MAP.load();
        let role_mappings = mappings
            .as_deref()
-            .and_then(|m| m.roles.get(&role_name))
            .context("JWKs settings for this role were not configured")?;
        let mut rules = vec![];
        for setting in &role_mappings.jwks {
@@ -75,6 +66,7 @@ impl FetchAuthRules for StaticAuthRules {
                id: setting.id.clone(),
                jwks_url: setting.jwks_url.clone(),
                audience: setting.jwt_audience.clone(),
+                role_names: setting.role_names.clone(),
            });
        }

--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -1,34 +1,35 @@
-use std::{
-    net::SocketAddr,
-    path::{Path, PathBuf},
-    pin::pin,
-    sync::Arc,
-    time::Duration,
-};
+use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration};

-use anyhow::{bail, ensure};
+use anyhow::{bail, ensure, Context};
+use camino::{Utf8Path, Utf8PathBuf};
+use compute_api::spec::LocalProxySpec;
 use dashmap::DashMap;
-use futures::{future::Either, FutureExt};
+use futures::future::Either;
 use proxy::{
-    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
+    auth::backend::local::{LocalBackend, JWKS_ROLE_MAP},
    cancellation::CancellationHandlerMain,
    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
-    console::{locks::ApiLocks, messages::JwksRoleMapping},
+    console::{
+        locks::ApiLocks,
+        messages::{EndpointJwksResponse, JwksSettings},
+    },
    http::health_server::AppMetrics,
+    intern::RoleNameInt,
    metrics::{Metrics, ThreadPoolMetrics},
    rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
    scram::threadpool::ThreadPool,
    serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
+    RoleName,
 };

 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

 use clap::Parser;
-use tokio::{net::TcpListener, task::JoinSet};
+use tokio::{net::TcpListener, sync::Notify, task::JoinSet};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
-use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
+use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry};

 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
@@ -72,9 +73,12 @@ struct LocalProxyCliArgs {
    /// Address of the postgres server
    #[clap(long, default_value = "127.0.0.1:5432")]
    compute: SocketAddr,
-    /// File address of the local proxy config file
+    /// Path of the local proxy config file
    #[clap(long, default_value = "./localproxy.json")]
-    config_path: PathBuf,
+    config_path: Utf8PathBuf,
+    /// Path of the local proxy PID file
+    #[clap(long, default_value = "./localproxy.pid")]
+    pid_path: Utf8PathBuf,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -126,6 +130,24 @@ async fn main() -> anyhow::Result<()> {
    let args = LocalProxyCliArgs::parse();
    let config = build_config(&args)?;

+    // before we bind to any ports, write the process ID to a file
+    // so that compute-ctl can find our process later
+    // in order to trigger the appropriate SIGHUP on config change.
+    //
+    // This also claims a "lock" that makes sure only one instance
+    // of local-proxy runs at a time.
+    let _process_guard = loop {
+        match pid_file::claim_for_current_process(&args.pid_path) {
+            Ok(guard) => break guard,
+            Err(e) => {
+                // compute-ctl might have tried to read the pid-file to let us
+                // know about some config change. We should try again.
+                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
+                tokio::time::sleep(Duration::from_secs(1)).await;
+            }
+        }
+    };
+
    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
    let http_listener = TcpListener::bind(args.http).await?;
    let shutdown = CancellationToken::new();
@@ -139,12 +161,30 @@ async fn main() -> anyhow::Result<()> {
        16,
    ));

-    refresh_config(args.config_path.clone()).await;
+    // write the process ID to a file so that compute-ctl can find our process later
+    // in order to trigger the appropriate SIGHUP on config change.
+    let pid = std::process::id();
+    info!("process running in PID {pid}");
+    std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?;

    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
-        refresh_config(args.config_path.clone()).map(Ok)
+
+    let refresh_config_notify = Arc::new(Notify::new());
+    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
+        let refresh_config_notify = Arc::clone(&refresh_config_notify);
+        move || {
+            refresh_config_notify.notify_one();
+        }
    }));
+
+    // trigger the first config load **after** setting up the signal hook
+    // to avoid the race condition where:
+    // 1. No config file registered when local-proxy starts up
+    // 2. The config file is written but the signal hook is not yet received
+    // 3. local-proxy completes startup but has no config loaded, despite there being a registerd config.
+    refresh_config_notify.notify_one();
+    tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));
+
    maintenance_tasks.spawn(proxy::http::health_server::task_main(
        metrics_listener,
        AppMetrics {
@@ -245,81 +285,84 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
    })))
 }

-async fn refresh_config(path: PathBuf) {
-    match refresh_config_inner(&path).await {
-        Ok(()) => {}
-        Err(e) => {
-            error!(error=?e, ?path, "could not read config file");
+async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
+    loop {
+        rx.notified().await;
+
+        match refresh_config_inner(&path).await {
+            Ok(()) => {}
+            Err(e) => {
+                error!(error=?e, ?path, "could not read config file");
+            }
        }
    }
 }

-async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
+async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> {
    let bytes = tokio::fs::read(&path).await?;
-    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
+    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;

-    let mut settings = None;
+    let mut jwks_set = vec![];

-    for mapping in data.roles.values_mut() {
-        for jwks in &mut mapping.jwks {
-            ensure!(
-                jwks.jwks_url.has_authority()
-                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
-                "Invalid JWKS url. Must be HTTP",
-            );
+    for jwks in data.jwks {
+        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;

-            ensure!(
-                jwks.jwks_url
-                    .host()
-                    .is_some_and(|h| h != url::Host::Domain("")),
-                "Invalid JWKS url. No domain listed",
-            );
+        ensure!(
+            jwks_url.has_authority()
+                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
+            "Invalid JWKS url. Must be HTTP",
+        );

-            // clear username, password and ports
-            jwks.jwks_url.set_username("").expect(
+        ensure!(
+            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
+            "Invalid JWKS url. No domain listed",
+        );
+
+        // clear username, password and ports
+        jwks_url
+            .set_username("")
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        jwks_url
+            .set_password(None)
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        // local testing is hard if we need to have a specific restricted port
+        if cfg!(not(feature = "testing")) {
+            jwks_url.set_port(None).expect(
                "url can be a base and has a valid host and is not a file. should not error",
            );
-            jwks.jwks_url.set_password(None).expect(
-                "url can be a base and has a valid host and is not a file. should not error",
-            );
-            // local testing is hard if we need to have a specific restricted port
-            if cfg!(not(feature = "testing")) {
-                jwks.jwks_url.set_port(None).expect(
-                    "url can be a base and has a valid host and is not a file. should not error",
-                );
-            }
-
-            // clear query params
-            jwks.jwks_url.set_fragment(None);
-            jwks.jwks_url.query_pairs_mut().clear().finish();
-
-            if jwks.jwks_url.scheme() != "https" {
-                // local testing is hard if we need to set up https support.
-                if cfg!(not(feature = "testing")) {
-                    jwks.jwks_url
-                        .set_scheme("https")
-                        .expect("should not error to set the scheme to https if it was http");
-                } else {
-                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
-                }
-            }
-
-            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
-            ensure!(
-                *pr == jwks.project_id,
-                "inconsistent project IDs configured"
-            );
-            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
        }
+
+        // clear query params
+        jwks_url.set_fragment(None);
+        jwks_url.query_pairs_mut().clear().finish();
+
+        if jwks_url.scheme() != "https" {
+            // local testing is hard if we need to set up https support.
+            if cfg!(not(feature = "testing")) {
+                jwks_url
+                    .set_scheme("https")
+                    .expect("should not error to set the scheme to https if it was http");
+            } else {
+                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
+            }
+        }
+
+        jwks_set.push(JwksSettings {
+            id: jwks.id,
+            jwks_url,
+            provider_name: jwks.provider_name,
+            jwt_audience: jwks.jwt_audience,
+            role_names: jwks
+                .role_names
+                .into_iter()
+                .map(RoleName::from)
+                .map(|s| RoleNameInt::from(&s))
+                .collect(),
+        })
    }

-    if let Some((project_id, branch_id)) = settings {
-        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
-            roles: data.roles,
-            project_id,
-            branch_id,
-        })));
-    }
+    info!("successfully loaded new config");
+    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));

    Ok(())
 }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> {
        proxy_listener,
        cancellation_token.clone(),
    ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
-        Ok(())
-    }));
+    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));

    // the signal task cant ever succeed.
    // the main task can error, or can succeed on cancellation.
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -461,10 +461,7 @@ async fn main() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(
-        cancellation_token.clone(),
-        || async { Ok(()) },
-    ));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {}));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,13 +1,11 @@
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
 use std::fmt::{self, Display};

 use crate::auth::IpPattern;

-use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
+use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::proxy::retry::CouldRetry;
-use crate::RoleName;

 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -348,11 +346,6 @@ impl ColdStartInfo {
    }
 }

-#[derive(Debug, Deserialize, Clone)]
-pub struct JwksRoleMapping {
-    pub roles: HashMap<RoleName, EndpointJwksResponse>,
-}
-
 #[derive(Debug, Deserialize, Clone)]
 pub struct EndpointJwksResponse {
    pub jwks: Vec<JwksSettings>,
@@ -361,11 +354,10 @@ pub struct EndpointJwksResponse {
 #[derive(Debug, Deserialize, Clone)]
 pub struct JwksSettings {
    pub id: String,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
    pub jwks_url: url::Url,
    pub provider_name: String,
    pub jwt_audience: Option<String>,
+    pub role_names: Vec<RoleNameInt>,
 }

 #[cfg(test)]
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -5,7 +5,10 @@ pub mod neon;
 use super::messages::{ConsoleError, MetricsAuxInfo};
 use crate::{
    auth::{
-        backend::{ComputeCredentialKeys, ComputeUserInfo},
+        backend::{
+            jwt::{AuthRule, FetchAuthRules},
+            ComputeCredentialKeys, ComputeUserInfo,
+        },
        IpPattern,
    },
    cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru},
@@ -16,7 +19,7 @@ use crate::{
    intern::ProjectIdInt,
    metrics::ApiLockMetrics,
    rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
-    scram, EndpointCacheKey,
+    scram, EndpointCacheKey, EndpointId,
 };
 use dashmap::DashMap;
 use std::{hash::Hash, sync::Arc, time::Duration};
@@ -334,6 +337,12 @@ pub(crate) trait Api {
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;

+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>>;
+
    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
@@ -343,6 +352,7 @@ pub(crate) trait Api {
 }

 #[non_exhaustive]
+#[derive(Clone)]
 pub enum ConsoleBackend {
    /// Current Cloud API (V2).
    Console(neon::Api),
@@ -386,6 +396,20 @@ impl Api for ConsoleBackend {
        }
    }

+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        match self {
+            Self::Console(api) => api.get_endpoint_jwks(ctx, endpoint).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::Postgres(api) => api.get_endpoint_jwks(ctx, endpoint).await,
+            #[cfg(test)]
+            Self::Test(_api) => Ok(vec![]),
+        }
+    }
+
    async fn wake_compute(
        &self,
        ctx: &RequestMonitoring,
@@ -552,3 +576,13 @@ impl WakeComputePermit {
        res
    }
 }
+
+impl FetchAuthRules for ConsoleBackend {
+    async fn fetch_auth_rules(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.get_endpoint_jwks(ctx, endpoint).await
+    }
+}
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,7 +4,9 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::context::RequestMonitoring;
+use crate::{
+    auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName,
+};
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
 use crate::{
@@ -118,6 +120,39 @@ impl Api {
        })
    }

+    async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result<Vec<AuthRule>> {
+        let (client, connection) =
+            tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
+
+        let connection = tokio::spawn(connection);
+
+        let res = client.query(
+                "select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1",
+                &[&endpoint.as_str()],
+            )
+            .await?;
+
+        let mut rows = vec![];
+        for row in res {
+            rows.push(AuthRule {
+                id: row.get("id"),
+                jwks_url: url::Url::parse(row.get("jwks_url"))?,
+                audience: row.get("audience"),
+                role_names: row
+                    .get::<_, Vec<String>>("role_names")
+                    .into_iter()
+                    .map(RoleName::from)
+                    .map(|s| RoleNameInt::from(&s))
+                    .collect(),
+            });
+        }
+
+        drop(client);
+        connection.await??;
+
+        Ok(rows)
+    }
+
    async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
        let mut config = compute::ConnCfg::new();
        config
@@ -185,6 +220,14 @@ impl super::Api for Api {
        ))
    }

+    async fn get_endpoint_jwks(
+        &self,
+        _ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.do_get_endpoint_jwks(endpoint).await
+    }
+
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -7,27 +7,33 @@ use super::{
    NodeInfo,
 };
 use crate::{
-    auth::backend::ComputeUserInfo,
+    auth::backend::{jwt::AuthRule, ComputeUserInfo},
    compute,
-    console::messages::{ColdStartInfo, Reason},
+    console::messages::{ColdStartInfo, EndpointJwksResponse, Reason},
    http,
    metrics::{CacheOutcome, Metrics},
    rate_limiter::WakeComputeRateLimiter,
-    scram, EndpointCacheKey,
+    scram, EndpointCacheKey, EndpointId,
 };
 use crate::{cache::Cached, context::RequestMonitoring};
+use ::http::{header::AUTHORIZATION, HeaderName};
+use anyhow::bail;
 use futures::TryFutureExt;
 use std::{sync::Arc, time::Duration};
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{debug, error, info, info_span, warn, Instrument};

+const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
+
+#[derive(Clone)]
 pub struct Api {
    endpoint: http::Endpoint,
    pub caches: &'static ApiCaches,
    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    jwt: String,
+    // put in a shared ref so we don't copy secrets all over in memory
+    jwt: Arc<str>,
 }

 impl Api {
@@ -38,7 +44,9 @@ impl Api {
        locks: &'static ApiLocks<EndpointCacheKey>,
        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
    ) -> Self {
-        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
+        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN")
+            .unwrap_or_default()
+            .into();
        Self {
            endpoint,
            caches,
@@ -71,9 +79,9 @@ impl Api {
        async {
            let request = self
                .endpoint
-                .get("proxy_get_role_secret")
-                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
+                .get_path("proxy_get_role_secret")
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
                .query(&[("session_id", ctx.session_id())])
                .query(&[
                    ("application_name", application_name.as_str()),
@@ -125,6 +133,61 @@ impl Api {
        .await
    }

+    async fn do_get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        if !self
+            .caches
+            .endpoints_cache
+            .is_valid(ctx, &endpoint.normalize())
+            .await
+        {
+            bail!("endpoint not found");
+        }
+        let request_id = ctx.session_id().to_string();
+        async {
+            let request = self
+                .endpoint
+                .get_with_url(|url| {
+                    url.path_segments_mut()
+                        .push("endpoints")
+                        .push(endpoint.as_str())
+                        .push("jwks");
+                })
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", ctx.session_id())])
+                .build()?;
+
+            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+            let response = self.endpoint.execute(request).await?;
+            drop(pause);
+            info!(duration = ?start.elapsed(), "received http response");
+
+            let body = parse_body::<EndpointJwksResponse>(response).await?;
+
+            let rules = body
+                .jwks
+                .into_iter()
+                .map(|jwks| AuthRule {
+                    id: jwks.id,
+                    jwks_url: jwks.jwks_url,
+                    audience: jwks.jwt_audience,
+                    role_names: jwks.role_names,
+                })
+                .collect();
+
+            Ok(rules)
+        }
+        .map_err(crate::error::log_error)
+        .instrument(info_span!("http", id = request_id))
+        .await
+    }
+
    async fn do_wake_compute(
        &self,
        ctx: &RequestMonitoring,
@@ -135,7 +198,7 @@ impl Api {
        async {
            let mut request_builder = self
                .endpoint
-                .get("proxy_wake_compute")
+                .get_path("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", ctx.session_id())])
@@ -262,6 +325,15 @@ impl super::Api for Api {
        ))
    }

+    #[tracing::instrument(skip_all)]
+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+    ) -> anyhow::Result<Vec<AuthRule>> {
+        self.do_get_endpoint_jwks(ctx, endpoint).await
+    }
+
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -86,9 +86,17 @@ impl Endpoint {

    /// Return a [builder](RequestBuilder) for a `GET` request,
    /// appending a single `path` segment to the base endpoint URL.
-    pub(crate) fn get(&self, path: &str) -> RequestBuilder {
+    pub(crate) fn get_path(&self, path: &str) -> RequestBuilder {
+        self.get_with_url(|u| {
+            u.path_segments_mut().push(path);
+        })
+    }
+
+    /// Return a [builder](RequestBuilder) for a `GET` request,
+    /// accepting a closure to modify the url path segments for more complex paths queries.
+    pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder {
        let mut url = self.endpoint.clone();
-        url.path_segments_mut().push(path);
+        f(&mut url);
        self.client.get(url.into_inner())
    }

@@ -144,7 +152,7 @@ mod tests {

        // Validate that this pattern makes sense.
        let req = endpoint
-            .get("frobnicate")
+            .get_path("frobnicate")
            .query(&[
                ("foo", Some("10")), // should be just `foo=10`
                ("bar", None),       // shouldn't be passed at all
@@ -162,7 +170,7 @@ mod tests {
        let endpoint = Endpoint::new(url, Client::new());

        let req = endpoint
-            .get("frobnicate")
+            .get_path("frobnicate")
            .query(&[("session_id", uuid::Uuid::nil())])
            .build()?;

--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -130,14 +130,14 @@ impl<Id: InternId> Default for StringInterner<Id> {
 }

 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub(crate) struct RoleNameTag;
+pub struct RoleNameTag;
 impl InternId for RoleNameTag {
    fn get_interner() -> &'static StringInterner<Self> {
        static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
        ROLE_NAMES.get_or_init(Default::default)
    }
 }
-pub(crate) type RoleNameInt = InternedString<RoleNameTag>;
+pub type RoleNameInt = InternedString<RoleNameTag>;
 impl From<&RoleName> for RoleNameInt {
    fn from(value: &RoleName) -> Self {
        RoleNameTag::get_interner().get_or_intern(value)
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -82,7 +82,7 @@
    impl_trait_overcaptures,
 )]

-use std::{convert::Infallible, future::Future};
+use std::convert::Infallible;

 use anyhow::{bail, Context};
 use intern::{EndpointIdInt, EndpointIdTag, InternId};
@@ -117,13 +117,12 @@ pub mod usage_metrics;
 pub mod waiters;

 /// Handle unix signals appropriately.
-pub async fn handle_signals<F, Fut>(
+pub async fn handle_signals<F>(
    token: CancellationToken,
    mut refresh_config: F,
 ) -> anyhow::Result<Infallible>
 where
-    F: FnMut() -> Fut,
-    Fut: Future<Output = anyhow::Result<()>>,
+    F: FnMut(),
 {
    use tokio::signal::unix::{signal, SignalKind};

@@ -136,7 +135,7 @@ where
            // Hangup is commonly used for config reload.
            _ = hangup.recv() => {
                warn!("received SIGHUP");
-                refresh_config().await?;
+                refresh_config();
            }
            // Shut down the whole application.
            _ = interrupt.recv() => {
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -525,6 +525,10 @@ impl TestBackend for TestConnectMechanism {
    {
        unimplemented!("not used in tests")
    }
+
+    fn dyn_clone(&self) -> Box<dyn TestBackend> {
+        Box::new(self.clone())
+    }
 }

 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -43,6 +43,13 @@ impl ThreadPool {
    pub fn new(n_workers: u8) -> Arc<Self> {
        // rayon would be nice here, but yielding in rayon does not work well afaict.

+        if n_workers == 0 {
+            return Arc::new(Self {
+                runtime: None,
+                metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+            });
+        }
+
        Arc::new_cyclic(|pool| {
            let pool = pool.clone();
            let worker_id = AtomicUsize::new(0);
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -119,7 +119,7 @@ impl PoolingBackend {
                    .check_jwt(
                        ctx,
                        user_info.endpoint.clone(),
-                        user_info.user.clone(),
+                        &user_info.user,
                        &StaticAuthRules,
                        jwt,
                    )
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -374,14 +374,16 @@ type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;

 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // fsync the datadir to make sure we have a consistent state on disk.
-    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
-    let started = Instant::now();
-    utils::crashsafe::syncfs(dfd)?;
-    let elapsed = started.elapsed();
-    info!(
-        elapsed_ms = elapsed.as_millis(),
-        "syncfs data directory done"
-    );
+    if !conf.no_sync {
+        let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
+        let started = Instant::now();
+        utils::crashsafe::syncfs(dfd)?;
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "syncfs data directory done"
+        );
+    }

    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
--- a/safekeeper/tests/random_test.rs
+++ b/safekeeper/tests/random_test.rs
@@ -9,7 +9,7 @@ use crate::walproposer_sim::{

 pub mod walproposer_sim;

-// Generates 2000 random seeds and runs a schedule for each of them.
+// Generates 500 random seeds and runs a schedule for each of them.
 // If you see this test fail, please report the last seed to the
 // @safekeeper team.
 #[test]
@@ -17,7 +17,7 @@ fn test_random_schedules() -> anyhow::Result<()> {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));

-    for _ in 0..2000 {
+    for _ in 0..500 {
        let seed: u64 = rand::thread_rng().gen();
        config.network = generate_network_opts(seed);

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -401,6 +401,7 @@ class NeonEnvBuilder:
        safekeeper_extra_opts: Optional[list[str]] = None,
        storage_controller_port_override: Optional[int] = None,
        pageserver_io_buffer_alignment: Optional[int] = None,
+        pageserver_virtual_file_io_mode: Optional[str] = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -455,6 +456,7 @@ class NeonEnvBuilder:
        self.storage_controller_port_override = storage_controller_port_override

        self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment
+        self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode

        assert test_name.startswith(
            "test_"
@@ -1028,6 +1030,7 @@ class NeonEnv:
        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
        self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
        self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment
+        self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode

        # Create the neon_local's `NeonLocalInitConf`
        cfg: Dict[str, Any] = {
@@ -1091,7 +1094,10 @@ class NeonEnv:
                        for key, value in override.items():
                            ps_cfg[key] = value

-            ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
+            if self.pageserver_io_buffer_alignment is not None:
+                ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
+            if self.pageserver_virtual_file_io_mode is not None:
+                ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode

            # Create a corresponding NeonPageserver object
            self.pageservers.append(
@@ -1330,6 +1336,7 @@ def neon_simple_env(
    pageserver_aux_file_policy: Optional[AuxFileStore],
    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
    pageserver_io_buffer_alignment: Optional[int],
+    pageserver_virtual_file_io_mode: Optional[str],
 ) -> Iterator[NeonEnv]:
    """
    Simple Neon environment, with no authentication and no safekeepers.
@@ -1356,6 +1363,7 @@ def neon_simple_env(
        pageserver_aux_file_policy=pageserver_aux_file_policy,
        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
+        pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
    ) as builder:
        env = builder.init_start()

@@ -1380,6 +1388,7 @@ def neon_env_builder(
    pageserver_aux_file_policy: Optional[AuxFileStore],
    record_property: Callable[[str, object], None],
    pageserver_io_buffer_alignment: Optional[int],
+    pageserver_virtual_file_io_mode: Optional[str],
 ) -> Iterator[NeonEnvBuilder]:
    """
    Fixture to create a Neon environment for test.
@@ -1415,6 +1424,7 @@ def neon_env_builder(
        pageserver_aux_file_policy=pageserver_aux_file_policy,
        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
+        pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
    ) as builder:
        yield builder
        # Propogate `preserve_database_files` to make it possible to use in other fixtures,
@@ -3311,7 +3321,7 @@ class VanillaPostgres(PgProtocol):
        self.pg_bin = pg_bin
        self.running = False
        if init:
-            self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
+            self.pg_bin.run_capture(["initdb", "--pgdata", str(pgdatadir)])
        self.configure([f"port = {port}\n"])

    def enable_tls(self):
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -39,6 +39,11 @@ def pageserver_io_buffer_alignment() -> Optional[int]:
    return None


+@pytest.fixture(scope="function", autouse=True)
+def pageserver_virtual_file_io_mode() -> Optional[str]:
+    return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
+
+
@pytest.fixture(scope="function", autouse=True)
 def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
    return None
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -8,6 +8,7 @@ import requests
 from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
+from fixtures.utils import wait_until


 # Walreceiver as returned by sk's timeline status endpoint.
@@ -161,6 +162,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
            walreceivers=walreceivers,
        )

+    # Get timeline_start_lsn, waiting until it's nonzero. It is a way to ensure
+    # that the timeline is fully initialized at the safekeeper.
+    def get_non_zero_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        def timeline_start_lsn_non_zero() -> Lsn:
+            s = self.timeline_status(tenant_id, timeline_id).timeline_start_lsn
+            assert s > Lsn(0)
+            return s
+
+        return wait_until(30, 1, timeline_start_lsn_non_zero)
+
    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
        return self.timeline_status(tenant_id, timeline_id).commit_lsn

--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -56,32 +56,20 @@ class Workload:
            with ENDPOINT_LOCK:
                self._endpoint.reconfigure()

-    def go_readonly(self):
-        self.stop()
-        self._endpoint = self.make_endpoint(readonly=True, pageserver_id=None)
-        self._endpoint.start(pageserver_id=None)
-
-    def make_endpoint(self, readonly: bool, pageserver_id: Optional[int] = None) -> Endpoint:
+    def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
        # We may be running alongside other Workloads for different tenants.  Full TTID is
        # obnoxiously long for use here, but a cut-down version is still unique enough for tests.
        endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"

-        if readonly:
-            self._endpoint_opts["hot_standby"] = True
-
-        return self.env.endpoints.create(
-            self.branch_name,
-            tenant_id=self.tenant_id,
-            pageserver_id=pageserver_id,
-            endpoint_id=endpoint_id,
-            **self._endpoint_opts,
-        )
-
-    def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
        with ENDPOINT_LOCK:
            if self._endpoint is None:
-                self._endpoint = self.make_endpoint(pageserver_id=pageserver_id, readonly=False)
-
+                self._endpoint = self.env.endpoints.create(
+                    self.branch_name,
+                    tenant_id=self.tenant_id,
+                    pageserver_id=pageserver_id,
+                    endpoint_id=endpoint_id,
+                    **self._endpoint_opts,
+                )
                self._endpoint.start(pageserver_id=pageserver_id)
            else:
                self._endpoint.reconfigure(pageserver_id=pageserver_id)
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -11,7 +11,6 @@ from fixtures.neon_fixtures import (
    generate_uploads_and_deletions,
 )
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.utils import wait_until
 from fixtures.workload import Workload

@@ -413,32 +412,3 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool
                f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
            )
            assert res[0][0] == 1
-
-
-def test_image_layer_reads(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    workload = Workload(env, tenant_id, timeline_id)
-    workload.init()
-    workload.write_rows(256)
-    workload.validate()
-
-    # wait_for_wal_insert_lsn(env, workload._endpoint, tenant_id, timeline_id)
-
-    workload.go_readonly()
-
-    commit_lsn = env.safekeepers[0].http_client().get_commit_lsn(tenant_id, timeline_id)
-    wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, commit_lsn)
-    log.info(f"Ingested up to commit_lsn {commit_lsn}")
-
-    env.pageserver.http_client().timeline_compact(
-        tenant_id, timeline_id, force_image_layer_creation=True
-    )
-
-    # This should send getpage requests at the same LSN where we just created image layers
-    workload.validate()
-
-    # Nothing should have written in the meantime
-    assert commit_lsn == env.safekeepers[0].http_client().get_commit_lsn(tenant_id, timeline_id)
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.workload import Workload

@@ -156,9 +156,6 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
-@skip_on_postgres(
-    PgVersion.V17, "There are no snapshots yet"
-)  # TODO: revert this once we have snapshots
 def test_backward_compatibility(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
@@ -206,9 +203,6 @@ def test_backward_compatibility(
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
-@skip_on_postgres(
-    PgVersion.V17, "There are no snapshots yet"
-)  # TODO: revert this once we have snapshots
 def test_forward_compatibility(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -27,7 +27,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
    env.pageserver.allowed_errors.extend(
        [
            ".*basebackup .* failed: invalid basebackup lsn.*",
-            ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected",
+            ".*/lsn_lease.*invalid lsn lease request.*",
        ]
    )

@@ -108,7 +108,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
    assert cur.fetchone() == (1,)

    # Create node at pre-initdb lsn
-    with pytest.raises(Exception, match="invalid basebackup lsn"):
+    with pytest.raises(Exception, match="invalid lsn lease request"):
        # compute node startup with invalid LSN should fail
        env.endpoints.create_start(
            branch_name="main",
@@ -167,6 +167,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            )
        return last_flush_lsn

+    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
+        """
+        Trigger GC manually on all pageservers. Then run an `SELECT` query.
+        """
+        for shard, ps in tenant_get_shards(env, env.initial_tenant):
+            client = ps.http_client()
+            gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+            log.info(f"{gc_result=}")
+
+            assert (
+                gc_result["layers_removed"] == 0
+            ), "No layers should be removed, old layers are guarded by leases."
+
+        with ep_static.cursor() as cur:
+            cur.execute("SELECT count(*) FROM t0")
+            assert cur.fetchone() == (ROW_COUNT,)
+
    # Insert some records on main branch
    with env.endpoints.create_start("main") as ep_main:
        with ep_main.cursor() as cur:
@@ -193,25 +210,31 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):

            generate_updates_on_main(env, ep_main, i, end=100)

-            # Trigger GC
-            for shard, ps in tenant_get_shards(env, env.initial_tenant):
-                client = ps.http_client()
-                gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
-                log.info(f"{gc_result=}")
+            trigger_gc_and_select(env, ep_static)

-                assert (
-                    gc_result["layers_removed"] == 0
-                ), "No layers should be removed, old layers are guarded by leases."
+            # Trigger Pageserver restarts
+            for ps in env.pageservers:
+                ps.stop()
+                # Static compute should have at least one lease request failure due to connection.
+                time.sleep(LSN_LEASE_LENGTH / 2)
+                ps.start()

-            with ep_static.cursor() as cur:
-                cur.execute("SELECT count(*) FROM t0")
-                assert cur.fetchone() == (ROW_COUNT,)
+            trigger_gc_and_select(env, ep_static)
+
+            # Reconfigure pageservers
+            env.pageservers[0].stop()
+            env.storage_controller.node_configure(
+                env.pageservers[0].id, {"availability": "Offline"}
+            )
+            env.storage_controller.reconcile_until_idle()
+
+            trigger_gc_and_select(env, ep_static)

        # Do some update so we can increment latest_gc_cutoff
        generate_updates_on_main(env, ep_main, i, end=100)

    # Wait for the existing lease to expire.
-    time.sleep(LSN_LEASE_LENGTH)
+    time.sleep(LSN_LEASE_LENGTH + 1)
    # Now trigger GC again, layers should be removed.
    for shard, ps in tenant_get_shards(env, env.initial_tenant):
        client = ps.http_client()
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -45,10 +45,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
    tenant_after = http.tenant_status(env.initial_tenant)
    assert tenant_before != tenant_after
    gc_blocking = tenant_after["gc_blocking"]
-    assert (
-        gc_blocking
-        == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }"
-    )
+    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"

    wait_for_another_gc_round()
    pss.assert_log_contains(gc_skipped_line)
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -772,7 +772,7 @@ class ProposerPostgres(PgProtocol):
    def initdb(self):
        """Run initdb"""

-        args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()]
+        args = ["initdb", "--username", "cloud_admin", "--pgdata", self.pg_data_dir_path()]
        self.pg_bin.run(args)

    def start(self):
@@ -2084,8 +2084,13 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):

    endpoint.safe_psql("create table t(key int, value text)")

-    timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id)
-    timeline_start_lsn = timeline_status.timeline_start_lsn
+    # Note: currently timelines on sks are created by compute and commit of
+    # transaction above is finished when 2/3 sks received it, so there is a
+    # small chance that timeline on this sk is not created/initialized yet,
+    # hence the usage of waiting function to prevent flakiness.
+    timeline_start_lsn = (
+        env.safekeepers[0].http_client().get_non_zero_timeline_start_lsn(tenant_id, timeline_id)
+    )
    log.info(f"Timeline start LSN: {timeline_start_lsn}")

    current_percent = 0.0
Author	SHA1	Message	Date
Yuchen Liang	e565c4fbe9	Merge branch 'yuchen/direct-io-aligned-alloc' into yuchen/direct-io-aligned-alloc-usage-wip	2024-10-07 13:08:55 -04:00
Yuchen Liang	a46757b769	add with_capacity_aligned_zeroed and leak Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-07 13:06:00 -04:00
Yuchen Liang	9c32bfee3b	fix put_io_mode to use the correct http endpoint Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 16:47:36 -04:00
Yuchen Liang	69ef8caf58	simplify virtual file wrapper Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:31:55 +00:00
Yuchen Liang	b7443dd643	add set_io_mode option to getpage_latest_lsn Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:16:42 +00:00
Yuchen Liang	cc433c76a3	fix clippy Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:05:01 +00:00
Yuchen Liang	2034ec906a	remove unused imports Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:04:11 +00:00
Yuchen Liang	f48ab8bcaa	use O_DIRECT as preferred Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 03:57:58 +00:00
Yuchen Liang	2607a57990	pageserver: add direct io config to virtual file Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 03:57:13 +00:00
Yuchen Liang	f04c1c230c	incr len in with_capacity_aligned_zeroed add a test Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 23:47:00 +00:00
Yuchen Liang	13f1931a09	fix build Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:18:29 +00:00
Yuchen Liang	e98a4eb5e2	add safety comments Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	e01d145066	remove example; add with_capacity_aligned_zeroed Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	9e9d76d6f2	use IoBufferMut for pagecache Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	14ec379d2b	enable O_DIRECT for delta and image layers Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	ebfe88a463	use IoBufferMut for delta and image layers Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	eb16aa9e81	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-29 16:59:27 -04:00
Alexander Bayandin	d2d9921761	CI(benchmarking): fix Nightly Benchmarks (#9178 ) ## Problem Nightly Benchmarks have been broken for some time due to various reasons, this PR fixes it ## Summary of changes - Pull `build-tools` image from dockerhub for `benchmarking` workflow - Use `aws-actions/configure-aws-credentials` to upload/download artifacts from S3 - Fix Postgres 16 installation (for pgbench)	2024-09-28 02:44:22 +01:00
Arthur Petukhovsky	ba498a630a	Set disk quotas on bind in compute_ctl (#8936 ) Part of https://github.com/neondatabase/cloud/issues/13127. Resolves #9153 What changed in this PR: 1. Adds `ComputeSpec.disk_quota_bytes: Option<u64>` 2. Adds new arg to compute_ctl: `--set-disk-quota-for-fs <mountpoint>` 3. Implements running `/neonvm/bin/set-disk-quota` with the right value if both cmdline arg AND field in the spec are specified 4. Patches `/etc/sudoers.d` to allow `compute_ctl` to set quota with sudo This PR is very similar to the swap support added earlier, you can take a look at it as prior art: #7434 In theory, it can be implemented outside of compute_ctl when we will have a separate neonvm daemon, but we are not there yet. Current implementation is the simplest possible to unblock computes with larger disks. All code related to usage of `/neonvm/bin/set-disk-quota` is located in `disk_quota.rs`. We need to call this script with the following arguments: `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`. Quotas are set on the filesystem level, so we need to provide path to the directory that filesystem was mounted to. I tested this change locally with https://github.com/neondatabase/cloud/pull/17270. It should be safe to merge, because this feature is gated by both cmdline arg and field in the spec. If control-plane doesn't set values in both places, compute_ctl won't be affected by this change.	2024-09-27 20:52:22 +01:00
Heikki Linnakangas	e989a5e4a2	neon_local: Use clap derive macros to parse the CLI args (#9103 ) This is easier to work with.	2024-09-27 22:08:46 +03:00
Alex Chi Z.	cde1654d7b	fix(pageserver): abort process if fsync fails (#9108 ) close https://github.com/neondatabase/neon/issues/8140 The original issue is rather vague on what we should do. After discussion w/ @problame we decided to narrow down the problems we want to solve in that issue. * read path -- do not panic for now. * write path -- panic only on write errors (i.e., device error, fsync error), but not on no-space for now. The guideline is that if the pageserver behavior could lead to violation of persistent constraints (i.e., return an operation as successful but not actually persisting things), we should panic. Fsync is the place where both of us agree that we should panic, because if fsync fails, the kernel will mark dirty pages as clean, and the next fsync will not necessarily return false. This would make the storage client assume the operation is successful. ## Summary of changes Make fsync panic on fatal errors. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-09-27 19:58:50 +01:00
Heikki Linnakangas	cf6a776fcf	tests: Reduce the # of iterations in safekeeper::test_random_schedules (#9182 ) To make it faster. On my laptop, it takes about 30 before this commit. In the arm64 debug variant in CI, it takes about 120 s. Reduce it by factor of 4.	2024-09-27 16:25:35 +00:00
Matthias van de Meent	5c5871111a	WalProposer: Read WAL directly from WAL buffers in PG17 (#9171 ) This reduces the overhead of the WalProposer when it is not being throttled by SK WAL acceptance rate	2024-09-27 17:47:05 +02:00
Yuchen Liang	d56c4e7a38	pageserver: remove AdjacentVectoredReadBuilder and bump minmimum io_buffer_alignment to 512 (#9175 ) Part of #8130 ## Problem After deploying https://github.com/neondatabase/infra/pull/1927, we shipped `io_buffer_alignment=512` to all prod region. The `AdjacentVectoredReadBuilder` code path is no longer taken and we are running pageserver unit tests 6 times in the CI. Removing it would reduce the test duration by 30-60s. ## Summary of changes - Remove `AdjacentVectoredReadBuilder` code. - Bump the minimum `io_buffer_alignment` requirement to at least 512 bytes. - Use default `io_buffer_alignment` for Rust unit tests. Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-27 16:41:42 +01:00
Conrad Ludgate	43b2445d0b	proxy: add jwks endpoint to control plane and mock providers (#9165 )	2024-09-27 16:08:43 +01:00
Yuchen Liang	42ef08db47	fix(pageserver): LSN lease edge cases around restarts/migrations (#9055 ) Part of #7497, closes #8817. ## Problem See #8817. ## Summary of changes compute_ctl - Renew lsn lease as soon as `/configure` updates pageserver_connstr, use `state_changed` Condvar for synchronization. pageserver As mentioned in https://github.com/neondatabase/neon/issues/8817#issuecomment-2315768076, we still want some permanent error reported if a lease cannot be granted. By considering attachment mode and the added `lsn_lease_deadline` when processing lease requests, we can also bound the case of bad requests to a very short period after migration/restart. - Refactor https://github.com/neondatabase/neon/pull/9024 and move `lsn_lease_deadline` to `AttachedTenantConf` so timeline can easily access it. - Have separate HTTP `init_lsn_lease` and libpq `renew_lsn_lease` API. - Always do LSN verification for the initial HTTP lease request. - LSN verification for the renewal is still done when tenants are not in `AttachedSingle` and we have pass the `lsn_lease_deadline`, which give plenty of time for compute to renew the lease. neon_local - add and call `timeline_init_lsn_lease` mgmt_api at static endpoint start. The initial lsn lease http request is sent when we run `cargo neon endpoint start <static endpoint>`. ## Testing - Extend `test_readonly_node_gc` to do pageserver restarts and migration. ## Future Work - The control plane should make the initial lease request through HTTP when creating a static endpoint. This is currently only done in `neon_local`. Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-27 09:56:52 -04:00
Tristan Partin	fc962c9605	Use long options when calling initdb Verbosity in this case is good when reading the code. Short options are better when operating in an interactive shell. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-09-27 08:22:16 -05:00
Heikki Linnakangas	357fa070a3	Add gdb to build-tools (#9125 ) So that compute_ctl can use it to print backtrace on core dumps See issue #2800.	2024-09-27 15:36:24 +03:00
Heikki Linnakangas	02cdd37b56	Dump backtrace if a core dump is called just "core" (#9125 ) I hope this lets us capture backtraces in CI. At least it makes it work on my laptop, which is valuable even if we need to do more for CI. See issue #2800.	2024-09-27 15:36:24 +03:00
Vlad Lazar	fa354a65ab	libs: improve logging on PG connection errors (#9130 ) ## Problem We get some unexpected errors, but don't know who they're happening for. ## Summary of change Add tenant id and peer address to PG connection error logs. Related https://github.com/neondatabase/cloud/issues/17336	2024-09-27 12:36:43 +01:00
Arseny Sher	40f7930a7d	safekeeper: skip syncfs on start if --no-sync is specified. (#9166 ) https://neondb.slack.com/archives/C059ZC138NR/p1727350911890989?thread_ts=1727350211.370869&cid=C059ZC138NR	2024-09-27 09:59:38 +03:00
Conrad Ludgate	ec07a1ecc9	proxy: make local-proxy config by signal with PID, refine JWKS apis with role caching (#9164 )	2024-09-26 19:01:48 +01:00
Arseny Sher	c4cdfe66ac	Fix flakiness of test_timeline_copy. Timeline might be not initialized when timeline_start_lsn is queried. Spotted by CI.	2024-09-26 19:01:45 +03:00
Alex Chi Z.	42e19e952f	fix(pageserver): categorize client error in basebackup metrics (#9110 ) We separated client error from basebackup error log lines in https://github.com/neondatabase/neon/pull/7523, but we didn't do anything for the metrics. In this patch, we fixed it. ref https://github.com/neondatabase/neon/issues/8970 ## Summary of changes We use the same criteria as in `log_query_error` producing an info line (instead of error) for the metrics. We added a `client_error` category for the basebackup query time metrics. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-09-26 11:38:19 -04:00
John Spray	3d255d601b	pageserver: rename control plane client & chunk validation requests (#8997 ) ## Problem - In https://github.com/neondatabase/neon/pull/8784, the validate controller API is modified to check generations directly in the database. It batches tenants into separate queries to avoid generating a huge statement, but - While updating this, I realized that "control_plane_client" is a kind of confusing name for the client code now that it primarily talks to the storage controller (the case of talking to the control plane will go away in a few months). ## Summary of changes - Big rename to "ControllerUpcallClient" -- this reflects the storage controller's api naming, where the paths used by the pageserver are in `/upcall/` - When sending validate requests, break them up into chunks so that we avoid possible edge cases of generating any HTTP requests that require database I/O across many thousands of tenants. This PR mixes a functional change with a refactor, but the commits are cleanly separated -- only the last commit is a functional change. --------- Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-09-26 16:06:34 +01:00
Arthur Petukhovsky	80e974d05b	fix(compute_ctl): race condition in configurator (#9162 ) There was a tricky race condition in compute_ctl, that sometimes makes configurator skip updates. It makes a deadlock because: - control-plane cannot configure compute, because it's in ConfigurationPending state - compute_ctl doesn't do any reconfiguration because `configurator_main_loop` missed notification for it Full sequence that reproduces the issue: 1. `start_compute` finishes works and changes status `self.set_status(ComputeStatus::Running);` 2. configurator received update about `Running` state and dropped the mutex lock in the iteration 3. `/configure` request was triggered at the same time as step 1, and got the mutex lock 4. same `/configure` request set the spec and updated the state to `ConfigurationPending`, also sent a notification 5. next iteration in configurator got the mutex lock, but missed the notification There are more details in this slack thread: https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ --------- Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>	2024-09-26 15:42:17 +01:00
Alexander Bayandin	7fdf1ab5b6	CI: run compatibility tests on Postgres 17 (#9145 ) ## Problem The latest storage release has generated artifacts for Postgres 17, so we can enable compatibility tests this version ## Summary of changes - Unskip `test_backward_compatibility` / `test_forward_compatibility` on Postgres 17	2024-09-26 15:17:01 +01:00
Yuchen Liang	f6d0ed6454	implement reserve for IoBufferMut Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-26 14:12:49 +00:00
Yuchen Liang	a2be8a440b	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-24 21:29:33 -04:00
Yuchen Liang	ff4a1db223	make sure we can Send IoBufferMut Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 23:52:34 -04:00
Yuchen Liang	29d54ccd20	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-03 11:41:49 -04:00
Yuchen Liang	68a1fe20f2	review: use doc comments to reference struct in safety comment Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-09-03 11:41:02 -04:00
Yuchen Liang	e8408c797a	remove unused comments Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 11:38:58 -04:00
Yuchen Liang	027f28deb9	remove Vec dependency Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 11:38:00 -04:00
Yuchen Liang	ea6f9798c6	add safety comment Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-16 18:06:16 +00:00
Yuchen Liang	253e4d5843	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-08-16 13:14:20 -04:00
Yuchen Liang	852099bc83	remove aligned-vec, use ManuallyDrop<Vec<u8>> Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-16 17:13:30 +00:00
Yuchen Liang	148e230d11	add mut version marker trait Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-15 04:03:13 +00:00
Yuchen Liang	6d664788c1	feat(pageserver): newtype aligned-vec as aligned buffer allocation Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-15 03:43:58 +00:00