Merge branch 'yuchen/direct-io-aligned-alloc' into yuchen/direct-io-aligned-alloc-usage-wip

add with_capacity_aligned_zeroed and leak
Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2026-05-25 00:50:36 +00:00 · 2024-10-07 13:08:55 -04:00 · 2024-10-07 13:06:00 -04:00 · 2024-10-01 16:47:36 -04:00 · 2024-10-01 12:31:55 +00:00 · 2024-10-01 12:16:42 +00:00
261 changed files with 3556 additions and 5414 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -557,7 +557,7 @@ jobs:
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib

-        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
        export LD_LIBRARY_PATH
        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -341,7 +341,7 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          SYNC_BETWEEN_TESTS: true
+          SYNC_AFTER_EACH_TEST: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -773,7 +773,7 @@ jobs:
      matrix:
        version: [ v14, v15, v16, v17 ]
    env:
-      VM_BUILDER_VERSION: v0.35.0
+      VM_BUILDER_VERSION: v0.29.3

    steps:
      - uses: actions/checkout@v4
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -102,17 +102,12 @@ jobs:
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'

-          # If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms.
+          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
-                # List of directories that contain code which affect compute images.
-                #
-                # This isn't exhaustive, just the paths that are most directly compute-related.
-                # For example, compute_ctl also depends on libs/utils, but we don't trigger
-                # an e2e run on that.
-                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
+                vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,15 +53,15 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
-aws-sdk-s3 = "1.52"
-aws-sdk-iam = "1.46.0"
+aws-config = { version = "1.3", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.26"
+aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.2"
+aws-smithy-types = "1.1.9"
 aws-credential-types = "1.2.0"
-aws-sigv4 = { version = "1.2", features = ["sign-http"] }
-aws-types = "1.3"
-axum = { version = "0.7.5", features = ["ws"] }
+aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
+aws-types = "1.2.0"
+axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.70"
@@ -96,13 +96,10 @@ hmac = "0.12.1"
 hostname = "0.4"
 http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
-http-body-util = "0.1.2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
-hyper0 = { package = "hyper", version = "0.14" }
-hyper = "1.4"
-hyper-util = "0.1"
-tokio-tungstenite = "0.21.0"
+hyper = "0.14"
+tokio-tungstenite = "0.20.0"
 indexmap = "2"
 indoc = "2"
 ipnet = "2.9.0"
@@ -119,10 +116,9 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.24"
-opentelemetry_sdk = "0.24"
-opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.16"
+opentelemetry = "0.20.0"
+opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -130,12 +126,12 @@ pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
-prost = "0.13"
+prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
 reqwest-middleware = "0.3.0"
 reqwest-retry = "0.5"
 routerify = "3"
@@ -178,11 +174,11 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
+tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
-tracing-error = "0.2"
-tracing-opentelemetry = "0.25"
+tracing-error = "0.2.0"
+tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
@@ -246,7 +242,7 @@ criterion = "0.5.1"
 rcgen = "0.12"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
-tonic-build = "0.12"
+tonic-build = "0.9"

 [patch.crates-io]

--- a/2
+++ b/2
@@ -91,7 +91,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
 RUN mkdir -p /data/.neon/ && \
  echo "id=1234" > "/data/.neon/identity.toml" && \
-  echo "broker_endpoint='http://storage_broker:30051'\n" \
+  echo "broker_endpoint='http://storage_broker:50051'\n" \
       "pg_distrib_dir='/usr/local/'\n" \
       "listen_pg_addr='0.0.0.0:6400'\n" \
       "listen_http_addr='0.0.0.0:9898'\n" \
--- a/14
+++ b/14
@@ -168,27 +168,27 @@ postgres-check-%: postgres-%
 neon-pg-ext-%: postgres-%
 	+@echo "Compiling neon $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
 	+@echo "Compiling neon_walredo $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
 	+@echo "Compiling neon_rmgr $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
 	+@echo "Compiling neon_test_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
 	+@echo "Compiling neon_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

@@ -220,7 +220,7 @@ neon-pg-clean-ext-%:
 walproposer-lib: neon-pg-ext-v17
 	+@echo "Compiling walproposer-lib"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
 	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
@@ -333,7 +333,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 # Indent pxgn/neon.
 .PHONY: neon-pgindent
 neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
 		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf openssl flex bison icu4c pkg-config m4
+brew install protobuf openssl flex bison icu4c pkg-config

 # add openssl to PATH, required for ed25519 keys generation in neon_local
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
@@ -145,7 +145,7 @@ Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"

 # start pageserver, safekeeper, and broker for their intercommunication
 > cargo neon start
-Starting neon broker at 127.0.0.1:30051.
+Starting neon broker at 127.0.0.1:50051.
 storage_broker started, pid: 2918372
 Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
 pageserver started, pid: 2918386
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -12,25 +12,10 @@ ARG DEBIAN_FLAVOR=bullseye-slim
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR AS build-deps
 ARG DEBIAN_FLAVOR
-
-RUN case $DEBIAN_FLAVOR in \
-      # Version-specific installs for Bullseye (PG14-PG16):
-      # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
-      # Install newer version (3.25) from backports.
-      bullseye*) \
-        echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
-        VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
-      ;; \
-      # Version-specific installs for Bookworm (PG17):
-      bookworm*) \
-        VERSION_INSTALLS="cmake"; \
-      ;; \
-    esac && \
-    apt update &&  \
-    apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
-    $VERSION_INSTALLS
+RUN apt update &&  \
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd

 #########################################################################################
 #
@@ -104,7 +89,7 @@ FROM build-deps AS postgis-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
+    apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
    libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
    libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
    protobuf-c-compiler xsltproc
@@ -182,7 +167,7 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    apt update && \
-    apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang
+    apt install -y ninja-build python3-dev libncurses5 binutils clang

 RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -215,6 +200,27 @@ FROM build-deps AS h3-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "$(uname -m)" in \
+      "x86_64") \
+        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
+        ;; \
+      "aarch64") \
+        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
+        ;; \
+      *) \
+        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
+      -q -O /tmp/cmake-install.sh \
+      && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
+      && chmod u+x /tmp/cmake-install.sh \
+      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
+      && rm /tmp/cmake-install.sh
+
 RUN case "${PG_VERSION}" in "v17") \
        mkdir -p /h3/usr/ && \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -500,6 +506,8 @@ RUN case "${PG_VERSION}" in "v17") \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
        ;; \
    esac && \
+    apt-get update && \
+    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
@@ -587,7 +595,8 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    apt-get update && \
-    apt-get install --no-install-recommends -y \
+    apt-get install -y \
+        cmake \
        libboost-iostreams1.74-dev \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
@@ -752,7 +761,7 @@ ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt-get update && \
-    apt-get install --no-install-recommends -y curl libclang-dev && \
+    apt-get install -y curl libclang-dev cmake && \
    useradd -ms /bin/bash nonroot -b /home

 ENV HOME=/home/nonroot
@@ -862,25 +871,6 @@ RUN case "${PG_VERSION}" in "v17") \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

-#########################################################################################
-#
-# Layer "pg-session-jwt-build"
-# Compile "pg_session_jwt" extension
-#
-#########################################################################################
-
-FROM rust-extensions-build AS pg-session-jwt-build
-ARG PG_VERSION
-
-RUN case "${PG_VERSION}" in "v17") \
-    echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
-    esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
-    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgrx install --release
-
 #########################################################################################
 #
 # Layer "wal2json-build"
@@ -977,7 +967,6 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1055,12 +1044,9 @@ FROM debian:$DEBIAN_FLAVOR AS pgbouncer
 ARG DEBIAN_FLAVOR
 RUN set -e \
    && apt-get update \
-    && apt-get install --no-install-recommends -y \
+    && apt-get install -y \
        build-essential \
        git \
-        ca-certificates \
-        autoconf \
-        automake \
        libevent-dev \
        libtool \
        pkg-config
@@ -1075,20 +1061,6 @@ RUN set -e \
    && make -j $(nproc) dist_man_MANS= \
    && make install dist_man_MANS=

-#########################################################################################
-#
-# Compile the Neon-specific `local_proxy` binary
-#
-#########################################################################################
-FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-USER nonroot
-# Copy entire project to get Cargo.* files with proper dependencies for the whole project
-COPY --chown=nonroot . .
-RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy
-
 #########################################################################################
 #
 # Layers "postgres-exporter" and "sql-exporter"
@@ -1182,6 +1154,11 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+# cmake is required for the h3 test
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt-get update && apt-get install -y cmake
 RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
@@ -1197,7 +1174,7 @@ RUN case "${PG_VERSION}" in "v17") \
    patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
-ENV PGPORT=30433
+ENV PGPORT=55433
 ENV PGUSER=cloud_admin
 ENV PGDATABASE=postgres
 #########################################################################################
@@ -1208,6 +1185,7 @@ ENV PGDATABASE=postgres
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR
 ARG DEBIAN_FLAVOR
+ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
@@ -1227,10 +1205,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
 COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

-# local_proxy and its config
-COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy
-RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
-
 # Metrics exporter binaries and  configuration files
 COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
 COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
@@ -1284,7 +1258,7 @@ RUN apt update && \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
-        libcurl4 \
+        libcurl4-openssl-dev \
        locales \
        procps \
        ca-certificates \
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -94,68 +94,6 @@ metrics:
  query: |
    select sum(pg_database_size(datname)) as total from pg_database;

- metric_name: getpage_wait_seconds_count
-  type: counter
-  help: 'Number of getpage requests'
-  values: [getpage_wait_seconds_count]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_wait_seconds_sum
-  type: counter
-  help: 'Time spent in getpage requests'
-  values: [getpage_wait_seconds_sum]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_requests_total
-  type: counter
-  help: 'Number of getpage issued for prefetching'
-  values: [getpage_prefetch_requests_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_sync_requests_total
-  type: counter
-  help: 'Number of synchronous getpage issued'
-  values: [getpage_sync_requests_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_misses_total
-  type: counter
-  help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
-  values: [getpage_prefetch_misses_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_discards_total
-  type: counter
-  help: 'Number of prefetch responses issued but not used'
-  values: [getpage_prefetch_discards_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_requests_sent_total
-  type: counter
-  help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
-  values: [pageserver_requests_sent_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_disconnects_total
-  type: counter
-  help: 'Number of times that the connection to the pageserver was lost'
-  values: [pageserver_disconnects_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_send_flushes_total
-  type: counter
-  help: 'Number of flushes to the pageserver connection'
-  values: [pageserver_send_flushes_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_wait_seconds_bucket
-  type: counter
-  help: 'Histogram buckets of getpage request latency'
-  key_labels:
-      - bucket_le
-  values: [value]
-  query_ref: getpage_wait_seconds_buckets
-
 # DEPRECATED
 - metric_name: lfc_approximate_working_set_size
  type: gauge
@@ -306,26 +244,3 @@ metrics:
    SELECT slot_name,
           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
    FROM pg_replication_slots;
-
-queries:
-  - query_name: neon_perf_counters
-    query: |
-      WITH c AS (
-        SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
-      )
-      SELECT d.*
-      FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
-          getpage_wait_seconds_count numeric,
-          getpage_wait_seconds_sum numeric,
-          getpage_prefetch_requests_total numeric,
-          getpage_sync_requests_total numeric,
-          getpage_prefetch_misses_total numeric,
-          getpage_prefetch_discards_total numeric,
-          pageserver_requests_sent_total numeric,
-          pageserver_disconnects_total numeric,
-          pageserver_send_flushes_total numeric
-      );
-
-  - query_name: getpage_wait_seconds_buckets
-    query: |
-      SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
--- a/compute/vm-image-spec.yaml
+++ b/compute/vm-image-spec.yaml
@@ -19,10 +19,6 @@ commands:
    user: postgres
    sysvInitAction: respawn
    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
-  - name: local_proxy
-    user: postgres
-    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,18 +11,16 @@ testing = []

 [dependencies]
 anyhow.workspace = true
-camino.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
-hyper0 = { workspace = true, features = ["full"] }
+hyper = { workspace = true, features = ["full"] }
 nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
-opentelemetry_sdk.workspace = true
 postgres.workspace = true
 regex.workspace = true
 serde_json.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -218,7 +218,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
    if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry_sdk::propagation::TraceContextPropagator;
+        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
@@ -402,7 +402,8 @@ fn start_postgres(
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
-    state.set_status(ComputeStatus::Init, &compute.state_changed);
+    state.status = ComputeStatus::Init;
+    compute.state_changed.notify_all();

    info!(
        "running compute with features: {:?}",
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -34,7 +34,6 @@ use nix::sys::signal::{kill, Signal};
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
-use crate::local_proxy;
 use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -109,18 +108,6 @@ impl ComputeState {
            metrics: ComputeMetrics::default(),
        }
    }
-
-    pub fn set_status(&mut self, status: ComputeStatus, state_changed: &Condvar) {
-        let prev = self.status;
-        info!("Changing compute status from {} to {}", prev, status);
-        self.status = status;
-        state_changed.notify_all();
-    }
-
-    pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
-        self.error = Some(format!("{err:?}"));
-        self.set_status(ComputeStatus::Failed, state_changed);
-    }
 }

 impl Default for ComputeState {
@@ -315,12 +302,15 @@ impl ComputeNode {

    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
-        state.set_status(status, &self.state_changed);
+        state.status = status;
+        self.state_changed.notify_all();
    }

    pub fn set_failed_status(&self, err: anyhow::Error) {
        let mut state = self.state.lock().unwrap();
-        state.set_failed_status(err, &self.state_changed);
+        state.error = Some(format!("{err:?}"));
+        state.status = ComputeStatus::Failed;
+        self.state_changed.notify_all();
    }

    pub fn get_status(&self) -> ComputeStatus {
@@ -896,11 +886,6 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

-        if let Some(ref local_proxy) = spec.local_proxy_config {
-            info!("configuring local_proxy");
-            local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
-        }
-
        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
            let mut connstr = connstr.clone();
@@ -951,19 +936,6 @@ impl ComputeNode {
            });
        }

-        if let Some(ref local_proxy) = spec.local_proxy_config {
-            info!("configuring local_proxy");
-
-            // Spawn a thread to do the configuration,
-            // so that we don't block the main thread that starts Postgres.
-            let local_proxy = local_proxy.clone();
-            let _handle = Some(thread::spawn(move || {
-                if let Err(err) = local_proxy::configure(&local_proxy) {
-                    error!("error while configuring local_proxy: {err:?}");
-                }
-            }));
-        }
-
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -1051,19 +1023,6 @@ impl ComputeNode {
            });
        }

-        if let Some(local_proxy) = &pspec.spec.local_proxy_config {
-            info!("configuring local_proxy");
-
-            // Spawn a thread to do the configuration,
-            // so that we don't block the main thread that starts Postgres.
-            let local_proxy = local_proxy.clone();
-            let _handle = thread::spawn(move || {
-                if let Err(err) = local_proxy::configure(&local_proxy) {
-                    error!("error while configuring local_proxy: {err:?}");
-                }
-            });
-        }
-
        info!(
            "start_compute spec.remote_extensions {:?}",
            pspec.spec.remote_extensions
@@ -1101,26 +1060,19 @@ impl ComputeNode {
        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary {
-            if !pspec.spec.skip_pg_catalog_updates {
-                let pgdata_path = Path::new(&self.pgdata);
-                // temporarily reset max_cluster_size in config
-                // to avoid the possibility of hitting the limit, while we are applying config:
-                // creating new extensions, roles, etc...
-                config::with_compute_ctl_tmp_override(
-                    pgdata_path,
-                    "neon.max_cluster_size=-1",
-                    || {
-                        self.pg_reload_conf()?;
-
-                        self.apply_config(&compute_state)?;
-
-                        Ok(())
-                    },
-                )?;
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
                self.pg_reload_conf()?;
-            }
-            self.post_apply_config()?;
+
+                self.apply_config(&compute_state)?;
+
+                Ok(())
+            })?;
+            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -24,7 +24,8 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
        // Re-check the status after waking up
        if state.status == ComputeStatus::ConfigurationPending {
            info!("got configuration request");
-            state.set_status(ComputeStatus::Configuration, &compute.state_changed);
+            state.status = ComputeStatus::Configuration;
+            compute.state_changed.notify_all();
            drop(state);

            let mut new_status = ComputeStatus::Failed;
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -288,7 +288,8 @@ async fn handle_configure_request(
                return Err((msg, StatusCode::PRECONDITION_FAILED));
            }
            state.pspec = Some(parsed_spec);
-            state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
+            state.status = ComputeStatus::ConfigurationPending;
+            compute.state_changed.notify_all();
            drop(state);
            info!("set new spec and notified waiters");
        }
@@ -361,15 +362,15 @@ async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (Str
        }
        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
            let msg = format!(
-                "invalid compute status for termination request: {}",
-                state.status
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
            );
            return Err((msg, StatusCode::PRECONDITION_FAILED));
        }
-        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
        drop(state);
    }
-
    forward_termination_signal();
    info!("sent signal and notified waiters");

@@ -383,8 +384,7 @@ async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (Str
        while state.status != ComputeStatus::Terminated {
            state = c.state_changed.wait(state).unwrap();
            info!(
-                "waiting for compute to become {}, current status: {:?}",
-                ComputeStatus::Terminated,
+                "waiting for compute to become Terminated, current status: {:?}",
                state.status
            );
        }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -2,9 +2,6 @@
 //! configuration.
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
-
-extern crate hyper0 as hyper;
-
 pub mod checker;
 pub mod config;
 pub mod configurator;
@@ -15,7 +12,6 @@ pub mod catalog;
 pub mod compute;
 pub mod disk_quota;
 pub mod extension_server;
-pub mod local_proxy;
 pub mod lsn_lease;
 mod migration;
 pub mod monitor;
--- a/compute_tools/src/local_proxy.rs
+++ b/compute_tools/src/local_proxy.rs
@@ -1,56 +0,0 @@
-//! Local Proxy is a feature of our BaaS Neon Authorize project.
-//!
-//! Local Proxy validates JWTs and manages the pg_session_jwt extension.
-//! It also maintains a connection pool to postgres.
-
-use anyhow::{Context, Result};
-use camino::Utf8Path;
-use compute_api::spec::LocalProxySpec;
-use nix::sys::signal::Signal;
-use utils::pid_file::{self, PidFileRead};
-
-pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
-    write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
-    notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
-
-    Ok(())
-}
-
-/// Create or completely rewrite configuration file specified by `path`
-fn write_local_proxy_conf(path: &Utf8Path, local_proxy: &LocalProxySpec) -> Result<()> {
-    let config =
-        serde_json::to_string_pretty(local_proxy).context("serializing LocalProxySpec to json")?;
-    std::fs::write(path, config).with_context(|| format!("writing {path}"))?;
-
-    Ok(())
-}
-
-/// Notify local proxy about a new config file.
-fn notify_local_proxy(path: &Utf8Path) -> Result<()> {
-    match pid_file::read(path)? {
-        // if the file doesn't exist, or isn't locked, local_proxy isn't running
-        // and will naturally pick up our config later
-        PidFileRead::NotExist | PidFileRead::NotHeldByAnyProcess(_) => {}
-        PidFileRead::LockedByOtherProcess(pid) => {
-            // From the pid_file docs:
-            //
-            // > 1. The other process might exit at any time, turning the given PID stale.
-            // > 2. There is a small window in which `claim_for_current_process` has already
-            // >    locked the file but not yet updates its contents. [`read`] will return
-            // >    this variant here, but with the old file contents, i.e., a stale PID.
-            // >
-            // > The kernel is free to recycle PID once it has been `wait(2)`ed upon by
-            // > its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
-            // > system call on it, bears the risk of killing an unrelated process.
-            // > This is an inherent limitation of using pidfiles.
-            // > The only race-free solution is to have a supervisor-process with a lifetime
-            // > that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
-            //
-            // This is an ok risk as we only send a SIGHUP which likely won't actually
-            // kill the process, only reload config.
-            nix::sys::signal::kill(pid, Signal::SIGHUP).context("sending signal to local_proxy")?;
-        }
-    }
-
-    Ok(())
-}
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -1,3 +1,4 @@
+use tracing_opentelemetry::OpenTelemetryLayer;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::prelude::*;

@@ -22,7 +23,8 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .with_writer(std::io::stderr);

    // Initialize OpenTelemetry
-    let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
+    let otlp_layer =
+        tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);

    // Put it all together
    tracing_subscriber::registry()
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,4 +1,3 @@
-use std::collections::HashSet;
 use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;
@@ -190,15 +189,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    let mut xact = client.transaction()?;
    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;

-    let mut jwks_roles = HashSet::new();
-    if let Some(local_proxy) = &spec.local_proxy_config {
-        for jwks_setting in local_proxy.jwks.iter().flatten() {
-            for role_name in &jwks_setting.role_names {
-                jwks_roles.insert(role_name.clone());
-            }
-        }
-    }
-
    // Print a list of existing Postgres roles (only in debug mode)
    if span_enabled!(Level::INFO) {
        let mut vec = Vec::new();
@@ -318,9 +308,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
-                if jwks_roles.contains(name.as_str()) {
-                    query = format!("CREATE ROLE {}", name.pg_quote());
-                }
                info!("running role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -14,7 +14,7 @@ humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 humantime-serde.workspace = true
-hyper0.workspace = true
+hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -13,4 +13,4 @@ pg_port = 5454
 http_port = 7676

 [broker]
-listen_addr = '127.0.0.1:30051'
+listen_addr = '127.0.0.1:50051'
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -599,7 +599,6 @@ impl Endpoint {
            remote_extensions,
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
-            local_proxy_config: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -141,7 +141,7 @@ pub struct NeonLocalInitConf {
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct NeonBroker {
-    /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:30051'.
+    /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'.
    pub listen_addr: SocketAddr,
 }

@@ -168,9 +168,6 @@ pub struct NeonStorageControllerConf {

    #[serde(with = "humantime_serde")]
    pub heartbeat_interval: Duration,
-
-    #[serde(with = "humantime_serde")]
-    pub long_reconcile_threshold: Option<Duration>,
 }

 impl NeonStorageControllerConf {
@@ -193,7 +190,6 @@ impl Default for NeonStorageControllerConf {
            split_threshold: None,
            max_secondary_lag_bytes: None,
            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
-            long_reconcile_threshold: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -3,7 +3,7 @@ use crate::{
    local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
-use hyper0::Uri;
+use hyper::Uri;
 use nix::unistd::Pid;
 use pageserver_api::{
    controller_api::{
@@ -517,13 +517,6 @@ impl StorageController {
            args.push(format!("--max-secondary-lag-bytes={lag}"))
        }

-        if let Some(threshold) = self.config.long_reconcile_threshold {
-            args.push(format!(
-                "--long-reconcile-threshold={}",
-                humantime::Duration::from(threshold)
-            ))
-        }
-
        args.push(format!(
            "--neon-local-repo-dir={}",
            self.env.base_data_dir.display()
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -40,7 +40,7 @@
            },
            {
                "name": "port",
-                "value": "30433",
+                "value": "55433",
                "vartype": "integer"
            },
            {
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -49,7 +49,7 @@ services:
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
      - SAFEKEEPER_ID=1
-      - BROKER_ENDPOINT=http://storage_broker:30051
+      - BROKER_ENDPOINT=http://storage_broker:50051
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
@@ -79,7 +79,7 @@ services:
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
      - SAFEKEEPER_ID=2
-      - BROKER_ENDPOINT=http://storage_broker:30051
+      - BROKER_ENDPOINT=http://storage_broker:50051
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
@@ -109,7 +109,7 @@ services:
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
      - SAFEKEEPER_ID=3
-      - BROKER_ENDPOINT=http://storage_broker:30051
+      - BROKER_ENDPOINT=http://storage_broker:50051
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
@@ -137,10 +137,10 @@ services:
    restart: always
    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    ports:
-      - 30051:30051
+      - 50051:50051
    command:
      - "storage_broker"
-      - "--listen-addr=0.0.0.0:30051"
+      - "--listen-addr=0.0.0.0:50051"

  compute:
    restart: always
@@ -160,7 +160,7 @@ services:
      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
      - ./compute_wrapper/shell/:/shell/
    ports:
-      - 30433:30433 # pg protocol handler
+      - 55433:55433 # pg protocol handler
      - 3080:3080 # http endpoints
    entrypoint:
      - "/shell/compute.sh"
@@ -176,7 +176,7 @@ services:
      - "/bin/bash"
      - "-c"
    command:
-      - "until pg_isready -h compute -p 30433 -U cloud_admin ; do
+      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
            echo 'Waiting to start compute...' && sleep 1;
         done"
    depends_on:
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -1,4 +1,4 @@
-broker_endpoint='http://storage_broker:30051'
+broker_endpoint='http://storage_broker:50051'
 pg_distrib_dir='/usr/local/'
 listen_pg_addr='0.0.0.0:6400'
 listen_http_addr='0.0.0.0:9898'
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -5,7 +5,7 @@
 Currently we build two main images:

 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).
+- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.

 And additional intermediate image:

@@ -47,7 +47,7 @@ Creating docker-compose_storage_broker_1       ... done

 2. connect compute node
 ```
-$ psql postgresql://cloud_admin:cloud_admin@localhost:30433/postgres
+$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
 psql (16.3)
 Type "help" for help.

@@ -68,7 +68,7 @@ postgres=# select * from t;
 # check the container name you want to see
 $ docker ps
 CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                      NAMES
-3582f6d76227   docker-compose_compute                             "/shell/compute.sh"      2 minutes ago   Up 2 minutes   0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:30433->30433/tcp, :::30433->30433/tcp   docker-compose_compute_1
+3582f6d76227   docker-compose_compute                             "/shell/compute.sh"      2 minutes ago   Up 2 minutes   0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp   docker-compose_compute_1
 (...omit...)

 $ docker logs -f docker-compose_compute_1
@@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in.
 - Username: `minio`
 - Password: `password`

-You can see durable pages and WAL data in `neon` bucket.
+You can see durable pages and WAL data in `neon` bucket.
--- a/docs/rfcs/038-aux-file-v2.md
+++ b/docs/rfcs/038-aux-file-v2.md
@@ -1,112 +0,0 @@
-# AUX file v2
-
-## Summary
-
-This is a retrospective RFC describing a new storage strategy for AUX files.
-
-## Motivation
-
-The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`.
-Every time the compute node streams a `neon-file` record to the pageserver, it will
-update the aux file hash map, and then write the serialized hash map into the key.
-This creates serious space bloat. There was a fix to log delta records (i.e., update
-a key in the hash map) to the aux file key. In this way, the pageserver only stores
-the deltas at each of the LSNs. However, this improved v1 storage strategy still
-requires us to store everything in an aux file cache in memory, because we cannot
-fetch a single key (or file) from the compound `AUX_FILES_KEY`.
-
-### Prior art
-
-For storing large amount of small files, we can use a key-value store where the key
-is the filename and the value is the file content.
-
-## Requirements
-
- No space bloat, fixed space amplification.
- No write bloat, fixed write amplification.
-
-## Impacted Components
-
-pageserver
-
-## Sparse Keyspace
-
-In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF
-exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior
-assumption, there are code that traverses the keyspace by iterating every single key.
-
-```rust
-loop {
-    // do something
-    key = key.next();
-}
-```
-
-If a keyspace is very large, for example, containing `2^64` keys, this loop will take infinite time to run.
-Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would
-exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead,
-they should fetch all the layer files in the key range, and then do a merge of them.
-
-In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`.
-
-## AUX v2 Keyspace and Key Mapping
-
-Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the
-keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash
-of the filename for the rest bits of the key. The encoding scheme is defined in `encode_aux_file_key`.
-
-For example, `pg_logical/mappings/test1` will be encoded as:
-
-```
-62 0000 01 01 7F8B83D94F7081693471ABF91C
-^ aux prefix
-        ^ assigned prefix of pg_logical/
-           ^ assigned prefix of mappings/
-              ^ 13B FNV hash of test1
-   ^ not used due to key representation
-```
-
-The prefixes of the directories should be assigned every time we add a new type of aux file into the storage within `aux_file.rs`. For all directories without an assigned prefix, it will be put into the `0xFFFF` keyspace.
-
-Note that inside pageserver, there are two representations of the keys: the 18B full key representation
-and the 16B compact key representation. For the 18B representation, some fields have restricted ranges
-of values. Therefore, the aux keys only use the 16B compact portion of the full key.
-
-It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of
-each of the aux key is an array that contains all filenames and file content that should be stored in
-this key.
-
-We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before,
-and we do not need addition code to support reconstructing the value. We simply get the latest image from
-the storage.
-
-## Inbound Logical Replication Key Mapping
-
-For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data.
-This file not directly stored in the pageserver using the aux v2 mechanism. It is constructed during
-generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
-
-## Sparse Keyspace Read Path
-
-There are two places we need to read the aux files from the pageserver:
-
-* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
-*  We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error.
-
-## Compaction and Image Layer Generation
-
-With the add of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage.
-
-* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes.
-* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always cover the full aux key range for now, and could be optimized later.
-
-## Migration
-
-We decided not to make the new aux storage strategy (v1) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration).
-
-During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached
-with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2).
-
-If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program.
-
-The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For all these projects, it put the computes into maintenance mode (suspend all of then), call the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restart all the computes.
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -24,7 +24,7 @@ max_file_descriptors = '100'
 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'cloud_admin'

-broker_endpoint = 'http://127.0.0.1:30051'
+broker_endpoint = 'http://127.0.0.1:50051'

 # [remote_storage]
 ```
@@ -51,7 +51,7 @@ Note that TOML distinguishes between strings and integers, the former require si
 #### broker_endpoint

 A storage broker endpoint to connect and pull the information from. Default is
-`'http://127.0.0.1:30051'`.
+`'http://127.0.0.1:50051'`. 

 #### checkpoint_distance

--- a/docs/storage_broker.md
+++ b/docs/storage_broker.md
@@ -23,5 +23,5 @@ Broker serves /metrics on the same port as grpc service.

 grpcurl can be used to check which values are currently being pushed:
 ```
-grpcurl -proto broker/proto/broker.proto -d '{"all":{}}' -plaintext localhost:30051 storage_broker.BrokerService/SubscribeSafekeeperInfo
+grpcurl -proto broker/proto/broker.proto -d '{"all":{}}' -plaintext localhost:50051 storage_broker.BrokerService/SubscribeSafekeeperInfo
 ```
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,7 +1,5 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

-use std::fmt::Display;
-
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};

@@ -60,21 +58,6 @@ pub enum ComputeStatus {
    Terminated,
 }

-impl Display for ComputeStatus {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ComputeStatus::Empty => f.write_str("empty"),
-            ComputeStatus::ConfigurationPending => f.write_str("configuration-pending"),
-            ComputeStatus::Init => f.write_str("init"),
-            ComputeStatus::Running => f.write_str("running"),
-            ComputeStatus::Configuration => f.write_str("configuration"),
-            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending => f.write_str("termination-pending"),
-            ComputeStatus::Terminated => f.write_str("terminated"),
-        }
-    }
-}
-
 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
 where
    S: Serializer,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -106,10 +106,6 @@ pub struct ComputeSpec {
    // Stripe size for pageserver sharding, in pages
    #[serde(default)]
    pub shard_stripe_size: Option<usize>,
-
-    /// Local Proxy configuration used for JWT authentication
-    #[serde(default)]
-    pub local_proxy_config: Option<LocalProxySpec>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -282,13 +278,11 @@ pub struct GenericOption {
 /// declare a `trait` on it.
 pub type GenericOptions = Option<Vec<GenericOption>>;

-/// Configured the local_proxy application with the relevant JWKS and roles it should
+/// Configured the local-proxy application with the relevant JWKS and roles it should
 /// use for authorizing connect requests using JWT.
 #[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct LocalProxySpec {
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub jwks: Option<Vec<JwksSettings>>,
+    pub jwks: Vec<JwksSettings>,
 }

 #[derive(Clone, Debug, Deserialize, Serialize)]
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,7 +104,7 @@ pub struct ConfigToml {
    pub image_compression: ImageCompressionAlgorithm,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
+    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    pub io_buffer_alignment: usize,
 }

@@ -296,14 +296,7 @@ pub mod defaults {

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

-    /// Soft limit for the maximum size of a vectored read.
-    ///
-    /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys
-    /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record,
-    /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record
-    /// is `sizeof(Transactionid) * 32768 + (some fixed overhead from 'timestamp`, the Vec length and whatever extra serde serialization adds)`.
-    /// That is, slightly above 128 kB.
-    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB
+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };
@@ -388,7 +381,7 @@ impl Default for ConfigToml {
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
-            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
+            virtual_file_io_mode: None,

            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -748,16 +748,6 @@ impl Key {
        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
    }

-    #[inline(always)]
-    pub fn is_rel_dir_key(&self) -> bool {
-        self.field1 == 0x00
-            && self.field2 != 0
-            && self.field3 != 0
-            && self.field4 == 0
-            && self.field5 == 0
-            && self.field6 == 1
-    }
-
    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
    #[inline(always)]
    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::path::PathBuf;
-
    #[derive(
        Copy,
        Clone,
@@ -994,50 +992,49 @@ pub mod virtual_file {
    }

    /// Direct IO modes for a pageserver.
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-    pub enum DirectIoMode {
-        /// Direct IO disabled (uses usual buffered IO).
-        #[default]
-        Disabled,
-        /// Direct IO disabled (performs checks and perf simulations).
-        Evaluate {
-            /// Alignment check level
-            alignment_check: DirectIoAlignmentCheckLevel,
-            /// Latency padded for performance simulation.
-            latency_padding: DirectIoLatencyPadding,
-        },
-        /// Direct IO enabled.
-        Enabled {
-            /// Actions to perform on alignment error.
-            on_alignment_error: DirectIoOnAlignmentErrorAction,
-        },
+    #[derive(
+        Copy,
+        Clone,
+        PartialEq,
+        Eq,
+        Hash,
+        strum_macros::EnumString,
+        strum_macros::Display,
+        serde_with::DeserializeFromStr,
+        serde_with::SerializeDisplay,
+        Debug,
+    )]
+    #[strum(serialize_all = "kebab-case")]
+    #[repr(u8)]
+    pub enum IoMode {
+        /// Uses buffered IO.
+        Buffered,
+        /// Uses direct IO, error out if the operation fails.
+        #[cfg(target_os = "linux")]
+        Direct,
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoAlignmentCheckLevel {
-        #[default]
-        Error,
-        Log,
-        None,
+    impl IoMode {
+        pub const fn preferred() -> Self {
+            if cfg!(target_os = "linux") {
+                Self::Direct
+            } else {
+                Self::Buffered
+            }
+        }
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoOnAlignmentErrorAction {
-        Error,
-        #[default]
-        FallbackToBuffered,
-    }
+    impl TryFrom<u8> for IoMode {
+        type Error = u8;

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "type", rename_all = "kebab-case")]
-    pub enum DirectIoLatencyPadding {
-        /// Pad virtual file operations with IO to a fake file.
-        FakeFileRW { path: PathBuf },
-        #[default]
-        None,
+        fn try_from(value: u8) -> Result<Self, Self::Error> {
+            Ok(match value {
+                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
+                #[cfg(target_os = "linux")]
+                v if v == (IoMode::Direct as u8) => IoMode::Direct,
+                x => return Err(x),
+            })
+        }
    }
 }

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
 humantime-serde.workspace = true
-hyper0 = { workspace = true, features = ["stream"] }
+hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -14,7 +14,7 @@ use std::time::SystemTime;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
-use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
+use azure_core::request_options::{MaxResults, Metadata, Range};
 use azure_core::{Continuable, RetryOptions};
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
@@ -33,10 +33,10 @@ use tracing::debug;
 use utils::backoff;

 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
+use crate::ListingObject;
 use crate::{
-    config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError,
-    DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata,
-    TimeTravelError, TimeoutOrCancel,
+    config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing,
+    ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
 };

 pub struct AzureBlobStorage {
@@ -259,7 +259,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
    if let Some(http_err) = error.as_http_error() {
        match http_err.status() {
            StatusCode::NotFound => DownloadError::NotFound,
-            StatusCode::NotModified => DownloadError::Unmodified,
            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
            _ => DownloadError::Other(anyhow::Error::new(error)),
        }
@@ -485,16 +484,11 @@ impl RemoteStorage for AzureBlobStorage {
    async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let mut builder = blob_client.get();
-
-        if let Some(ref etag) = opts.etag {
-            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
-        }
+        let builder = blob_client.get();

        self.download_for_builder(builder, cancel).await
    }
--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -5,8 +5,6 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
-    /// The caller provided an ETag, and the file was not modified.
-    Unmodified,
    /// A cancellation token aborted the download, typically during
    /// tenant detach or process shutdown.
    Cancelled,
@@ -26,7 +24,6 @@ impl std::fmt::Display for DownloadError {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
-            DownloadError::Unmodified => write!(f, "File was not modified"),
            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::Timeout => write!(f, "timeout"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
@@ -41,7 +38,7 @@ impl DownloadError {
    pub fn is_permanent(&self) -> bool {
        use DownloadError::*;
        match self {
-            BadInput(_) | NotFound | Unmodified | Cancelled => true,
+            BadInput(_) | NotFound | Cancelled => true,
            Timeout | Other(_) => false,
        }
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -161,14 +161,6 @@ pub struct Listing {
    pub keys: Vec<ListingObject>,
 }

-/// Options for downloads. The default value is a plain GET.
-#[derive(Default)]
-pub struct DownloadOpts {
-    /// If given, returns [`DownloadError::Unmodified`] if the object still has
-    /// the same ETag (using If-None-Match).
-    pub etag: Option<Etag>,
-}
-
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -253,7 +245,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError>;

@@ -410,18 +401,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    /// See [`RemoteStorage::download`]
    pub async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.download(from, opts, cancel).await,
-            Self::AwsS3(s) => s.download(from, opts, cancel).await,
-            Self::AzureBlob(s) => s.download(from, opts, cancel).await,
-            Self::Unreliable(s) => s.download(from, opts, cancel).await,
+            Self::LocalFs(s) => s.download(from, cancel).await,
+            Self::AwsS3(s) => s.download(from, cancel).await,
+            Self::AzureBlob(s) => s.download(from, cancel).await,
+            Self::Unreliable(s) => s.download(from, cancel).await,
        }
    }

@@ -583,7 +572,7 @@ impl GenericRemoteStorage {
    ) -> Result<Download, DownloadError> {
        match byte_range {
            Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
-            None => self.download(from, &DownloadOpts::default(), cancel).await,
+            None => self.download(from, cancel).await,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;

 use crate::{
-    Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath,
-    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError,
+    TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -494,17 +494,11 @@ impl RemoteStorage for LocalFs {
    async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);

        let file_metadata = file_metadata(&target_path).await?;
-        let etag = mock_etag(&file_metadata);
-
-        if opts.etag.as_ref() == Some(&etag) {
-            return Err(DownloadError::Unmodified);
-        }

        let source = ReaderStream::new(
            fs::OpenOptions::new()
@@ -525,6 +519,7 @@ impl RemoteStorage for LocalFs {
        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);

+        let etag = mock_etag(&file_metadata);
        Ok(Download {
            metadata,
            last_modified: file_metadata
@@ -697,7 +692,7 @@ mod fs_tests {
    ) -> anyhow::Result<String> {
        let cancel = CancellationToken::new();
        let download = storage
-            .download(remote_storage_path, &DownloadOpts::default(), &cancel)
+            .download(remote_storage_path, &cancel)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
        ensure!(
@@ -778,8 +773,8 @@ mod fs_tests {
            "We should upload and download the same contents"
        );

-        let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?;
-        match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await {
+        let non_existing_path = "somewhere/else";
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -1106,13 +1101,7 @@ mod fs_tests {
            storage.upload(body, len, &path, None, &cancel).await?;
        }

-        let read = aggregate(
-            storage
-                .download(&path, &DownloadOpts::default(), &cancel)
-                .await?
-                .download_stream,
-        )
-        .await?;
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
        assert_eq!(body, read);

        let shorter = Bytes::from_static(b"shorter body");
@@ -1123,13 +1112,7 @@ mod fs_tests {
            storage.upload(body, len, &path, None, &cancel).await?;
        }

-        let read = aggregate(
-            storage
-                .download(&path, &DownloadOpts::default(), &cancel)
-                .await?
-                .download_stream,
-        )
-        .await?;
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
        assert_eq!(shorter, read);
        Ok(())
    }
@@ -1162,13 +1145,7 @@ mod fs_tests {
            storage.upload(body, len, &path, None, &cancel).await?;
        }

-        let read = aggregate(
-            storage
-                .download(&path, &DownloadOpts::default(), &cancel)
-                .await?
-                .download_stream,
-        )
-        .await?;
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
        assert_eq!(body, read);

        Ok(())
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -28,13 +28,12 @@ use aws_sdk_s3::{
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
-use http_types::StatusCode;

 use aws_smithy_types::{body::SdkBody, DateTime};
 use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
 use bytes::Bytes;
 use futures::stream::Stream;
-use hyper0::Body;
+use hyper::Body;
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff;
@@ -45,8 +44,8 @@ use crate::{
    error::Cancelled,
    metrics::{start_counting_cancelled_wait, start_measuring_requests},
    support::PermitCarrying,
-    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
-    RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath,
+    RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

@@ -68,7 +67,6 @@ pub struct S3Bucket {
 struct GetObjectRequest {
    bucket: String,
    key: String,
-    etag: Option<String>,
    range: Option<String>,
 }
 impl S3Bucket {
@@ -250,18 +248,13 @@ impl S3Bucket {

        let started_at = start_measuring_requests(kind);

-        let mut builder = self
+        let get_object = self
            .client
            .get_object()
            .bucket(request.bucket)
            .key(request.key)
-            .set_range(request.range);
-
-        if let Some(etag) = request.etag {
-            builder = builder.if_none_match(etag);
-        }
-
-        let get_object = builder.send();
+            .set_range(request.range)
+            .send();

        let get_object = tokio::select! {
            res = get_object => res,
@@ -284,20 +277,6 @@ impl S3Bucket {
                );
                return Err(DownloadError::NotFound);
            }
-            Err(SdkError::ServiceError(e))
-                // aws_smithy_runtime_api::http::response::StatusCode isn't
-                // re-exported by any aws crates, so just check the numeric
-                // status against http_types::StatusCode instead of pulling it.
-                if e.raw().status().as_u16() == StatusCode::NotModified =>
-            {
-                // Count an unmodified file as a success.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
-                return Err(DownloadError::Unmodified);
-            }
            Err(e) => {
                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
@@ -794,7 +773,6 @@ impl RemoteStorage for S3Bucket {
    async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
@@ -803,7 +781,6 @@ impl RemoteStorage for S3Bucket {
            GetObjectRequest {
                bucket: self.bucket_name.clone(),
                key: self.relative_path_to_s3_object(from),
-                etag: opts.etag.as_ref().map(|e| e.to_string()),
                range: None,
            },
            cancel,
@@ -830,7 +807,6 @@ impl RemoteStorage for S3Bucket {
            GetObjectRequest {
                bucket: self.bucket_name.clone(),
                key: self.relative_path_to_s3_object(from),
-                etag: None,
                range,
            },
            cancel,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,8 +12,8 @@ use std::{collections::hash_map::Entry, sync::Arc};
 use tokio_util::sync::CancellationToken;

 use crate::{
-    Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath,
-    RemoteStorage, StorageMetadata, TimeTravelError,
+    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
+    StorageMetadata, TimeTravelError,
 };

 pub struct UnreliableWrapper {
@@ -167,12 +167,11 @@ impl RemoteStorage for UnreliableWrapper {
    async fn download(
        &self,
        from: &RemotePath,
-        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        self.attempt(RemoteOp::Download(from.clone()))
            .map_err(DownloadError::Other)?;
-        self.inner.download(from, opts, cancel).await
+        self.inner.download(from, cancel).await
    }

    async fn download_byte_range(
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,7 +1,8 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use futures::StreamExt;
-use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath};
+use remote_storage::ListingMode;
+use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
@@ -283,10 +284,7 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    ctx.client.upload(data, len, &path, None, &cancel).await?;

    // Normal download request
-    let dl = ctx
-        .client
-        .download(&path, &DownloadOpts::default(), &cancel)
-        .await?;
+    let dl = ctx.client.download(&path, &cancel).await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

@@ -339,54 +337,6 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    Ok(())
 }

-/// Tests that conditional downloads work properly, by returning
-/// DownloadError::Unmodified when the object ETag matches the given ETag.
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn download_conditional(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-    let cancel = CancellationToken::new();
-
-    // Create a file.
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?;
-    let data = bytes::Bytes::from_static("foo".as_bytes());
-    let (stream, len) = wrap_stream(data);
-    ctx.client.upload(stream, len, &path, None, &cancel).await?;
-
-    // Download it to obtain its etag.
-    let mut opts = DownloadOpts::default();
-    let download = ctx.client.download(&path, &opts, &cancel).await?;
-
-    // Download with the etag yields DownloadError::Unmodified.
-    opts.etag = Some(download.etag);
-    let result = ctx.client.download(&path, &opts, &cancel).await;
-    assert!(
-        matches!(result, Err(DownloadError::Unmodified)),
-        "expected DownloadError::Unmodified, got {result:?}"
-    );
-
-    // Replace the file contents.
-    let data = bytes::Bytes::from_static("bar".as_bytes());
-    let (stream, len) = wrap_stream(data);
-    ctx.client.upload(stream, len, &path, None, &cancel).await?;
-
-    // A download with the old etag should yield the new file.
-    let download = ctx.client.download(&path, &opts, &cancel).await?;
-    assert_ne!(download.etag, opts.etag.unwrap(), "ETag did not change");
-
-    // A download with the new etag should yield Unmodified again.
-    opts.etag = Some(download.etag);
-    let result = ctx.client.download(&path, &opts, &cancel).await;
-    assert!(
-        matches!(result, Err(DownloadError::Unmodified)),
-        "expected DownloadError::Unmodified, got {result:?}"
-    );
-
-    Ok(())
-}
-
 #[test_context(MaybeEnabledStorage)]
 #[tokio::test]
 async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -414,10 +364,7 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
    // Normal download request
    ctx.client.copy_object(&path, &path_dest, &cancel).await?;

-    let dl = ctx
-        .client
-        .download(&path_dest, &DownloadOpts::default(), &cancel)
-        .await?;
+    let dl = ctx.client.download(&path_dest, &cancel).await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

@@ -429,56 +376,3 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {

    Ok(())
 }
-
-/// Tests that head_object works properly.
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn head_object(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-    let cancel = CancellationToken::new();
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?;
-
-    // Errors on missing file.
-    let result = ctx.client.head_object(&path, &cancel).await;
-    assert!(
-        matches!(result, Err(DownloadError::NotFound)),
-        "expected NotFound, got {result:?}"
-    );
-
-    // Create the file.
-    let data = bytes::Bytes::from_static("foo".as_bytes());
-    let (stream, len) = wrap_stream(data);
-    ctx.client.upload(stream, len, &path, None, &cancel).await?;
-
-    // Fetch the head metadata.
-    let object = ctx.client.head_object(&path, &cancel).await?;
-    assert_eq!(
-        object,
-        ListingObject {
-            key: path.clone(),
-            last_modified: object.last_modified, // ignore
-            size: 3
-        }
-    );
-
-    // Wait for a couple of seconds, and then update the file to check the last
-    // modified timestamp.
-    tokio::time::sleep(std::time::Duration::from_secs(2)).await;
-
-    let data = bytes::Bytes::from_static("bar".as_bytes());
-    let (stream, len) = wrap_stream(data);
-    ctx.client.upload(stream, len, &path, None, &cancel).await?;
-    let new = ctx.client.head_object(&path, &cancel).await?;
-
-    assert!(
-        !new.last_modified
-            .duration_since(object.last_modified)?
-            .is_zero(),
-        "last_modified did not advance"
-    );
-
-    Ok(())
-}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
-    RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -121,8 +121,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:

    // A little check to ensure that our clock is not too far off from the S3 clock
    {
-        let opts = DownloadOpts::default();
-        let dl = retry(|| ctx.client.download(&path2, &opts, &cancel)).await?;
+        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
        let last_modified = dl.last_modified;
        let half_wt = WAIT_TIME.mul_f32(0.5);
        let t0_hwt = t0 + half_wt;
@@ -160,12 +159,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t2: {t2_files_recovered:?}");
    assert_eq!(t2_files, t2_files_recovered);
-    let path2_recovered_t2 = download_to_vec(
-        ctx.client
-            .download(&path2, &DownloadOpts::default(), &cancel)
-            .await?,
-    )
-    .await?;
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
    assert_eq!(path2_recovered_t2, new_data.as_bytes());

    // after recovery to t1: path1 is back, path2 has the old content
@@ -176,12 +170,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t1: {t1_files_recovered:?}");
    assert_eq!(t1_files, t1_files_recovered);
-    let path2_recovered_t1 = download_to_vec(
-        ctx.client
-            .download(&path2, &DownloadOpts::default(), &cancel)
-            .await?,
-    )
-    .await?;
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
    assert_eq!(path2_recovered_t1, old_data.as_bytes());

    // after recovery to t0: everything is gone except for path1
@@ -427,7 +416,7 @@ async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
    let started_at = std::time::Instant::now();
    let mut stream = ctx
        .client
-        .download(&path, &DownloadOpts::default(), &cancel)
+        .download(&path, &cancel)
        .await
        .expect("download succeeds")
        .download_stream;
@@ -502,7 +491,7 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
    {
        let stream = ctx
            .client
-            .download(&path, &DownloadOpts::default(), &cancel)
+            .download(&path, &cancel)
            .await
            .expect("download succeeds")
            .download_stream;
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -5,15 +5,13 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-hyper0.workspace = true
-opentelemetry = { workspace = true, features = ["trace"] }
-opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
-opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+hyper.workspace = true
+opentelemetry = { workspace = true, features=["rt-tokio"] }
+opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-tracing-subscriber.workspace = true

 [dev-dependencies]
 tracing-subscriber.workspace = true    # For examples in docs
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -1,7 +1,7 @@
 //! Tracing wrapper for Hyper HTTP server

-use hyper0::HeaderMap;
-use hyper0::{Body, Request, Response};
+use hyper::HeaderMap;
+use hyper::{Body, Request, Response};
 use std::future::Future;
 use tracing::Instrument;
 use tracing_opentelemetry::OpenTelemetrySpanExt;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -10,6 +10,7 @@
 //!
 //! ```rust,no_run
 //! use tracing_subscriber::prelude::*;
+//! use tracing_opentelemetry::OpenTelemetryLayer;
 //!
 //! #[tokio::main]
 //! async fn main() {
@@ -21,7 +22,7 @@
 //!         .with_writer(std::io::stderr);
 //!
 //!     // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
-//!     let otlp_layer = tracing_utils::init_tracing("my_application").await;
+//!     let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
 //!
 //!     // Put it all together
 //!     tracing_subscriber::registry()
@@ -34,14 +35,14 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

-pub mod http;
-
-use opentelemetry::trace::TracerProvider;
+use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
-use opentelemetry_sdk::Resource;
-use tracing::Subscriber;
-use tracing_subscriber::registry::LookupSpan;
-use tracing_subscriber::Layer;
+use opentelemetry_otlp::WithExportConfig;
+use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
+
+pub use tracing_opentelemetry::OpenTelemetryLayer;
+
+pub mod http;

 /// Set up OpenTelemetry exporter, using configuration from environment variables.
 ///
@@ -70,10 +71,7 @@ use tracing_subscriber::Layer;
 ///
 /// This doesn't block, but is marked as 'async' to hint that this must be called in
 /// asynchronous execution context.
-pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
-where
-    S: Subscriber + for<'span> LookupSpan<'span>,
-{
+pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -82,10 +80,9 @@ where

 /// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
 /// tasks.
-pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
-where
-    S: Subscriber + for<'span> LookupSpan<'span>,
-{
+pub fn init_tracing_without_runtime(
+    service_name: &str,
+) -> Option<opentelemetry::sdk::trace::Tracer> {
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -116,36 +113,54 @@ where
    Some(init_tracing_internal(service_name.to_string()))
 }

-fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
-where
-    S: Subscriber + for<'span> LookupSpan<'span>,
-{
-    // Sets up exporter from the OTEL_EXPORTER_* environment variables.
-    let exporter = opentelemetry_otlp::new_exporter().http();
+fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
+    // Set up exporter from the OTEL_EXPORTER_* environment variables
+    let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();

-    // TODO: opentelemetry::global::set_error_handler() with custom handler that
-    //       bypasses default tracing layers, but logs regular looking log
-    //       messages.
+    // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
+    // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
+    // OpenTelemetry spec at
+    // <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
+    // the full exporter URL is formed by appending "/v1/traces" to the value
+    // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
+    // that with the grpc-tonic exporter. Other exporters, like the HTTP
+    // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
+    // appending "/v1/traces".
+    //
+    // See https://github.com/open-telemetry/opentelemetry-rust/pull/950
+    //
+    // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
+    // the endpoint url with the "/v1/traces" path ourselves. If the bug is
+    // fixed in a later version, we can remove this code. But if we don't
+    // remember to remove this, it won't do any harm either, as the crate will
+    // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
+    // is set directly with `with_endpoint`.
+    if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
+        if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
+            if !endpoint.ends_with('/') {
+                endpoint.push('/');
+            }
+            endpoint.push_str("v1/traces");
+            exporter = exporter.with_endpoint(endpoint);
+        }
+    }

    // Propagate trace information in the standard W3C TraceContext format.
    opentelemetry::global::set_text_map_propagator(
-        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
+        opentelemetry::sdk::propagation::TraceContextPropagator::new(),
    );

-    let tracer = opentelemetry_otlp::new_pipeline()
+    opentelemetry_otlp::new_pipeline()
        .tracing()
        .with_exporter(exporter)
-        .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
-            Resource::new(vec![KeyValue::new(
+        .with_trace_config(
+            opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
                opentelemetry_semantic_conventions::resource::SERVICE_NAME,
                service_name,
-            )]),
-        ))
-        .install_batch(opentelemetry_sdk::runtime::Tokio)
+            )])),
+        )
+        .install_batch(opentelemetry::runtime::Tokio)
        .expect("could not initialize opentelemetry exporter")
-        .tracer("global");
-
-    tracing_opentelemetry::layer().with_tracer(tracer)
 }

 // Shutdown trace pipeline gracefully, so that it has a chance to send any
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -22,7 +22,7 @@ chrono.workspace = true
 git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
-hyper0 = { workspace = true, features = ["full"] }
+hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -2,8 +2,6 @@
 //! between other crates in this repository.
 #![deny(clippy::undocumented_unsafe_blocks)]

-extern crate hyper0 as hyper;
-
 pub mod backoff;

 /// `Lsn` type implements common tasks on Log Sequence Numbers
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -7,13 +7,11 @@ use axum::{
    extract::{ws::WebSocket, State, WebSocketUpgrade},
    response::Response,
 };
-use axum::{routing::get, Router};
+use axum::{routing::get, Router, Server};
 use clap::Parser;
 use futures::Future;
-use std::net::SocketAddr;
 use std::{fmt::Debug, time::Duration};
 use sysinfo::{RefreshKind, System, SystemExt};
-use tokio::net::TcpListener;
 use tokio::{sync::broadcast, task::JoinHandle};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info};
@@ -134,14 +132,14 @@ pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Res
            args,
        });

-    let addr_str = args.addr();
-    let addr: SocketAddr = addr_str.parse().expect("parsing address should not fail");
-
-    let listener = TcpListener::bind(&addr)
-        .await
+    let addr = args.addr();
+    let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
        .with_context(|| format!("failed to bind to {addr}"))?;
-    info!(addr_str, "server bound");
-    axum::serve(listener, app.into_make_service())
+
+    info!(addr, "server bound");
+
+    bound
+        .serve(app.into_make_service())
        .await
        .context("server exited")?;

--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -79,7 +79,8 @@ pub struct Config {
    /// memory.
    ///
    /// The default value of `0.15` means that we *guarantee* sending upscale requests if the
-    /// cgroup is using more than 85% of total memory.
+    /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
+    /// memory for the file cache).
    cgroup_min_overhead_fraction: f64,

    cgroup_downscale_threshold_buffer_bytes: u64,
@@ -96,12 +97,24 @@ impl Default for Config {
 }

 impl Config {
-    fn cgroup_threshold(&self, total_mem: u64) -> u64 {
-        // We want our threshold to be met gracefully instead of letting postgres get OOM-killed
-        // (or if there's room, spilling to swap).
+    fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
+        // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
+        // and thus be non-reclaimable, so we should allow for additional memory usage.
+        //
+        // If the file cache sits on disk, our desired stable system state is for it to be fully
+        // page cached (its contents should only be paged to/from disk in situations where we can't
+        // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
+        // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
+        // out the file cache.
+        let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
+
+        // Even if we're not separately making room for the file cache (if it's in tmpfs), we still
+        // want our threshold to be met gracefully instead of letting postgres get OOM-killed.
        // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
        // remaining above the threshold.
-        (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64
+        let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
+
+        memory_remaining_for_cgroup.min(max_threshold)
    }
 }

@@ -136,6 +149,11 @@ impl Runner {

        let mem = get_total_system_memory();

+        let mut file_cache_disk_size = 0;
+
+        // We need to process file cache initialization before cgroup initialization, so that the memory
+        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
+        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
            let config = FileCacheConfig::default();
@@ -166,6 +184,7 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

+            file_cache_disk_size = actual_size;
            state.filecache = Some(file_cache);
        }

@@ -188,7 +207,7 @@ impl Runner {
                cgroup.watch(hist_tx).await
            });

-            let threshold = state.config.cgroup_threshold(mem);
+            let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
            info!(threshold, "set initial cgroup threshold",);

            state.cgroup = Some(CgroupState {
@@ -240,7 +259,9 @@ impl Runner {
                return Ok((false, status.to_owned()));
            }

-            let new_threshold = self.config.cgroup_threshold(usable_system_memory);
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, expected_file_cache_size);

            let current = last_history.avg_non_reclaimable;

@@ -261,11 +282,13 @@ impl Runner {

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut status = vec![];
+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
+            file_cache_disk_size = actual_usage;
            let message = format!(
                "set file cache size to {} MiB",
                bytes_to_mebibytes(actual_usage),
@@ -275,7 +298,9 @@ impl Runner {
        }

        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self.config.cgroup_threshold(usable_system_memory);
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);

            let message = format!(
                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
@@ -304,6 +329,7 @@ impl Runner {
        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
@@ -316,6 +342,7 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
+            file_cache_disk_size = actual_usage;

            if actual_usage != expected_usage {
                warn!(
@@ -327,7 +354,9 @@ impl Runner {
        }

        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self.config.cgroup_threshold(usable_system_memory);
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);

            info!(
                "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -30,7 +30,7 @@ futures.workspace = true
 hex.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
-hyper0.workspace = true
+hyper.workspace = true
 itertools.workspace = true
 md5.workspace = true
 nix.workspace = true
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,12 +164,10 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    page_cache::init(conf.page_cache_size);
+
+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(16384, virtual_file::io_engine_for_bench(), align);
+    page_cache::init(conf.page_cache_size, align);

    {
        let mut group = c.benchmark_group("ingest-small-values");
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -550,6 +550,19 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

+    /// Configs io mode at runtime.
+    pub async fn put_io_mode(
+        &self,
+        mode: &pageserver_api::models::virtual_file::IoMode,
+    ) -> Result<()> {
+        let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, mode)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -151,13 +151,10 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    pageserver::page_cache::init(100);
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    pageserver::page_cache::init(100, align);

    let mut total_delta_layers = 0usize;
    let mut total_image_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,8 +59,9 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
-    page_cache::init(100);
+    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    page_cache::init(100, align);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
@@ -190,12 +191,10 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            );
-            pageserver::page_cache::init(100);
+            let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
+
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+            pageserver::page_cache::init(100, align);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -205,12 +205,9 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
-    page_cache::init(100);
+    let align = DEFAULT_IO_BUFFER_ALIGNMENT;
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
+    page_cache::init(100, align);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -63,6 +63,10 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_alignment: Option<usize>,

+    /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
+    #[clap(long)]
+    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
+
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -133,6 +137,10 @@ async fn main_impl(
        mgmt_api_client.put_io_alignment(align).await?;
    }

+    if let Some(mode) = &args.set_io_mode {
+        mgmt_api_client.put_io_mode(mode).await?;
+    }
+
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,7 +125,7 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
+    info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings");
    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    // The tenants directory contains all the pageserver local disk state.
@@ -173,7 +173,7 @@ fn main() -> anyhow::Result<()> {
        conf.virtual_file_io_engine,
        conf.io_buffer_alignment,
    );
-    page_cache::init(conf.page_cache_size);
+    page_cache::init(conf.page_cache_size, conf.io_buffer_alignment);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

@@ -575,7 +575,7 @@ fn start_pageserver(
            .build()
            .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
-        let server = hyper0::Server::from_tcp(http_listener)?
+        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
            .with_graceful_shutdown({
                let cancel = cancel.clone();
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,7 +174,7 @@ pub struct PageServerConf {
    pub l0_flush: crate::l0_flush::L0FlushConfig,

    /// Direct IO settings
-    pub virtual_file_direct_io: virtual_file::DirectIoMode,
+    pub virtual_file_io_mode: virtual_file::IoMode,

    pub io_buffer_alignment: usize,
 }
@@ -325,7 +325,7 @@ impl PageServerConf {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
-            virtual_file_direct_io,
+            virtual_file_io_mode,
            concurrent_tenant_warmup,
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
@@ -368,7 +368,6 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
-            virtual_file_direct_io,
            io_buffer_alignment,

            // ------------------------------------------------------------
@@ -408,6 +407,7 @@ impl PageServerConf {
            l0_flush: l0_flush
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
+            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
@@ -56,7 +57,6 @@ use utils::http::endpoint::request_span;
 use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

-use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -81,6 +81,7 @@ use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -1719,13 +1720,8 @@ async fn timeline_gc_handler(

    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

-    let state = get_state(&request);
-
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let gc_result = state
-        .tenant_manager
-        .immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)
-        .await?;
+    let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;

    json_response(StatusCode::OK, gc_result)
 }
@@ -1742,10 +1738,6 @@ async fn timeline_compact_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-
-    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
-        flags |= CompactFlags::ForceL0Compaction;
-    }
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
@@ -1792,9 +1784,6 @@ async fn timeline_checkpoint_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
-        flags |= CompactFlags::ForceL0Compaction;
-    }
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
@@ -2393,6 +2382,16 @@ async fn put_io_alignment_handler(
    json_response(StatusCode::OK, ())
 }

+async fn put_io_mode_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let mode: IoMode = json_request(&mut r).await?;
+    crate::virtual_file::set_io_mode(mode);
+    json_response(StatusCode::OK, ())
+}
+
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -3083,6 +3082,7 @@ pub fn make_router(
        .put("/v1/io_alignment", |r| {
            api_handler(r, put_io_alignment_handler)
        })
+        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,8 +13,6 @@ pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;

-extern crate hyper0 as hyper;
-
 use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -82,6 +82,7 @@ use once_cell::sync::OnceCell;
 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    virtual_file::{self, dio::IoBufferMut},
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -90,8 +91,8 @@ const TEST_PAGE_CACHE_SIZE: usize = 50;
 ///
 /// Initialize the page cache. This must be called once at page server startup.
 ///
-pub fn init(size: usize) {
-    if PAGE_CACHE.set(PageCache::new(size)).is_err() {
+pub fn init(size: usize, align: usize) {
+    if PAGE_CACHE.set(PageCache::new(size, align)).is_err() {
        panic!("page cache already initialized");
    }
 }
@@ -106,7 +107,12 @@ pub fn get() -> &'static PageCache {
    // page cache is usable in unit tests.
    //
    if cfg!(test) {
-        PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
+        PAGE_CACHE.get_or_init(|| {
+            PageCache::new(
+                TEST_PAGE_CACHE_SIZE,
+                virtual_file::get_io_buffer_alignment(),
+            )
+        })
    } else {
        PAGE_CACHE.get().expect("page cache not initialized")
    }
@@ -637,13 +643,11 @@ impl PageCache {
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
+    fn new(num_pages: usize, align: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
+        let page_buffer =
+            IoBufferMut::with_capacity_aligned_zeroed(num_pages * PAGE_SZ, align).leak();

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,7 +97,6 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
-use crate::walingest::WalLagCooldown;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
@@ -320,9 +319,6 @@ pub struct Tenant {
    /// background warmup.
    pub(crate) activate_now_sem: tokio::sync::Semaphore,

-    /// Time it took for the tenant to activate. Zero if not active yet.
-    attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
-
    // Cancellation token fires when we have entered shutdown().  This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,
@@ -1004,15 +1000,11 @@ impl Tenant {
                // Remote preload is complete.
                drop(remote_load_completion);

-
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
-                let attach_start = std::time::Instant::now();
                let attached = {
                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
-                let attach_duration = attach_start.elapsed();
-                _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));

                match attached {
                    Ok(()) => {
@@ -2762,7 +2754,6 @@ impl Tenant {
            pg_version,
            state,
            last_aux_file_policy,
-            self.attach_wal_lag_cooldown.clone(),
            self.cancel.child_token(),
        );

@@ -2869,7 +2860,6 @@ impl Tenant {
                Some(Duration::from_secs(3600 * 24)),
            )),
            activate_now_sem: tokio::sync::Semaphore::new(0),
-            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -8,6 +8,7 @@
 //! We cannot use global or default config instead, because wrong settings
 //! may lead to a data loss.
 //!
+use anyhow::bail;
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithmSettings;
@@ -440,6 +441,29 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    }
 }

+impl TryFrom<toml_edit::Item> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
+        match item {
+            toml_edit::Item::Value(value) => {
+                let d = value.into_deserializer();
+                return serde_path_to_error::deserialize(d)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            toml_edit::Item::Table(table) => {
+                let deserializer =
+                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
+                return serde_path_to_error::deserialize(deserializer)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            _ => {
+                bail!("expected non-inline table but found {item}")
+            }
+        }
+    }
+}
+
 /// This is a conversion from our internal tenant config object to the one used
 /// in external APIs.
 impl From<TenantConfOpt> for models::TenantConfig {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -84,7 +84,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = &self.buffered_writer.as_inner().as_inner().path;
+        let path = self.buffered_writer.as_inner().as_inner().path();
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
@@ -356,7 +356,7 @@ mod tests {
        }

        let file_contents =
-            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
+            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
        assert_eq!(file_contents, &content[0..cap]);

        let buffer_contents = file.buffered_writer.inspect_buffer();
@@ -392,7 +392,7 @@ mod tests {
            .buffered_writer
            .as_inner()
            .as_inner()
-            .path
+            .path()
            .metadata()
            .unwrap();
        assert_eq!(
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2197,82 +2197,6 @@ impl TenantManager {

        Ok((wanted_bytes, shard_count as u32))
    }
-
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
-    pub(crate) async fn immediate_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        gc_req: TimelineGcRequest,
-        cancel: CancellationToken,
-        ctx: &RequestContext,
-    ) -> Result<GcResult, ApiError> {
-        let tenant = {
-            let guard = self.tenants.read().unwrap();
-            guard
-                .get(&tenant_shard_id)
-                .cloned()
-                .with_context(|| format!("tenant {tenant_shard_id}"))
-                .map_err(|e| ApiError::NotFound(e.into()))?
-        };
-
-        let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
-        // Use tenant's pitr setting
-        let pitr = tenant.get_pitr_interval();
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        // Run in task_mgr to avoid race with tenant_detach operation
-        let ctx: RequestContext =
-            ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
-
-        let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
-
-        fail::fail_point!("immediate_gc_task_pre");
-
-        #[allow(unused_mut)]
-        let mut result = tenant
-            .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-            .await;
-        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
-        // better once the types support it.
-
-        #[cfg(feature = "testing")]
-        {
-            // we need to synchronize with drop completion for python tests without polling for
-            // log messages
-            if let Ok(result) = result.as_mut() {
-                let mut js = tokio::task::JoinSet::new();
-                for layer in std::mem::take(&mut result.doomed_layers) {
-                    js.spawn(layer.wait_drop());
-                }
-                tracing::info!(
-                    total = js.len(),
-                    "starting to wait for the gc'd layers to be dropped"
-                );
-                while let Some(res) = js.join_next().await {
-                    res.expect("wait_drop should not panic");
-                }
-            }
-
-            let timeline = tenant.get_timeline(timeline_id, false).ok();
-            let rtc = timeline.as_ref().map(|x| &x.remote_client);
-
-            if let Some(rtc) = rtc {
-                // layer drops schedule actions on remote timeline client to actually do the
-                // deletions; don't care about the shutdown error, just exit fast
-                drop(rtc.wait_completion().await);
-            }
-        }
-
-        result.map_err(|e| match e {
-            GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
-            GcError::TimelineNotFound => {
-                ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
-            }
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2417,7 +2341,7 @@ enum TenantSlotDropError {
 /// Errors that can happen any time we are walking the tenant map to try and acquire
 /// the TenantSlot for a particular tenant.
 #[derive(Debug, thiserror::Error)]
-pub(crate) enum TenantMapError {
+pub enum TenantMapError {
    // Tried to read while initializing
    #[error("tenant map is still initializing")]
    StillInitializing,
@@ -2447,7 +2371,7 @@ pub(crate) enum TenantMapError {
 /// The `old_value` may be dropped before the SlotGuard is dropped, by calling
 /// `drop_old_value`.  It is an error to call this without shutting down
 /// the conents of `old_value`.
-pub(crate) struct SlotGuard {
+pub struct SlotGuard {
    tenant_shard_id: TenantShardId,
    old_value: Option<TenantSlot>,
    upserted: bool,
@@ -2840,6 +2764,81 @@ use {
    utils::http::error::ApiError,
 };

+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
+pub(crate) async fn immediate_gc(
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    gc_req: TimelineGcRequest,
+    cancel: CancellationToken,
+    ctx: &RequestContext,
+) -> Result<GcResult, ApiError> {
+    let tenant = {
+        let guard = TENANTS.read().unwrap();
+        guard
+            .get(&tenant_shard_id)
+            .cloned()
+            .with_context(|| format!("tenant {tenant_shard_id}"))
+            .map_err(|e| ApiError::NotFound(e.into()))?
+    };
+
+    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
+    // Use tenant's pitr setting
+    let pitr = tenant.get_pitr_interval();
+
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    // Run in task_mgr to avoid race with tenant_detach operation
+    let ctx: RequestContext =
+        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+
+    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+
+    fail::fail_point!("immediate_gc_task_pre");
+
+    #[allow(unused_mut)]
+    let mut result = tenant
+        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+        .await;
+    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+    // better once the types support it.
+
+    #[cfg(feature = "testing")]
+    {
+        // we need to synchronize with drop completion for python tests without polling for
+        // log messages
+        if let Ok(result) = result.as_mut() {
+            let mut js = tokio::task::JoinSet::new();
+            for layer in std::mem::take(&mut result.doomed_layers) {
+                js.spawn(layer.wait_drop());
+            }
+            tracing::info!(
+                total = js.len(),
+                "starting to wait for the gc'd layers to be dropped"
+            );
+            while let Some(res) = js.join_next().await {
+                res.expect("wait_drop should not panic");
+            }
+        }
+
+        let timeline = tenant.get_timeline(timeline_id, false).ok();
+        let rtc = timeline.as_ref().map(|x| &x.remote_client);
+
+        if let Some(rtc) = rtc {
+            // layer drops schedule actions on remote timeline client to actually do the
+            // deletions; don't care about the shutdown error, just exit fast
+            drop(rtc.wait_completion().await);
+        }
+    }
+
+    result.map_err(|e| match e {
+        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
+        GcError::TimelineNotFound => {
+            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
+        }
+        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+    })
+}
+
 #[cfg(test)]
 mod tests {
    use std::collections::BTreeMap;
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -27,7 +27,7 @@ use crate::tenant::Generation;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
-use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;
@@ -153,9 +153,7 @@ async fn download_object<'a>(
                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                    .map_err(DownloadError::Other)?;

-                let download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
+                let download = storage.download(src_path, cancel).await?;

                pausable_failpoint!("before-downloading-layer-stream-pausable");

@@ -206,9 +204,7 @@ async fn download_object<'a>(
                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                    .map_err(DownloadError::Other)?;

-                let mut download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
+                let mut download = storage.download(src_path, cancel).await?;

                pausable_failpoint!("before-downloading-layer-stream-pausable");

@@ -348,9 +344,7 @@ async fn do_download_index_part(

    let index_part_bytes = download_retry_forever(
        || async {
-            let download = storage
-                .download(&remote_path, &DownloadOpts::default(), cancel)
-                .await?;
+            let download = storage.download(&remote_path, cancel).await?;

            let mut bytes = Vec::new();

@@ -532,15 +526,10 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = match storage
-                .download(&remote_path, &DownloadOpts::default(), cancel)
-                .await
-            {
+            let download = match storage.download(&remote_path, cancel).await {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
-                    storage
-                        .download(&remote_preserved_path, &DownloadOpts::default(), cancel)
-                        .await?
+                    storage.download(&remote_preserved_path, cancel).await?
                }
                Err(other) => Err(other)?,
            };
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -49,7 +49,7 @@ use futures::Future;
 use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};

 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -944,34 +944,36 @@ impl<'a> TenantDownloader<'a> {
    ) -> Result<HeatMapDownload, UpdateError> {
        debug_assert_current_span_has_tenant_id();
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // TODO: pull up etag check into the request, to do a conditional GET rather than
+        // issuing a GET and then maybe ignoring the response body
+        // (https://github.com/neondatabase/neon/issues/6199)
        tracing::debug!("Downloading heatmap for secondary tenant",);

        let heatmap_path = remote_heatmap_path(tenant_shard_id);
        let cancel = &self.secondary_state.cancel;
-        let opts = DownloadOpts {
-            etag: prev_etag.cloned(),
-        };

        backoff::retry(
            || async {
-                let download = match self
+                let download = self
                    .remote_storage
-                    .download(&heatmap_path, &opts, cancel)
+                    .download(&heatmap_path, cancel)
                    .await
-                {
-                    Ok(download) => download,
-                    Err(DownloadError::Unmodified) => return Ok(HeatMapDownload::Unmodified),
-                    Err(err) => return Err(err.into()),
-                };
+                    .map_err(UpdateError::from)?;

-                let mut heatmap_bytes = Vec::new();
-                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
-                Ok(HeatMapDownload::Modified(HeatMapModified {
-                    etag: download.etag,
-                    last_modified: download.last_modified,
-                    bytes: heatmap_bytes,
-                }))
+                SECONDARY_MODE.download_heatmap.inc();
+
+                if Some(&download.etag) == prev_etag {
+                    Ok(HeatMapDownload::Unmodified)
+                } else {
+                    let mut heatmap_bytes = Vec::new();
+                    let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+                    let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
+                    Ok(HeatMapDownload::Modified(HeatMapModified {
+                        etag: download.etag,
+                        last_modified: download.last_modified,
+                        bytes: heatmap_bytes,
+                    }))
+                }
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
            FAILED_DOWNLOAD_WARN_THRESHOLD,
@@ -982,7 +984,6 @@ impl<'a> TenantDownloader<'a> {
        .await
        .ok_or_else(|| UpdateError::Cancelled)
        .and_then(|x| x)
-        .inspect(|_| SECONDARY_MODE.download_heatmap.inc())
    }

    /// Download heatmap layers that are not present on local disk, or update their
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -43,17 +43,16 @@ use crate::tenant::vectored_blob_io::{
    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
@@ -573,7 +572,7 @@ impl DeltaLayerWriterInner {
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path,
+            file.path(),
            metadata.len()
        );

@@ -791,7 +790,7 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;

@@ -964,25 +963,14 @@ impl DeltaLayerInner {
                .blobs_at
                .as_slice()
                .iter()
-                .filter_map(|(_, blob_meta)| {
-                    if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
-                        // The size of values for these keys is unbounded and can
-                        // grow very large in pathological cases.
-                        None
-                    } else {
-                        Some(format!("{}@{}", blob_meta.key, blob_meta.lsn))
-                    }
-                })
+                .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                .join(", ");
-
-            if !offenders.is_empty() {
-                tracing::warn!(
-                    "Oversized vectored read ({} > {}) for keys {}",
-                    largest_read_size,
-                    read_size_soft_max,
-                    offenders
-                );
-            }
+            tracing::warn!(
+                "Oversized vectored read ({} > {}) for keys {}",
+                largest_read_size,
+                read_size_soft_max,
+                offenders
+            );
        }

        largest_read_size
@@ -1003,7 +991,8 @@ impl DeltaLayerInner {
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(BytesMut::with_capacity(buf_size));
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1022,7 +1011,7 @@ impl DeltaLayerInner {
                            blob_meta.key,
                            PageReconstructError::Other(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
@@ -1030,7 +1019,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(buf_size));
+                    buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

                    continue;
                }
@@ -1048,7 +1037,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to decompress blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1066,7 +1055,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1198,14 +1187,14 @@ impl DeltaLayerInner {
        let mut prev: Option<(Key, Lsn, BlobRef)> = None;

        let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
-        let align = virtual_file::get_io_buffer_alignment();

        let max_read_size = self
            .max_vectored_read_bytes
            .map(|x| x.0.get())
            .unwrap_or(8192);

-        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buffer = Some(IoBufferMut::with_capacity_aligned(max_read_size, align));

        // FIXME: buffering of DeltaLayerWriter
        let mut per_blob_copy = Vec::new();
@@ -1564,12 +1553,12 @@ impl<'a> DeltaLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let align = virtual_file::get_io_buffer_alignment();
+        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let blob_read = meta.read(&view).await?;
            let value = Value::des(&blob_read)?;
@@ -1944,7 +1933,9 @@ pub(crate) mod test {
                &vectored_reads,
                constants::MAX_VECTORED_READ_BYTES,
            );
-            let mut buf = Some(BytesMut::with_capacity(buf_size));
+
+            let align = virtual_file::get_io_buffer_alignment();
+            let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));

            for read in vectored_reads {
                let blobs_buf = vectored_blob_reader
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -40,16 +40,16 @@ use crate::tenant::vectored_blob_io::{
    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
@@ -389,7 +389,7 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
@@ -543,14 +543,15 @@ impl ImageLayerInner {
            .await?;

        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let align = virtual_file::get_io_buffer_alignment();
        let mut key_count = 0;
        for read in plan.into_iter() {
            let buf_size = read.size();

-            let buf = BytesMut::with_capacity(buf_size);
+            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-            let frozen_buf = blobs_buf.buf.freeze();
-            let view = BufView::new_bytes(frozen_buf);
+
+            let view = BufView::new_slice(&blobs_buf.buf);

            for meta in blobs_buf.blobs.iter() {
                let img_buf = meta.read(&view).await?;
@@ -588,34 +589,23 @@ impl ImageLayerInner {
                    .blobs_at
                    .as_slice()
                    .iter()
-                    .filter_map(|(_, blob_meta)| {
-                        if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
-                            // The size of values for these keys is unbounded and can
-                            // grow very large in pathological cases.
-                            None
-                        } else {
-                            Some(format!("{}@{}", blob_meta.key, blob_meta.lsn))
-                        }
-                    })
+                    .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                    .join(", ");
-
-                if !offenders.is_empty() {
-                    tracing::warn!(
-                        "Oversized vectored read ({} > {}) for keys {}",
-                        buf_size,
-                        max_vectored_read_bytes,
-                        offenders
-                    );
-                }
+                tracing::warn!(
+                    "Oversized vectored read ({} > {}) for keys {}",
+                    buf_size,
+                    max_vectored_read_bytes,
+                    offenders
+                );
            }

-            let buf = BytesMut::with_capacity(buf_size);
+            let align = virtual_file::get_io_buffer_alignment();
+            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

            match res {
                Ok(blobs_buf) => {
-                    let frozen_buf = blobs_buf.buf.freeze();
-                    let view = BufView::new_bytes(frozen_buf);
+                    let view = BufView::new_slice(&blobs_buf.buf);
                    for meta in blobs_buf.blobs.iter() {
                        let img_buf = meta.read(&view).await;

@@ -626,7 +616,7 @@ impl ImageLayerInner {
                                    meta.meta.key,
                                    PageReconstructError::Other(anyhow!(e).context(format!(
                                        "Failed to decompress blob from virtual file {}",
-                                        self.file.path,
+                                        self.file.path(),
                                    ))),
                                );

@@ -647,7 +637,7 @@ impl ImageLayerInner {
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
@@ -1051,12 +1041,12 @@ impl<'a> ImageLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let align = virtual_file::get_io_buffer_alignment();
+        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            next_batch.push_back((
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -442,13 +442,11 @@ impl Layer {
            // Visibility was modified to Visible: maybe log about this
            match ctx.task_kind() {
                TaskKind::CalculateSyntheticSize
-                | TaskKind::OndemandLogicalSizeCalculation
                | TaskKind::GarbageCollector
                | TaskKind::MgmtRequest => {
                    // This situation is expected in code paths do binary searches of the LSN space to resolve
                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
-                    // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
-                    // because it is run as a sub-task of synthetic size.
+                    // and on-demand for certain HTTP API requests.
                }
                _ => {
                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
@@ -458,8 +456,8 @@ impl Layer {
                    // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
                    // which was covered by a concurrent compaction.
                    tracing::info!(
-                        layer=%self,
-                        "became visible as a result of access",
+                        "Layer {} became visible as a result of access",
+                        self.0.desc.key()
                    );
                }
            }
@@ -688,9 +686,7 @@ impl Drop for LayerInner {
            // and we could be delaying shutdown for nothing.
        }

-        let timeline = self.timeline.upgrade();
-
-        if let Some(timeline) = timeline.as_ref() {
+        if let Some(timeline) = self.timeline.upgrade() {
            // Only need to decrement metrics if the timeline still exists: otherwise
            // it will have already de-registered these metrics via TimelineMetrics::shutdown
            if self.desc.is_delta() {
@@ -721,6 +717,7 @@ impl Drop for LayerInner {
        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().layer_name();
        let file_size = self.layer_desc().file_size;
+        let timeline = self.timeline.clone();
        let meta = self.metadata();
        let status = self.status.take();

@@ -730,7 +727,7 @@ impl Drop for LayerInner {
            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;

-            let Some(timeline) = timeline else {
+            let Some(timeline) = timeline.upgrade() else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,6 +48,7 @@ use utils::{
    sync::gate::{Gate, GateGuard},
 };

+use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
@@ -61,7 +62,6 @@ use std::{
    collections::btree_map::Entry,
    ops::{Deref, Range},
 };
-use std::{pin::pin, sync::OnceLock};

 use crate::{
    aux_file::AuxFileSizeEstimator,
@@ -71,7 +71,6 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
    },
-    walingest::WalLagCooldown,
    walredo,
 };
 use crate::{
@@ -430,8 +429,6 @@ pub struct Timeline {
    pub(crate) l0_flush_global_state: L0FlushGlobalState,

    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
-
-    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
 }

 pub struct WalReceiverInfo {
@@ -740,7 +737,6 @@ pub enum GetLogicalSizePriority {
 pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
-    ForceL0Compaction,
    EnhancedGcBottomMostCompaction,
    DryRun,
 }
@@ -2134,7 +2130,6 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        aux_file_policy: Option<AuxFilePolicy>,
-        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2276,8 +2271,6 @@ impl Timeline {
                l0_flush_global_state: resources.l0_flush_global_state,

                handles: Default::default(),
-
-                attach_wal_lag_cooldown,
            };

            if aux_file_policy == Some(AuxFilePolicy::V1) {
--- a/pageserver/src/tenant/timeline/analysis.rs
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -11,7 +11,6 @@ pub(crate) struct RangeAnalysis {
    has_image: bool,
    num_of_deltas_above_image: usize,
    total_num_of_deltas: usize,
-    num_of_l0: usize,
 }

 impl Timeline {
@@ -21,10 +20,8 @@ impl Timeline {
        let mut delta_ranges = Vec::new();
        let mut image_ranges = Vec::new();

-        let num_of_l0;
        let all_layer_files = {
            let guard = self.layers.read().await;
-            num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
            guard.all_persistent_layers()
        };
        let lsn = self.get_last_record_lsn();
@@ -85,7 +82,6 @@ impl Timeline {
                has_image: image_layer.is_some(),
                num_of_deltas_above_image: maybe_delta_layers.len(),
                total_num_of_deltas: pitr_delta_layers.len(),
-                num_of_l0,
            });
        }

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -353,13 +353,7 @@ impl Timeline {

                // 2. Compact
                let timer = self.metrics.compact_time_histo.start_timer();
-                let fully_compacted = self
-                    .compact_level0(
-                        target_file_size,
-                        flags.contains(CompactFlags::ForceL0Compaction),
-                        ctx,
-                    )
-                    .await?;
+                let fully_compacted = self.compact_level0(target_file_size, ctx).await?;
                timer.stop_and_record();

                let mut partitioning = dense_partitioning;
@@ -664,7 +658,6 @@ impl Timeline {
    async fn compact_level0(
        self: &Arc<Self>,
        target_file_size: u64,
-        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
    ) -> Result<bool, CompactionError> {
        let CompactLevel0Phase1Result {
@@ -686,15 +679,9 @@ impl Timeline {
            let now = tokio::time::Instant::now();
            stats.read_lock_acquisition_micros =
                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
-            self.compact_level0_phase1(
-                phase1_layers_locked,
-                stats,
-                target_file_size,
-                force_compaction_ignore_threshold,
-                &ctx,
-            )
-            .instrument(phase1_span)
-            .await?
+            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
+                .instrument(phase1_span)
+                .await?
        };

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
@@ -713,7 +700,6 @@ impl Timeline {
        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
-        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
        stats.read_lock_held_spawn_blocking_startup_micros =
@@ -725,26 +711,11 @@ impl Timeline {
        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
-            if force_compaction_ignore_threshold {
-                if !level0_deltas.is_empty() {
-                    info!(
-                        level0_deltas = level0_deltas.len(),
-                        threshold, "too few deltas to compact, but forcing compaction"
-                    );
-                } else {
-                    info!(
-                        level0_deltas = level0_deltas.len(),
-                        threshold, "too few deltas to compact, cannot force compaction"
-                    );
-                    return Ok(CompactLevel0Phase1Result::default());
-                }
-            } else {
-                debug!(
-                    level0_deltas = level0_deltas.len(),
-                    threshold, "too few deltas to compact"
-                );
-                return Ok(CompactLevel0Phase1Result::default());
-            }
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
        }

        let mut level0_deltas = level0_deltas
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -18,7 +18,7 @@
 use std::collections::BTreeMap;
 use std::ops::Deref;

-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -27,6 +27,7 @@ use utils::vec_map::VecMap;

 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
+use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::{self, VirtualFile};

 /// Metadata bundled with the start and end offset of a blob.
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob {
 /// Return type of [`VectoredBlobReader::read_blobs`]
 pub struct VectoredBlobsBuf {
    /// Buffer for all blobs in this read
-    pub buf: BytesMut,
+    pub buf: IoBufferMut,
    /// Offsets into the buffer and metadata for all blobs in this read
    pub blobs: Vec<VectoredBlob>,
 }
@@ -460,7 +461,7 @@ impl<'a> VectoredBlobReader<'a> {
    pub async fn read_blobs(
        &self,
        read: &VectoredRead,
-        buf: BytesMut,
+        buf: IoBufferMut,
        ctx: &RequestContext,
    ) -> Result<VectoredBlobsBuf, std::io::Error> {
        assert!(read.size() > 0);
@@ -945,7 +946,8 @@ mod tests {

        // Multiply by two (compressed data might need more space), and add a few bytes for the header
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
-        let mut buf = BytesMut::with_capacity(reserved_bytes);
+        let align = virtual_file::get_io_buffer_alignment();
+        let mut buf = IoBufferMut::with_capacity_aligned(reserved_bytes, align);

        let align = virtual_file::get_io_buffer_alignment();
        let vectored_blob_reader = VectoredBlobReader::new(&file);
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+#[cfg(target_os = "linux")]
+use std::os::unix::fs::OpenOptionsExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;

@@ -38,10 +40,11 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::DirectIoMode;
+pub(crate) use api::IoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
+pub(crate) mod dio;

 pub(crate) mod owned_buffers_io {
    //! Abstractions for IO with owned buffers.
@@ -53,6 +56,7 @@ pub(crate) mod owned_buffers_io {
    //! but for the time being we're proving out the primitives in the neon.git repo
    //! for faster iteration.

+    pub(crate) mod io_buf_aligned;
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
@@ -61,6 +65,176 @@ pub(crate) mod owned_buffers_io {
    }
 }

+#[derive(Debug)]
+pub enum VirtualFile {
+    Buffered(VirtualFileInner),
+    Direct(VirtualFileInner),
+}
+
+impl VirtualFile {
+    fn inner(&self) -> &VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+
+    fn inner_mut(&mut self) -> &mut VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+
+    fn into_inner(self) -> VirtualFileInner {
+        match self {
+            Self::Buffered(file) => file,
+            Self::Direct(file) => file,
+        }
+    }
+    /// Open a file in read-only mode. Like File::open.
+    pub async fn open<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::open(path, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    /// Open a file in read-only mode. Like File::open.
+    ///
+    /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`.
+    pub async fn open_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
+    }
+
+    pub async fn create<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::create(path, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    pub async fn create_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        VirtualFile::open_with_options_v2(
+            path.as_ref(),
+            OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
+        )
+        .await
+    }
+
+    pub async fn open_with_options<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &OpenOptions,
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+        Ok(Self::Buffered(file))
+    }
+
+    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`.
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let file = match get_io_mode() {
+            IoMode::Buffered => {
+                let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+                Self::Buffered(file)
+            }
+            #[cfg(target_os = "linux")]
+            IoMode::Direct => {
+                let file = VirtualFileInner::open_with_options(
+                    path,
+                    open_options.custom_flags(nix::libc::O_DIRECT),
+                    ctx,
+                )
+                .await?;
+                Self::Direct(file)
+            }
+        };
+        Ok(file)
+    }
+
+    pub fn path(&self) -> &Utf8Path {
+        self.inner().path.as_path()
+    }
+
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
+        content: B,
+    ) -> std::io::Result<()> {
+        VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
+    }
+
+    pub async fn sync_all(&self) -> Result<(), Error> {
+        self.inner().sync_all().await
+    }
+
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        self.inner().sync_data().await
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        self.inner().metadata().await
+    }
+
+    pub fn remove(self) {
+        self.into_inner().remove();
+    }
+
+    pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+        self.inner_mut().seek(pos).await
+    }
+
+    pub async fn read_exact_at<Buf>(
+        &self,
+        slice: Slice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<Slice<Buf>, Error>
+    where
+        Buf: IoBufMut + Send,
+    {
+        self.inner().read_exact_at(slice, offset, ctx).await
+    }
+
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        self.inner().read_exact_at_page(page, offset, ctx).await
+    }
+
+    pub async fn write_all_at<Buf: IoBuf + Send>(
+        &self,
+        buf: FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
+        self.inner().write_all_at(buf, offset, ctx).await
+    }
+
+    pub async fn write_all<Buf: IoBuf + Send>(
+        &mut self,
+        buf: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<usize, Error>) {
+        self.inner_mut().write_all(buf, ctx).await
+    }
+}
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -77,7 +251,7 @@ pub(crate) mod owned_buffers_io {
 /// 'tag' field is used to detect whether the handle still is valid or not.
 ///
 #[derive(Debug)]
-pub struct VirtualFile {
+pub struct VirtualFileInner {
    /// Lazy handle to the global file descriptor cache. The slot that this points to
    /// might contain our File, or it may be empty, or it may contain a File that
    /// belongs to a different VirtualFile.
@@ -350,12 +524,12 @@ macro_rules! with_file {
    }};
 }

-impl VirtualFile {
+impl VirtualFileInner {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

@@ -364,7 +538,7 @@ impl VirtualFile {
    pub async fn create<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(
            path.as_ref(),
            OpenOptions::new().write(true).create(true).truncate(true),
@@ -382,7 +556,7 @@ impl VirtualFile {
        path: P,
        open_options: &OpenOptions,
        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        let path_ref = path.as_ref();
        let path_str = path_ref.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -413,7 +587,7 @@ impl VirtualFile {
            open_options.open(path_ref.as_std_path()).await?
        });

-        // Strip all options other than read and write.
+        // Strip all options other than read and write (O_DIRECT).
        //
        // It would perhaps be nicer to check just for the read and write flags
        // explicitly, but OpenOptions doesn't contain any functions to read flags,
@@ -423,7 +597,7 @@ impl VirtualFile {
        reopen_options.create_new(false);
        reopen_options.truncate(false);

-        let vfile = VirtualFile {
+        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            pos: 0,
            path: path_ref.to_path_buf(),
@@ -1034,6 +1208,21 @@ impl tokio_epoll_uring::IoFd for FileGuard {

 #[cfg(test)]
 impl VirtualFile {
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
+        self.inner().read_blk(blknum, ctx).await
+    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner_mut().read_to_end(buf, ctx).await
+    }
+}
+
+#[cfg(test)]
+impl VirtualFileInner {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
@@ -1067,7 +1256,7 @@ impl VirtualFile {
    }
 }

-impl Drop for VirtualFile {
+impl Drop for VirtualFileInner {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
        let handle = self.handle.get_mut();
@@ -1216,6 +1405,15 @@ pub(crate) fn get_io_buffer_alignment() -> usize {
    }
 }

+static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
+
+pub(crate) fn set_io_mode(mode: IoMode) {
+    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
+}
+
+pub(crate) fn get_io_mode() -> IoMode {
+    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
+}
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
@@ -1524,7 +1722,7 @@ mod tests {
        // Open the file many times.
        let mut files = Vec::new();
        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(
+            let f = VirtualFileInner::open_with_options(
                &test_file_path,
                OpenOptions::new().read(true),
                &ctx,
@@ -1576,7 +1774,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1585,7 +1783,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1608,7 +1806,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();

--- a/pageserver/src/virtual_file/dio.rs
+++ b/pageserver/src/virtual_file/dio.rs
@@ -0,0 +1,410 @@
+#![allow(unused)]
+
+use core::slice;
+use std::{
+    alloc::{self, Layout},
+    cmp,
+    mem::{ManuallyDrop, MaybeUninit},
+    ops::{Deref, DerefMut},
+    ptr::{addr_of_mut, NonNull},
+};
+
+use bytes::buf::UninitSlice;
+
+struct IoBufferPtr(*mut u8);
+
+// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer.
+unsafe impl Send for IoBufferPtr {}
+
+/// An aligned buffer type used for I/O.
+pub struct IoBufferMut {
+    ptr: IoBufferPtr,
+    capacity: usize,
+    len: usize,
+    align: usize,
+}
+
+impl IoBufferMut {
+    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
+    ///
+    /// The buffer will be able to hold at most `capacity` elements and will never resize.
+    ///
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
+    /// * `align` must not be zero,
+    ///
+    /// * `align` must be a power of two,
+    ///
+    /// * `capacity`, when rounded up to the nearest multiple of `align`,
+    ///    must not overflow isize (i.e., the rounded value must be
+    ///    less than or equal to `isize::MAX`).
+    pub fn with_capacity_aligned(capacity: usize, align: usize) -> Self {
+        let layout = Layout::from_size_align(capacity, align).expect("Invalid layout");
+
+        // SAFETY:  Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
+        let ptr = unsafe {
+            let ptr = alloc::alloc(layout);
+            if ptr.is_null() {
+                alloc::handle_alloc_error(layout);
+            }
+            IoBufferPtr(ptr)
+        };
+
+        IoBufferMut {
+            ptr,
+            capacity,
+            len: 0,
+            align,
+        }
+    }
+
+
+    /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
+    pub fn with_capacity_aligned_zeroed(capacity: usize, align: usize) -> Self {
+        use bytes::BufMut;
+        let mut buf = Self::with_capacity_aligned(capacity, align);
+        buf.put_bytes(0, capacity);
+        buf.len = capacity;
+        buf
+    }
+
+    /// Returns the total number of bytes the buffer can hold.
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    /// Returns the alignment of the buffer.
+    #[inline]
+    pub fn align(&self) -> usize {
+        self.align
+    }
+
+    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Force the length of the buffer to `new_len`.
+    #[inline]
+    unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+        self.len = new_len;
+    }
+
+    #[inline]
+    fn as_ptr(&self) -> *const u8 {
+        self.ptr.0
+    }
+
+    #[inline]
+    fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr.0
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    fn as_slice(&self) -> &[u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+
+    /// Extracts a mutable slice of the entire buffer.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+
+    /// Drops the all the contents of the buffer, setting its length to `0`.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Reserves capacity for at least `additional` more bytes to be inserted
+    /// in the given `IoBufferMut`. The collection may reserve more space to
+    /// speculatively avoid frequent reallocations. After calling `reserve`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn reserve(&mut self, additional: usize) {
+        if additional > self.capacity() - self.len() {
+            self.reserve_inner(additional);
+        }
+    }
+
+    fn reserve_inner(&mut self, additional: usize) {
+        let Some(required_cap) = self.len().checked_add(additional) else {
+            capacity_overflow()
+        };
+
+        let old_capacity = self.capacity();
+        let align = self.align();
+        // This guarantees exponential growth. The doubling cannot overflow
+        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
+        let cap = cmp::max(old_capacity * 2, required_cap);
+
+        if !is_valid_alloc(cap) {
+            capacity_overflow()
+        }
+        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
+
+        let old_ptr = self.as_mut_ptr();
+
+        // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
+        // and we panics on null pointer.
+        let (ptr, cap) = unsafe {
+            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
+            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
+            if ptr.is_null() {
+                alloc::handle_alloc_error(new_layout);
+            }
+            (IoBufferPtr(ptr), cap)
+        };
+
+        self.ptr = ptr;
+        self.capacity = cap;
+    }
+
+
+    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
+    pub fn leak<'a>(self) -> &'a mut [u8] {
+        let mut buf = ManuallyDrop::new(self);
+        // SAFETY: leaking the buffer as intended.
+        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
+    }
+}
+
+fn capacity_overflow() -> ! {
+    panic!("capacity overflow")
+}
+
+// We need to guarantee the following:
+// * We don't ever allocate `> isize::MAX` byte-size objects.
+// * We don't overflow `usize::MAX` and actually allocate too little.
+//
+// On 64-bit we just need to check for overflow since trying to allocate
+// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
+// an extra guard for this in case we're running on a platform which can use
+// all 4GB in user-space, e.g., PAE or x32.
+#[inline]
+fn is_valid_alloc(alloc_size: usize) -> bool {
+    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
+}
+
+impl Drop for IoBufferMut {
+    fn drop(&mut self) {
+        // SAFETY: memory was allocated with std::alloc::alloc with the same layout.
+        unsafe {
+            alloc::dealloc(
+                self.as_mut_ptr(),
+                Layout::from_size_align_unchecked(self.capacity, self.align),
+            )
+        }
+    }
+}
+
+impl Deref for IoBufferMut {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl DerefMut for IoBufferMut {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.as_mut_slice()
+    }
+}
+
+/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized.
+unsafe impl bytes::BufMut for IoBufferMut {
+    #[inline]
+    fn remaining_mut(&self) -> usize {
+        // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`.
+        // Thus, it can have at most `self.capacity` bytes.
+        self.capacity() - self.len()
+    }
+
+    // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
+    #[inline]
+    unsafe fn advance_mut(&mut self, cnt: usize) {
+        let len: usize = self.len();
+        let remaining = self.remaining_mut();
+
+        if remaining < cnt {
+            panic_advance(cnt, remaining);
+        }
+
+        // Addition will not overflow since the sum is at most the capacity.
+        self.set_len(len + cnt);
+    }
+
+    #[inline]
+    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
+        let cap = self.capacity();
+        let len = self.len();
+
+        // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
+        // valid for `cap - len` bytes. The subtraction will not underflow since
+        // `len <= cap`.
+        unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) }
+    }
+}
+
+/// Panic with a nice error message.
+#[cold]
+fn panic_advance(idx: usize, len: usize) -> ! {
+    panic!(
+        "advance out of bounds: the len is {} but advancing by {}",
+        len, idx
+    );
+}
+
+/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer,
+/// and the location remains stable even if [`Self`] is moved.
+unsafe impl tokio_epoll_uring::IoBuf for IoBufferMut {
+    fn stable_ptr(&self) -> *const u8 {
+        self.as_ptr()
+    }
+
+    fn bytes_init(&self) -> usize {
+        self.len()
+    }
+
+    fn bytes_total(&self) -> usize {
+        self.capacity()
+    }
+}
+
+// SAFETY: See above.
+unsafe impl tokio_epoll_uring::IoBufMut for IoBufferMut {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, init_len: usize) {
+        if self.len() < init_len {
+            self.set_len(init_len);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_with_capacity_aligned() {
+        const ALIGN: usize = 4 * 1024;
+        let v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+
+        let v = IoBufferMut::with_capacity_aligned(ALIGN / 2, ALIGN);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN / 2);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    fn test_with_capacity_aligned_zeroed() {
+        const ALIGN: usize = 4 * 1024;
+        let v = IoBufferMut::with_capacity_aligned_zeroed(ALIGN, ALIGN);
+        assert_eq!(v.len(), ALIGN);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+        assert_eq!(&v[..], &[0; ALIGN])
+    }
+
+    #[test]
+    fn test_reserve() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
+        let capacity = v.capacity();
+        v.reserve(capacity);
+        assert_eq!(v.capacity(), capacity);
+        let data = [b'a'; ALIGN];
+        v.put(&data[..]);
+        v.reserve(capacity);
+        assert!(v.capacity() >= capacity * 2);
+        assert_eq!(&v[..], &data[..]);
+        let capacity = v.capacity();
+        v.clear();
+        v.reserve(capacity);
+        assert_eq!(capacity, v.capacity());
+    }
+
+    #[test]
+    fn test_bytes_put() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            for _ in 0..4 {
+                v.put(&x[..]);
+            }
+            assert_eq!(v.len(), ALIGN * 4);
+            assert_eq!(v.capacity(), ALIGN * 4);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bytes_put_panic() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
+        let x = [b'a'; ALIGN];
+        for _ in 0..5 {
+            v.put_slice(&x[..]);
+        }
+    }
+
+    #[test]
+    fn test_io_buf_put_slice() {
+        use tokio_epoll_uring::BoundedBufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            v.put_slice(&x[..]);
+            assert_eq!(v.len(), ALIGN);
+            assert_eq!(v.capacity(), ALIGN);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -0,0 +1,9 @@
+#![allow(unused)]
+
+use tokio_epoll_uring::IoBufMut;
+
+use crate::virtual_file::dio::IoBufferMut;
+
+pub(crate) trait IoBufAlignedMut: IoBufMut {}
+
+impl IoBufAlignedMut for IoBufferMut {}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -1,5 +1,6 @@
 //! See [`FullSlice`].

+use crate::virtual_file::dio::IoBufferMut;
 use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
@@ -76,3 +77,4 @@ macro_rules! impl_io_buf_ext {
 impl_io_buf_ext!(Bytes);
 impl_io_buf_ext!(BytesMut);
 impl_io_buf_ext!(Vec<u8>);
+impl_io_buf_ext!(IoBufferMut);
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,10 +21,7 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

-use std::sync::Arc;
-use std::sync::OnceLock;
 use std::time::Duration;
-use std::time::Instant;
 use std::time::SystemTime;

 use pageserver_api::shard::ShardIdentity;
@@ -72,29 +69,7 @@ impl CheckPoint {
    }
 }

-/// Temporary limitation of WAL lag warnings after attach
-///
-/// After tenant attach, we want to limit WAL lag warnings because
-/// we don't look at the WAL until the attach is complete, which
-/// might take a while.
-pub struct WalLagCooldown {
-    /// Until when should this limitation apply at all
-    active_until: std::time::Instant,
-    /// The maximum lag to suppress. Lags above this limit get reported anyways.
-    max_lag: Duration,
-}
-
-impl WalLagCooldown {
-    pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
-        Self {
-            active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
-            max_lag: attach_duration * 2 + Duration::from_secs(60),
-        }
-    }
-}
-
 pub struct WalIngest {
-    attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
    shard: ShardIdentity,
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
@@ -128,7 +103,6 @@ impl WalIngest {
            shard: *timeline.get_shard_identity(),
            checkpoint,
            checkpoint_modified: false,
-            attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
            warn_ingest_lag: WarnIngestLag {
                lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
@@ -1455,13 +1429,6 @@ impl WalIngest {
                    Ok(lag) => {
                        if lag > conf.wait_lsn_timeout {
                            rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
-                                if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
-                                    if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
-                                        return;
-                                    }
-                                } else {
-                                    // Still loading? We shouldn't be here
-                                }
                                let lag = humantime::format_duration(lag);
                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                            })
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -146,8 +146,6 @@ ConstructDeltaMessage()
 	if (RootTable.role_table)
 	{
 		JsonbValue	roles;
-		HASH_SEQ_STATUS status;
-		RoleEntry  *entry;

 		roles.type = jbvString;
 		roles.val.string.val = "roles";
@@ -155,6 +153,9 @@ ConstructDeltaMessage()
 		pushJsonbValue(&state, WJB_KEY, &roles);
 		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);

+		HASH_SEQ_STATUS status;
+		RoleEntry  *entry;
+
 		hash_seq_init(&status, RootTable.role_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
@@ -189,12 +190,10 @@ ConstructDeltaMessage()
 		}
 		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
 	}
-	{
-		JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
-		Jsonb	   *jsonb = JsonbValueToJsonb(result);
+	JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+	Jsonb	   *jsonb = JsonbValueToJsonb(result);

-		return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ );
-	}
+	return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ );
 }

 #define ERROR_SIZE 1024
@@ -273,28 +272,32 @@ SendDeltasToControlPlane()
 		curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
 	}

+	char	   *message = ConstructDeltaMessage();
+	ErrorString str;
+
+	str.size = 0;
+
+	curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str);
+
+	const int	num_retries = 5;
+	CURLcode	curl_status;
+
+	for (int i = 0; i < num_retries; i++)
+	{
+		if ((curl_status = curl_easy_perform(handle)) == 0)
+			break;
+		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
+		pg_usleep(1000 * 1000);
+	}
+	if (curl_status != CURLE_OK)
+	{
+		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
+	}
+	else
 	{
-		char	   *message = ConstructDeltaMessage();
-		ErrorString str;
-		const int	num_retries = 5;
-		CURLcode	curl_status;
 		long		response_code;

-		str.size = 0;
-
-		curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message);
-		curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str);
-
-		for (int i = 0; i < num_retries; i++)
-		{
-			if ((curl_status = curl_easy_perform(handle)) == 0)
-				break;
-			elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
-			pg_usleep(1000 * 1000);
-		}
-		if (curl_status != CURLE_OK)
-			elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
-
 		if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
 		{
 			if (response_code != 200)
@@ -373,11 +376,10 @@ MergeTable()

 	if (old_table->db_table)
 	{
+		InitDbTableIfNeeded();
 		DbEntry    *entry;
 		HASH_SEQ_STATUS status;

-		InitDbTableIfNeeded();
-
 		hash_seq_init(&status, old_table->db_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
@@ -419,11 +421,10 @@ MergeTable()

 	if (old_table->role_table)
 	{
+		InitRoleTableIfNeeded();
 		RoleEntry  *entry;
 		HASH_SEQ_STATUS status;

-		InitRoleTableIfNeeded();
-
 		hash_seq_init(&status, old_table->role_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
@@ -514,12 +515,9 @@ RoleIsNeonSuperuser(const char *role_name)
 static void
 HandleCreateDb(CreatedbStmt *stmt)
 {
+	InitDbTableIfNeeded();
 	DefElem    *downer = NULL;
 	ListCell   *option;
-	bool		found = false;
-	DbEntry    *entry;
-
-	InitDbTableIfNeeded();

 	foreach(option, stmt->options)
 	{
@@ -528,11 +526,13 @@ HandleCreateDb(CreatedbStmt *stmt)
 		if (strcmp(defel->defname, "owner") == 0)
 			downer = defel;
 	}
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);

-	entry = hash_search(CurrentDdlTable->db_table,
-						stmt->dbname,
-						HASH_ENTER,
-						&found);
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));

@@ -554,24 +554,21 @@ HandleCreateDb(CreatedbStmt *stmt)
 static void
 HandleAlterOwner(AlterOwnerStmt *stmt)
 {
-	const char *name;
-	bool		found = false;
-	DbEntry    *entry;
-	const char *new_owner;
-
 	if (stmt->objectType != OBJECT_DATABASE)
 		return;
 	InitDbTableIfNeeded();
+	const char *name = strVal(stmt->object);
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									name,
+									HASH_ENTER,
+									&found);

-	name = strVal(stmt->object);
-	entry = hash_search(CurrentDdlTable->db_table,
-						name,
-						HASH_ENTER,
-						&found);
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
+	const char *new_owner = get_rolespec_name(stmt->newowner);

-	new_owner = get_rolespec_name(stmt->newowner);
 	if (RoleIsNeonSuperuser(new_owner))
 		elog(ERROR, "can't alter owner to neon_superuser");
 	entry->owner = get_role_oid(new_owner, false);
@@ -581,23 +578,21 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
 static void
 HandleDbRename(RenameStmt *stmt)
 {
-	bool		found = false;
-	DbEntry    *entry;
-	DbEntry    *entry_for_new_name;
-
 	Assert(stmt->renameType == OBJECT_DATABASE);
 	InitDbTableIfNeeded();
-	entry = hash_search(CurrentDdlTable->db_table,
-						stmt->subname,
-						HASH_FIND,
-						&found);
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+	DbEntry    *entry_for_new_name = hash_search(
+												 CurrentDdlTable->db_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);

-	entry_for_new_name = hash_search(CurrentDdlTable->db_table,
-									 stmt->newname,
-									 HASH_ENTER,
-									 NULL);
 	entry_for_new_name->type = Op_Set;
-
 	if (found)
 	{
 		if (entry->old_name[0] != '\0')
@@ -605,7 +600,8 @@ HandleDbRename(RenameStmt *stmt)
 		else
 			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
 		entry_for_new_name->owner = entry->owner;
-		hash_search(CurrentDdlTable->db_table,
+		hash_search(
+					CurrentDdlTable->db_table,
 					stmt->subname,
 					HASH_REMOVE,
 					NULL);
@@ -620,15 +616,14 @@ HandleDbRename(RenameStmt *stmt)
 static void
 HandleDropDb(DropdbStmt *stmt)
 {
-	bool		found = false;
-	DbEntry    *entry;
-
 	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);

-	entry = hash_search(CurrentDdlTable->db_table,
-						stmt->dbname,
-						HASH_ENTER,
-						&found);
 	entry->type = Op_Delete;
 	entry->owner = InvalidOid;
 	if (!found)
@@ -638,14 +633,16 @@ HandleDropDb(DropdbStmt *stmt)
 static void
 HandleCreateRole(CreateRoleStmt *stmt)
 {
+	InitRoleTableIfNeeded();
 	bool		found = false;
-	RoleEntry  *entry;
-	DefElem    *dpass;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role,
+									HASH_ENTER,
+									&found);
+	DefElem    *dpass = NULL;
 	ListCell   *option;

-	InitRoleTableIfNeeded();
-
-	dpass = NULL;
 	foreach(option, stmt->options)
 	{
 		DefElem    *defel = lfirst(option);
@@ -653,11 +650,6 @@ HandleCreateRole(CreateRoleStmt *stmt)
 		if (strcmp(defel->defname, "password") == 0)
 			dpass = defel;
 	}
-
-	entry = hash_search(CurrentDdlTable->role_table,
-						stmt->role,
-						HASH_ENTER,
-						&found);
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 	if (dpass && dpass->arg)
@@ -670,18 +662,14 @@ HandleCreateRole(CreateRoleStmt *stmt)
 static void
 HandleAlterRole(AlterRoleStmt *stmt)
 {
-	const char *role_name = stmt->role->rolename;
-	DefElem    *dpass;
-	ListCell   *option;
-	bool		found = false;
-	RoleEntry  *entry;
-
 	InitRoleTableIfNeeded();
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+	const char *role_name = stmt->role->rolename;

 	if (RoleIsNeonSuperuser(role_name) && !superuser())
 		elog(ERROR, "can't ALTER neon_superuser");

-	dpass = NULL;
 	foreach(option, stmt->options)
 	{
 		DefElem    *defel = lfirst(option);
@@ -692,11 +680,13 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	/* We only care about updates to the password */
 	if (!dpass)
 		return;
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									role_name,
+									HASH_ENTER,
+									&found);

-	entry = hash_search(CurrentDdlTable->role_table,
-						role_name,
-						HASH_ENTER,
-						&found);
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 	if (dpass->arg)
@@ -709,22 +699,20 @@ HandleAlterRole(AlterRoleStmt *stmt)
 static void
 HandleRoleRename(RenameStmt *stmt)
 {
-	bool		found = false;
-	RoleEntry  *entry;
-	RoleEntry  *entry_for_new_name;
-
-	Assert(stmt->renameType == OBJECT_ROLE);
 	InitRoleTableIfNeeded();
+	Assert(stmt->renameType == OBJECT_ROLE);
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);

-	entry = hash_search(CurrentDdlTable->role_table,
-						stmt->subname,
-						HASH_FIND,
-						&found);
-
-	entry_for_new_name = hash_search(CurrentDdlTable->role_table,
-									 stmt->newname,
-									 HASH_ENTER,
-									 NULL);
+	RoleEntry  *entry_for_new_name = hash_search(
+												 CurrentDdlTable->role_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);

 	entry_for_new_name->type = Op_Set;
 	if (found)
@@ -750,9 +738,8 @@ HandleRoleRename(RenameStmt *stmt)
 static void
 HandleDropRole(DropRoleStmt *stmt)
 {
-	ListCell   *item;
-
 	InitRoleTableIfNeeded();
+	ListCell   *item;

 	foreach(item, stmt->roles)
 	{
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -42,7 +42,6 @@

 #include "hll.h"
 #include "bitmap.h"
-#include "neon.h"

 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

@@ -170,15 +169,11 @@ lfc_disable(char const *op)

 		if (lfc_desc > 0)
 		{
-			int			rc;
-
 			/*
 			 * If the reason of error is ENOSPC, then truncation of file may
 			 * help to reclaim some space
 			 */
-			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE);
-			rc = ftruncate(lfc_desc, 0);
-			pgstat_report_wait_end();
+			int			rc = ftruncate(lfc_desc, 0);

 			if (rc < 0)
 				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
@@ -668,6 +663,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
+	bool		result = true;
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
@@ -773,10 +769,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (iteration_hits != 0)
 		{
-			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
 			rc = preadv(lfc_desc, iov, blocks_in_chunk,
 						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-			pgstat_report_wait_end();

 			if (rc != (BLCKSZ * blocks_in_chunk))
 			{
@@ -926,10 +920,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				/* We can reuse a hole that was left behind when the LFC was shrunk previously */
 				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
 				uint32		offset = hole->offset;
-				bool		hole_found;
+				bool		found;
 	
-				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found);
-				CriticalAssert(hole_found);
+				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+				CriticalAssert(found);
 	
 				lfc_ctl->used += 1;
 				entry->offset = offset;	/* reuse the hole */
@@ -950,11 +944,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);

-		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
 		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
 					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-		pgstat_report_wait_end();
-
 		if (rc != BLCKSZ * blocks_in_chunk)
 		{
 			lfc_disable("write");
@@ -1005,7 +996,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 	Datum		result;
 	HeapTuple	tuple;
 	char const *key;
-	uint64		value = 0;
+	uint64		value;
 	Datum		values[NUM_NEON_GET_STATS_COLS];
 	bool		nulls[NUM_NEON_GET_STATS_COLS];

--- a/pgxn/neon/hll.c
+++ b/pgxn/neon/hll.c
@@ -116,6 +116,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 {
 	uint8		count;
 	uint32		index;
+	size_t		i;
+	size_t		j;

 	TimestampTz	now = GetCurrentTimestamp();
 	/* Use the first "k" (registerWidth) bits as a zero based index */
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -89,6 +89,7 @@ typedef struct

 #if PG_VERSION_NUM >= 150000
 static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static void walproposer_shmem_request(void);
 #endif
 static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
@@ -440,8 +441,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			return false;
 		}
 		shard->state = PS_Connecting_Startup;
+		/* fallthrough */
 	}
-	/* FALLTHROUGH */
 	case PS_Connecting_Startup:
 	{
 		char	   *pagestream_query;
@@ -452,6 +453,8 @@ pageserver_connect(shardno_t shard_no, int elevel)

 		do
 		{
+			WaitEvent	event;
+
 			switch (poll_result)
 			{
 			default: /* unknown/unused states are handled as a failed connection */
@@ -487,7 +490,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
 											   PQsocket(shard->conn),
 											   0,
-											   WAIT_EVENT_NEON_PS_STARTING);
+											   PG_WAIT_EXTENSION);
 					elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
 					if (rc & WL_LATCH_SET)
 					{
@@ -509,7 +512,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
 											   PQsocket(shard->conn),
 											   0,
-											   WAIT_EVENT_NEON_PS_STARTING);
+											   PG_WAIT_EXTENSION);
 					elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
 					if (rc & WL_LATCH_SET)
 					{
@@ -582,8 +585,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}

 		shard->state = PS_Connecting_PageStream;
+		/* fallthrough */
 	}
-	/* FALLTHROUGH */
 	case PS_Connecting_PageStream:
 	{
 		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
@@ -605,8 +608,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
-									WAIT_EVENT_NEON_PS_CONFIGURING);
+			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -628,8 +630,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}

 		shard->state = PS_Connected;
+		/* fallthrough */
 	}
-	/* FALLTHROUGH */
 	case PS_Connected:
 		/*
 		 * We successfully connected. Future connections to this PageServer
@@ -654,8 +656,7 @@ static int
 call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
-	PageServer *shard = &page_servers[shard_no];
-	PGconn	   *pageserver_conn = shard->conn;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -665,8 +666,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
-								WAIT_EVENT_NEON_PS_READ);
+		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -937,7 +937,7 @@ PagestoreShmemInit(void)

 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 	pagestore_shared = ShmemInitStruct("libpagestore shared state",
-									   sizeof(PagestoreShmemState),
+									   PagestoreShmemSize(),
 									   &found);
 	if (!found)
 	{
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -41,9 +41,6 @@
 #include "pagestore_client.h"
 #include "control_plane_connector.h"
 #include "walsender_hooks.h"
-#if PG_MAJORVERSION_NUM >= 16
-#include "storage/ipc.h"
-#endif

 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -52,23 +49,6 @@ static int	logical_replication_max_snap_files = 300;

 static int  running_xacts_overflow_policy;

-#if PG_MAJORVERSION_NUM >= 16
-static shmem_startup_hook_type prev_shmem_startup_hook;
-
-static void neon_shmem_startup_hook(void);
-#endif
-#if PG_MAJORVERSION_NUM >= 17
-uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
-uint32		WAIT_EVENT_NEON_LFC_READ;
-uint32		WAIT_EVENT_NEON_LFC_TRUNCATE;
-uint32		WAIT_EVENT_NEON_LFC_WRITE;
-uint32		WAIT_EVENT_NEON_PS_STARTING;
-uint32		WAIT_EVENT_NEON_PS_CONFIGURING;
-uint32		WAIT_EVENT_NEON_PS_SEND;
-uint32		WAIT_EVENT_NEON_PS_READ;
-uint32		WAIT_EVENT_NEON_WAL_DL;
-#endif
-
 enum RunningXactsOverflowPolicies {
 	OP_IGNORE,
 	OP_SKIP,
@@ -655,9 +635,6 @@ _PG_init(void)
 	 */
 #if PG_VERSION_NUM >= 160000
 	load_file("$libdir/neon_rmgr", false);
-
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = neon_shmem_startup_hook;
 #endif

 	pg_init_libpagestore();
@@ -744,25 +721,3 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
-
-#if PG_MAJORVERSION_NUM >= 16
-static void
-neon_shmem_startup_hook(void)
-{
-	/* Initialize */
-	if (prev_shmem_startup_hook)
-		prev_shmem_startup_hook();
-
-#if PG_PG_MAJORVERSION_NUM >= 17
-	WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
-	WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
-	WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate");
-	WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write");
-	WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting");
-	WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring");
-	WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO");
-	WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
-	WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
-#endif
-}
-#endif
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,8 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.5'
+# TODO: bump default version to 1.5, after we are certain that we don't
+# need to rollback the compute image
+default_version = '1.4'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -12,7 +12,6 @@
 #ifndef NEON_H
 #define NEON_H
 #include "access/xlogreader.h"
-#include "utils/wait_event.h"

 /* GUCs */
 extern char *neon_auth_token;
@@ -23,28 +22,6 @@ extern char *wal_acceptors_list;
 extern int	wal_acceptor_reconnect_timeout;
 extern int	wal_acceptor_connection_timeout;

-#if PG_MAJORVERSION_NUM >= 17
-extern uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
-extern uint32		WAIT_EVENT_NEON_LFC_READ;
-extern uint32		WAIT_EVENT_NEON_LFC_TRUNCATE;
-extern uint32		WAIT_EVENT_NEON_LFC_WRITE;
-extern uint32		WAIT_EVENT_NEON_PS_STARTING;
-extern uint32		WAIT_EVENT_NEON_PS_CONFIGURING;
-extern uint32		WAIT_EVENT_NEON_PS_SEND;
-extern uint32		WAIT_EVENT_NEON_PS_READ;
-extern uint32		WAIT_EVENT_NEON_WAL_DL;
-#else
-#define WAIT_EVENT_NEON_LFC_MAINTENANCE	PG_WAIT_EXTENSION
-#define WAIT_EVENT_NEON_LFC_READ		WAIT_EVENT_BUFFILE_READ
-#define WAIT_EVENT_NEON_LFC_TRUNCATE	WAIT_EVENT_BUFFILE_TRUNCATE
-#define WAIT_EVENT_NEON_LFC_WRITE		WAIT_EVENT_BUFFILE_WRITE
-#define WAIT_EVENT_NEON_PS_STARTING		PG_WAIT_EXTENSION
-#define WAIT_EVENT_NEON_PS_CONFIGURING	PG_WAIT_EXTENSION
-#define WAIT_EVENT_NEON_PS_SEND			PG_WAIT_EXTENSION
-#define WAIT_EVENT_NEON_PS_READ			PG_WAIT_EXTENSION
-#define WAIT_EVENT_NEON_WAL_DL			WAIT_EVENT_WAL_READ
-#endif
-
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -27,8 +27,7 @@ NeonPerfCountersShmemSize(void)
 {
 	Size		size = 0;

-	size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
-								   sizeof(neon_per_backend_counters)));
+	size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));

 	return size;
 }
@@ -40,7 +39,7 @@ NeonPerfCountersShmemInit(void)

 	neon_per_backend_counters_shared =
 		ShmemInitStruct("Neon perf counters",
-						mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
+						mul_size(MaxBackends,
 								 sizeof(neon_per_backend_counters)),
 						&found);
 	Assert(found == IsUnderPostmaster);
@@ -94,6 +93,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
 	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
 	uint64		bucket_accum;
 	int			i = 0;
+	Datum		getpage_wait_str;

 	metrics[i].name = "getpage_wait_seconds_count";
 	metrics[i].is_bucket = false;
@@ -137,7 +137,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
 	metrics[i].is_bucket = false;
 	metrics[i].value = (double) counters->pageserver_requests_sent_total;
 	i++;
-	metrics[i].name = "pageserver_disconnects_total";
+	metrics[i].name = "pageserver_requests_disconnects_total";
 	metrics[i].is_bucket = false;
 	metrics[i].value = (double) counters->pageserver_disconnects_total;
 	i++;
@@ -192,7 +192,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
 	/* We put all the tuples into a tuplestore in one go. */
 	InitMaterializedSRF(fcinfo, 0);

-	for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
+	for (int procno = 0; procno < MaxBackends; procno++)
 	{
 		PGPROC	   *proc = GetPGProcByNumber(procno);
 		int			pid = proc->pid;
@@ -223,6 +223,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
 	Datum		values[3];
 	bool		nulls[3];
+	Datum		getpage_wait_str;
 	neon_per_backend_counters totals = {0};
 	metric_t   *metrics;

@@ -230,7 +231,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 	InitMaterializedSRF(fcinfo, 0);

 	/* Aggregate the counters across all backends */
-	for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
+	for (int procno = 0; procno < MaxBackends; procno++)
 	{
 		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];

--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -96,14 +96,6 @@ typedef struct
 /* Pointer to the shared memory array of neon_per_backend_counters structs */
 extern neon_per_backend_counters *neon_per_backend_counters_shared;

-/*
- * Size of the perf counters array in shared memory. One slot for each backend
- * and aux process. IOW one for each PGPROC slot, except for slots reserved
- * for prepared transactions, because they're not real processes and cannot do
- * I/O.
- */
-#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS)
-
 #if PG_VERSION_NUM >= 170000
 #define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
 #else
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -7,7 +7,6 @@
 #define NEON_PGVERSIONCOMPAT_H

 #include "fmgr.h"
-#include "storage/buf_internals.h"

 #if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
@@ -21,24 +20,11 @@
 	NInfoGetRelNumber(a) == NInfoGetRelNumber(b) \
 )

-/* These macros were turned into static inline functions in v16 */
+/* buftag population & RelFileNode/RelFileLocator rework */
 #if PG_MAJORVERSION_NUM < 16
-static inline bool
-BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
-{
-	return BUFFERTAGS_EQUAL(*tag1, *tag2);
-}

-static inline void
-InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
-			  ForkNumber forkNum, BlockNumber blockNum)
-{
-	INIT_BUFFERTAG(*tag, *rnode, forkNum, blockNum);
-}
-#endif
+#define InitBufferTag(tag, rfn, fn, bn) INIT_BUFFERTAG(*tag, *rfn, fn, bn)

-/* RelFileNode -> RelFileLocator rework */
-#if PG_MAJORVERSION_NUM < 16
 #define USE_RELFILENODE

 #define RELFILEINFO_HDR "storage/relfilenode.h"
@@ -87,6 +73,8 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

 #define USE_RELFILELOCATOR

+#define BUFFERTAGS_EQUAL(a, b) BufferTagsEqual(&(a), &(b))
+
 #define RELFILEINFO_HDR "storage/relfilelocator.h"

 #define NRelFileInfo RelFileLocator
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -213,6 +213,32 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);

+/* Neon storage manager functionality */
+
+extern void neon_init(void);
+extern void neon_open(SMgrRelation reln);
+extern void neon_close(SMgrRelation reln, ForkNumber forknum);
+extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool neon_exists(SMgrRelation reln, ForkNumber forknum);
+extern void neon_unlink(NRelFileInfoBackend rnode, ForkNumber forknum, bool isRedo);
+#if PG_MAJORVERSION_NUM < 16
+extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, char *buffer, bool skipFsync);
+#else
+extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
+							BlockNumber blocknum, int nbuffers, bool skipFsync);
+#endif
+
+#if PG_MAJORVERSION_NUM >=17
+extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
+						  BlockNumber blocknum, int nblocks);
+#else
+extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
+						  BlockNumber blocknum);
+#endif
+
 /*
 * LSN values associated with each request to the pageserver
 */
@@ -252,7 +278,13 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum,
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, void *buffer);
 #endif
+extern void neon_writeback(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
 extern int64 neon_dbsize(Oid dbNode);
+extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
+						  BlockNumber nblocks);
+extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);

 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -118,8 +118,6 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

-static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
-
 /*
 * Prefetch implementation:
 *
@@ -217,7 +215,7 @@ typedef struct PrfHashEntry
 	sizeof(BufferTag) \
 )

-#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&(a)->buftag, &(b)->buftag))
+#define SH_EQUAL(tb, a, b)	(BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
 #define SH_SCOPE			static inline
 #define SH_DEFINE
 #define SH_DECLARE
@@ -738,7 +736,7 @@ static void
 prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
-	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
+	uint64		mySlotNo = slot->my_ring_index;

 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
@@ -805,19 +803,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 						  bool is_prefetch)
 {
 	uint64		min_ring_index;
-	PrefetchRequest hashkey;
+	PrefetchRequest req;
 #if USE_ASSERT_CHECKING
 	bool		any_hits = false;
 #endif
 	/* We will never read further ahead than our buffer can store. */
 	nblocks = Max(1, Min(nblocks, readahead_buffer_size));

-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	hashkey.buftag = tag;
+	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
+	req.buftag = tag;

 Retry:
 	min_ring_index = UINT64_MAX;
@@ -843,8 +837,8 @@ Retry:
 		slot = NULL;
 		entry = NULL;

-		hashkey.buftag.blockNum = tag.blockNum + i;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+		req.buftag.blockNum = tag.blockNum + i;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 		if (entry != NULL)
 		{
@@ -855,7 +849,7 @@ Retry:
 			Assert(slot->status != PRFS_UNUSED);
 			Assert(MyPState->ring_last <= ring_index &&
 				   ring_index < MyPState->ring_unused);
-			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
+			Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag));

 			/*
 			 * If the caller specified a request LSN to use, only accept
@@ -892,19 +886,12 @@ Retry:
 				{
 					min_ring_index = Min(min_ring_index, ring_index);
 					/* The buffered request is good enough, return that index */
-					if (is_prefetch)
-						pgBufferUsage.prefetch.duplicates++;
-					else
-						pgBufferUsage.prefetch.hits++;
+					pgBufferUsage.prefetch.duplicates++;
 					continue;
 				}
 			}
 		}
-		else if (!is_prefetch)
-		{
-			pgBufferUsage.prefetch.misses += 1;
-			MyNeonCounters->getpage_prefetch_misses_total++;
-		}
+
 		/*
 		 * We can only leave the block above by finding that there's
 		 * no entry that can satisfy this request, either because there
@@ -987,7 +974,7 @@ Retry:
 		 * We must update the slot data before insertion, because the hash
 		 * function reads the buffer tag from the slot.
 		 */
-		slot->buftag = hashkey.buftag;
+		slot->buftag = req.buftag;
 		slot->shard_no = get_shard_number(&tag);
 		slot->my_ring_index = ring_index;

@@ -1465,6 +1452,7 @@ log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
 	BlockNumber	blknos[XLR_MAX_BLOCK_ID];
 	Page		pageptrs[XLR_MAX_BLOCK_ID];
 	int			nregistered = 0;
+	XLogRecPtr	result = 0;

 	for (int i = 0; i < nblocks; i++)
 	{
@@ -1777,7 +1765,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 /*
 *	neon_init() -- Initialize private state
 */
-static void
+void
 neon_init(void)
 {
 	Size		prfs_size;
@@ -1785,20 +1773,6 @@ neon_init(void)
 	if (MyPState != NULL)
 		return;

-	/*
-	 * Sanity check that theperf counters array is sized correctly. We got
-	 * this wrong once, and the formula for max number of backends and aux
-	 * processes might well change in the future, so better safe than sorry.
-	 * This is a very cheap check so we do it even without assertions.  On
-	 * v14, this gets called before initializing MyProc, so we cannot perform
-	 * the check here. That's OK, we don't expect the logic to change in old
-	 * releases.
-	 */
-#if PG_VERSION_NUM>=150000
-	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
-		elog(ERROR, "MyNeonCounters points past end of array");
-#endif
-
 	prfs_size = offsetof(PrefetchState, prf_buffer) +
 		sizeof(PrefetchRequest) * readahead_buffer_size;

@@ -2167,7 +2141,7 @@ neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 /*
 *	neon_exists() -- Does the physical file exist?
 */
-static bool
+bool
 neon_exists(SMgrRelation reln, ForkNumber forkNum)
 {
 	bool		exists;
@@ -2273,7 +2247,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
-static void
+void
 neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
 	switch (reln->smgr_relpersistence)
@@ -2349,7 +2323,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
-static void
+void
 neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 {
 	/*
@@ -2373,7 +2347,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
-static void
+void
 #if PG_MAJORVERSION_NUM < 16
 neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			char *buffer, bool skipFsync)
@@ -2465,7 +2439,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 }

 #if PG_MAJORVERSION_NUM >= 16
-static void
+void
 neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 				int nblocks, bool skipFsync)
 {
@@ -2561,7 +2535,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 /*
 *  neon_open() -- Initialize newly-opened relation.
 */
-static void
+void
 neon_open(SMgrRelation reln)
 {
 	/*
@@ -2579,7 +2553,7 @@ neon_open(SMgrRelation reln)
 /*
 *	neon_close() -- Close the specified relation, if it isn't closed already.
 */
-static void
+void
 neon_close(SMgrRelation reln, ForkNumber forknum)
 {
 	/*
@@ -2594,12 +2568,13 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 /*
 *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
-static bool
+bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			  int nblocks)
 {
 	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
+	bool		io_initiated = false;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2623,6 +2598,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	while (nblocks > 0)
 	{
 		int		iterblocks = Min(nblocks, PG_IOV_MAX);
+		int		seqlen = 0;
 		bits8		lfc_present[PG_IOV_MAX / 8];
 		memset(lfc_present, 0, sizeof(lfc_present));

@@ -2634,6 +2610,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			continue;
 		}

+		io_initiated = true;
+
 		tag.blockNum = blocknum;
 		
 		for (int i = 0; i < PG_IOV_MAX / 8; i++)
@@ -2656,7 +2634,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
 *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
-static bool
+bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
@@ -2700,7 +2678,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 * This accepts a range of blocks because flushing several pages at once is
 * considerably more efficient than doing so individually.
 */
-static void
+void
 neon_writeback(SMgrRelation reln, ForkNumber forknum,
 			   BlockNumber blocknum, BlockNumber nblocks)
 {
@@ -2750,19 +2728,14 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 	uint64		ring_index;
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
-	PrefetchRequest hashkey;
+	BufferTag	buftag = {0};

 	Assert(PointerIsValid(request_lsns));
 	Assert(nblocks >= 1);

-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
-	hashkey.buftag.forkNum = forkNum;
-	hashkey.buftag.blockNum = base_blockno;
+	CopyNRelFileInfoToBufTag(buftag, rinfo);
+	buftag.forkNum = forkNum;
+	buftag.blockNum = base_blockno;

 	/*
 	 * The redo process does not lock pages that it needs to replay but are
@@ -2780,7 +2753,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
 	 * value of the LwLsn cache when the entry is not found.
 	 */
-	prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
+	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);

 	for (int i = 0; i < nblocks; i++)
 	{
@@ -2801,8 +2774,8 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 		 * Try to find prefetched page in the list of received pages.
 		 */
 Retry:
-		hashkey.buftag.blockNum = blockno;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+		buftag.blockNum = blockno;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 		if (entry != NULL)
 		{
@@ -2810,6 +2783,7 @@ Retry:
 			if (neon_prefetch_response_usable(reqlsns, slot))
 			{
 				ring_index = slot->my_ring_index;
+				pgBufferUsage.prefetch.hits += 1;
 			}
 			else
 			{
@@ -2839,7 +2813,10 @@ Retry:
 		{
 			if (entry == NULL)
 			{
-				ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
+				pgBufferUsage.prefetch.misses += 1;
+				MyNeonCounters->getpage_prefetch_misses_total++;
+
+				ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
 				Assert(ring_index != UINT64_MAX);
 				slot = GetPrfSlot(ring_index);
 			}
@@ -2864,8 +2841,8 @@ Retry:
 		} while (!prefetch_wait_for(ring_index));

 		Assert(slot->status == PRFS_RECEIVED);
-		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
-		Assert(hashkey.buftag.blockNum == base_blockno + i);
+		Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0);
+		Assert(buftag.blockNum == base_blockno + i);

 		resp = slot->response;

@@ -2921,10 +2898,10 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 *	neon_read() -- Read the specified block from a relation.
 */
 #if PG_MAJORVERSION_NUM < 16
-static void
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
-static void
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
@@ -3033,7 +3010,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 #endif /* PG_MAJORVERSION_NUM <= 16 */

 #if PG_MAJORVERSION_NUM >= 17
-static void
+void
 neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		void **buffers, BlockNumber nblocks)
 {
@@ -3068,9 +3045,6 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
 								  nblocks, read);

-	if (lfc_result > 0)
-		MyNeonCounters->file_cache_hits_total += lfc_result;
-
 	/* Read all blocks from LFC, so we're done */
 	if (lfc_result == nblocks)
 		return;
@@ -3197,7 +3171,6 @@ hexdump_page(char *page)
 }
 #endif

-#if PG_MAJORVERSION_NUM < 17
 /*
 *	neon_write() -- Write the supplied block at the appropriate location.
 *
@@ -3205,7 +3178,7 @@ hexdump_page(char *page)
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
-static void
+void
 #if PG_MAJORVERSION_NUM < 16
 neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
 #else
@@ -3271,12 +3244,11 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 		#endif
 #endif
 }
-#endif



 #if PG_MAJORVERSION_NUM >= 17
-static void
+void
 neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
@@ -3326,7 +3298,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 /*
 *	neon_nblocks() -- Get the number of blocks stored in a relation.
 */
-static BlockNumber
+BlockNumber
 neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	NeonResponse *resp;
@@ -3463,7 +3435,7 @@ neon_dbsize(Oid dbNode)
 /*
 *	neon_truncate() -- Truncate relation to specified number of blocks.
 */
-static void
+void
 neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
 	XLogRecPtr	lsn;
@@ -3532,7 +3504,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 * crash before the next checkpoint syncs the newly-inactive segment, that
 * segment may survive recovery, reintroducing unwanted data into the table.
 */
-static void
+void
 neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 	switch (reln->smgr_relpersistence)
@@ -3562,8 +3534,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 }

 #if PG_MAJORVERSION_NUM >= 17
-static void
-neon_registersync(SMgrRelation reln, ForkNumber forknum)
+void
+neon_regisersync(SMgrRelation reln, ForkNumber forknum)
 {
 	switch (reln->smgr_relpersistence)
 	{
@@ -3747,8 +3719,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	SlruKind	kind;
 	int			n_blocks;
 	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
-	NeonResponse *resp;
-	NeonGetSlruSegmentRequest request;

 	/*
 	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
@@ -3787,7 +3757,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	else
 		return -1;

-	request = (NeonGetSlruSegmentRequest) {
+	NeonResponse *resp;
+	NeonGetSlruSegmentRequest request = {
 		.req.tag = T_NeonGetSlruSegmentRequest,
 		.req.lsn = request_lsn,
 		.req.not_modified_since = not_modified_since,
@@ -3894,7 +3865,7 @@ static const struct f_smgr neon_smgr =
 	.smgr_truncate = neon_truncate,
 	.smgr_immedsync = neon_immedsync,
 #if PG_MAJORVERSION_NUM >= 17
-	.smgr_registersync = neon_registersync,
+	.smgr_registersync = neon_regisersync,
 #endif
 	.smgr_start_unlogged_build = neon_start_unlogged_build,
 	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -213,7 +213,7 @@ WalProposerPoll(WalProposer *wp)
 		rc = wp->api.wait_event_set(wp, timeout, &sk, &events);

 		/* Exit loop if latch is set (we got new WAL) */
-		if (rc == 1 && (events & WL_LATCH_SET))
+		if ((rc == 1 && events & WL_LATCH_SET))
 			break;

 		/*
@@ -252,6 +252,8 @@ WalProposerPoll(WalProposer *wp)
 		/* timeout expired: poll state */
 		if (rc == 0 || TimeToReconnect(wp, now) <= 0)
 		{
+			TimestampTz now;
+
 			/*
 			 * If no WAL was generated during timeout (and we have already
 			 * collected the quorum), then send empty keepalive message
@@ -267,7 +269,8 @@ WalProposerPoll(WalProposer *wp)
 			now = wp->api.get_current_timestamp(wp);
 			for (int i = 0; i < wp->n_safekeepers; i++)
 			{
-				sk = &wp->safekeeper[i];
+				Safekeeper *sk = &wp->safekeeper[i];
+
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wp->config->safekeeper_connection_timeout))
 				{
@@ -1077,7 +1080,7 @@ SendProposerElected(Safekeeper *sk)
 	ProposerElected msg;
 	TermHistory *th;
 	term_t		lastCommonTerm;
-	int			idx;
+	int			i;

 	/* Now that we are ready to send it's a good moment to create WAL reader */
 	wp->api.wal_reader_allocate(sk);
@@ -1096,15 +1099,15 @@ SendProposerElected(Safekeeper *sk)
 	/* We must start somewhere. */
 	Assert(wp->propTermHistory.n_entries >= 1);

-	for (idx = 0; idx < Min(wp->propTermHistory.n_entries, th->n_entries); idx++)
+	for (i = 0; i < Min(wp->propTermHistory.n_entries, th->n_entries); i++)
 	{
-		if (wp->propTermHistory.entries[idx].term != th->entries[idx].term)
+		if (wp->propTermHistory.entries[i].term != th->entries[i].term)
 			break;
 		/* term must begin everywhere at the same point */
-		Assert(wp->propTermHistory.entries[idx].lsn == th->entries[idx].lsn);
+		Assert(wp->propTermHistory.entries[i].lsn == th->entries[i].lsn);
 	}
-	idx--;						/* step back to the last common term */
-	if (idx < 0)
+	i--;						/* step back to the last common term */
+	if (i < 0)
 	{
 		/* safekeeper is empty or no common point, start from the beginning */
 		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
@@ -1125,14 +1128,14 @@ SendProposerElected(Safekeeper *sk)
 		 * proposer, LSN it is currently writing, but then we just pick
 		 * safekeeper pos as it obviously can't be higher.
 		 */
-		if (wp->propTermHistory.entries[idx].term == wp->propTerm)
+		if (wp->propTermHistory.entries[i].term == wp->propTerm)
 		{
 			sk->startStreamingAt = sk->voteResponse.flushLsn;
 		}
 		else
 		{
-			XLogRecPtr	propEndLsn = wp->propTermHistory.entries[idx + 1].lsn;
-			XLogRecPtr	skEndLsn = (idx + 1 < th->n_entries ? th->entries[idx + 1].lsn : sk->voteResponse.flushLsn);
+			XLogRecPtr	propEndLsn = wp->propTermHistory.entries[i + 1].lsn;
+			XLogRecPtr	skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : sk->voteResponse.flushLsn);

 			sk->startStreamingAt = Min(propEndLsn, skEndLsn);
 		}
@@ -1146,7 +1149,7 @@ SendProposerElected(Safekeeper *sk)
 	msg.termHistory = &wp->propTermHistory;
 	msg.timelineStartLsn = wp->timelineStartLsn;

-	lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
+	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
 	wp_log(LOG,
 		   "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
 		   sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
@@ -1638,7 +1641,7 @@ UpdateDonorShmem(WalProposer *wp)
 * Process AppendResponse message from safekeeper.
 */
 static void
-HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
+HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
 {
 	XLogRecPtr	candidateTruncateLsn;
 	XLogRecPtr	newCommitLsn;
@@ -1657,7 +1660,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	 * and WAL is committed by the quorum. BroadcastAppendRequest() should be
 	 * called to notify safekeepers about the new commitLsn.
 	 */
-	wp->api.process_safekeeper_feedback(wp, fromsk);
+	wp->api.process_safekeeper_feedback(wp, sk);

 	/*
 	 * Try to advance truncateLsn -- the last record flushed to all
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -725,7 +725,7 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
 extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);

-extern WalproposerShmemState *GetWalpropShmemState(void);
+extern WalproposerShmemState *GetWalpropShmemState();

 /*
 * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
@@ -745,7 +745,7 @@ extern TimeLineID walprop_pg_get_timeline_id(void);
 * catch logging.
 */
 #ifdef WALPROPOSER_LIB
-extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...) pg_attribute_printf(3, 4);
+extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
 #define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
 #else
 #define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -286,9 +286,6 @@ safekeepers_cmp(char *old, char *new)
 static void
 assign_neon_safekeepers(const char *newval, void *extra)
 {
-	char	   *newval_copy;
-	char	   *oldval;
-
 	if (!am_walproposer)
 		return;

@@ -298,8 +295,8 @@ assign_neon_safekeepers(const char *newval, void *extra)
 	}

 	/* Copy values because we will modify them in split_safekeepers_list() */
-	newval_copy = pstrdup(newval);
-	oldval = pstrdup(wal_acceptors_list);
+	char *newval_copy = pstrdup(newval);
+	char *oldval = pstrdup(wal_acceptors_list);

 	/* 
 	 * TODO: restarting through FATAL is stupid and introduces 1s delay before
@@ -425,9 +422,6 @@ backpressure_throttling_impl(void)
 	TimestampTz start,
 				stop;
 	bool		retry = false;
-	char	   *new_status = NULL;
-	const char *old_status;
-	int			len;

 	if (PointerIsValid(PrevProcessInterruptsCallback))
 		retry = PrevProcessInterruptsCallback();
@@ -448,24 +442,14 @@ backpressure_throttling_impl(void)
 	if (lag == 0)
 		return retry;

-
-	old_status = get_ps_display(&len);
-	new_status = (char *) palloc(len + 64 + 1);
-	memcpy(new_status, old_status, len);
-	snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
-	set_ps_display(new_status);
-	new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
+	/* Suspend writers until replicas catch up */
+	set_ps_display("backpressure throttling");

 	elog(DEBUG2, "backpressure throttling: lag %lu", lag);
 	start = GetCurrentTimestamp();
 	pg_usleep(BACK_PRESSURE_DELAY);
 	stop = GetCurrentTimestamp();
 	pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start);
-
-	/* Reset ps display */
-	set_ps_display(new_status);
-	pfree(new_status);
-
 	return true;
 }

@@ -541,7 +525,7 @@ nwp_shmem_startup_hook(void)
 }

 WalproposerShmemState *
-GetWalpropShmemState(void)
+GetWalpropShmemState()
 {
 	Assert(walprop_shared != NULL);
 	return walprop_shared;
@@ -1817,7 +1801,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	 * If wait is terminated by latch set (walsenders' latch is set on each
 	 * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH)
 	 */
-	if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger)
+	if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger)
 	{
 		/* Reset our latch */
 		ResetLatch(MyLatch);
@@ -1829,7 +1813,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	 * If the event contains something about the socket, it means we got an
 	 * event from a safekeeper socket.
 	 */
-	if (rc == 1 && (event.events & WL_SOCKET_MASK))
+	if (rc == 1 && (event.events & (WL_SOCKET_MASK)))
 	{
 		*sk = (Safekeeper *) event.user_data;
 		*events = event.events;
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -160,7 +160,7 @@ NeonWALPageRead(
 							  WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
 							  sock,
 							  timeout_ms,
-							  WAIT_EVENT_NEON_WAL_DL);
+							  WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
@@ -191,14 +191,13 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)

 	if (!wal_reader)
 	{
-		XLogRecPtr	basebackupLsn = GetRedoStartLsn();
+		XLogRecPtr	epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);

-		/* should never happen */
-		if (basebackupLsn == 0)
+		if (epochStartLsn == 0)
 		{
-			elog(ERROR, "unable to start walsender when basebackupLsn is 0");
+			elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
 		}
-		wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ");
+		wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
 	}
 	xlr->page_read = NeonWALPageRead;
 	xlr->segment_open = NeonWALReadSegmentOpen;
--- a/pgxn/neon_rmgr/neon_rmgr_desc.c
+++ b/pgxn/neon_rmgr/neon_rmgr_desc.c
@@ -44,6 +44,27 @@ infobits_desc(StringInfo buf, uint8 infobits, const char *keyname)
 	appendStringInfoString(buf, "]");
 }

+static void
+truncate_flags_desc(StringInfo buf, uint8 flags)
+{
+	appendStringInfoString(buf, "flags: [");
+
+	if (flags & XLH_TRUNCATE_CASCADE)
+		appendStringInfoString(buf, "CASCADE, ");
+	if (flags & XLH_TRUNCATE_RESTART_SEQS)
+		appendStringInfoString(buf, "RESTART_SEQS, ");
+
+	if (buf->data[buf->len - 1] == ' ')
+	{
+		/* Truncate-away final unneeded ", "  */
+		Assert(buf->data[buf->len - 2] == ',');
+		buf->len -= 2;
+		buf->data[buf->len] = '\0';
+	}
+
+	appendStringInfoString(buf, "]");
+}
+
 void
 neon_rm_desc(StringInfo buf, XLogReaderState *record)
 {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Yuchen Liang	e565c4fbe9	Merge branch 'yuchen/direct-io-aligned-alloc' into yuchen/direct-io-aligned-alloc-usage-wip	2024-10-07 13:08:55 -04:00
Yuchen Liang	a46757b769	add with_capacity_aligned_zeroed and leak Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-07 13:06:00 -04:00
Yuchen Liang	9c32bfee3b	fix put_io_mode to use the correct http endpoint Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 16:47:36 -04:00
Yuchen Liang	69ef8caf58	simplify virtual file wrapper Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:31:55 +00:00
Yuchen Liang	b7443dd643	add set_io_mode option to getpage_latest_lsn Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:16:42 +00:00
Yuchen Liang	cc433c76a3	fix clippy Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:05:01 +00:00
Yuchen Liang	2034ec906a	remove unused imports Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 12:04:11 +00:00
Yuchen Liang	f48ab8bcaa	use O_DIRECT as preferred Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 03:57:58 +00:00
Yuchen Liang	2607a57990	pageserver: add direct io config to virtual file Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-01 03:57:13 +00:00
Yuchen Liang	f04c1c230c	incr len in with_capacity_aligned_zeroed add a test Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 23:47:00 +00:00
Yuchen Liang	13f1931a09	fix build Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:18:29 +00:00
Yuchen Liang	e98a4eb5e2	add safety comments Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	e01d145066	remove example; add with_capacity_aligned_zeroed Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	9e9d76d6f2	use IoBufferMut for pagecache Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	14ec379d2b	enable O_DIRECT for delta and image layers Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	ebfe88a463	use IoBufferMut for delta and image layers Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-29 21:00:05 +00:00
Yuchen Liang	eb16aa9e81	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-29 16:59:27 -04:00
Yuchen Liang	f6d0ed6454	implement reserve for IoBufferMut Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-26 14:12:49 +00:00
Yuchen Liang	a2be8a440b	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-24 21:29:33 -04:00
Yuchen Liang	ff4a1db223	make sure we can Send IoBufferMut Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 23:52:34 -04:00
Yuchen Liang	29d54ccd20	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-09-03 11:41:49 -04:00
Yuchen Liang	68a1fe20f2	review: use doc comments to reference struct in safety comment Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-09-03 11:41:02 -04:00
Yuchen Liang	e8408c797a	remove unused comments Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 11:38:58 -04:00
Yuchen Liang	027f28deb9	remove Vec dependency Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-09-03 11:38:00 -04:00
Yuchen Liang	ea6f9798c6	add safety comment Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-16 18:06:16 +00:00
Yuchen Liang	253e4d5843	Merge branch 'main' into yuchen/direct-io-aligned-alloc	2024-08-16 13:14:20 -04:00
Yuchen Liang	852099bc83	remove aligned-vec, use ManuallyDrop<Vec<u8>> Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-16 17:13:30 +00:00
Yuchen Liang	148e230d11	add mut version marker trait Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-15 04:03:13 +00:00
Yuchen Liang	6d664788c1	feat(pageserver): newtype aligned-vec as aligned buffer allocation Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-08-15 03:43:58 +00:00