Proxy release 2025-05-12 14:35 UTC

chore(compute): skip audit logs for pg_session_jwt extension (#11883 )
references https://github.com/neondatabase/cloud/issues/28480#issuecomment-2866961124 related https://github.com/neondatabase/cloud/issues/28863 cc @MihaiBojin @conradludgate
2026-05-27 18:10:37 +00:00 · 2025-05-12 14:35:27 +00:00 · 2025-05-12 11:24:33 +00:00 · 2025-05-12 09:17:35 +00:00 · 2025-05-12 07:25:54 +00:00 · 2025-05-10 15:06:06 +00:00
37 changed files with 1153 additions and 398 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1303,6 +1303,7 @@ dependencies = [
 "futures",
 "http 1.1.0",
 "indexmap 2.0.1",
+ "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
 "nix 0.27.1",
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -28,6 +28,7 @@ flate2.workspace = true
 futures.workspace = true
 http.workspace = true
 indexmap.workspace = true
+itertools.workspace = true
 jsonwebtoken.workspace = true
 metrics.workspace = true
 nix.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -60,12 +60,16 @@ use utils::failpoint_support;
 // Compatibility hack: if the control plane specified any remote-ext-config
 // use the default value for extension storage proxy gateway.
 // Remove this once the control plane is updated to pass the gateway URL
-fn parse_remote_ext_config(arg: &str) -> Result<String> {
-    if arg.starts_with("http") {
-        Ok(arg.trim_end_matches('/').to_string())
+fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
+    const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
+        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
+
+    Ok(if arg.starts_with("http") {
+        arg
    } else {
-        Ok("http://pg-ext-s3-gateway".to_string())
+        FALLBACK_PG_EXT_GATEWAY_BASE_URL
    }
+    .to_owned())
 }

 #[derive(Parser)]
@@ -74,8 +78,10 @@ struct Cli {
    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
    pub pgbin: String,

-    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
-    pub remote_ext_config: Option<String>,
+    /// The base URL for the remote extension storage proxy gateway.
+    /// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
+    pub remote_ext_base_url: Option<String>,

    /// The port to bind the external listening HTTP server to. Clients running
    /// outside the compute will talk to the compute through this port. Keep
@@ -164,7 +170,7 @@ fn main() -> Result<()> {
            pgversion: get_pg_version_string(&cli.pgbin),
            external_http_port: cli.external_http_port,
            internal_http_port: cli.internal_http_port,
-            ext_remote_storage: cli.remote_ext_config.clone(),
+            remote_ext_base_url: cli.remote_ext_base_url.clone(),
            resize_swap_on_bind: cli.resize_swap_on_bind,
            set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
            #[cfg(target_os = "linux")]
@@ -265,4 +271,18 @@ mod test {
    fn verify_cli() {
        Cli::command().debug_assert()
    }
+
+    #[test]
+    fn parse_pg_ext_gateway_base_url() {
+        let arg = "http://pg-ext-s3-gateway2";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(result, arg);
+
+        let arg = "pg-ext-s3-gateway";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(
+            result,
+            "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
+        );
+    }
 }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,6 +11,7 @@ use compute_api::spec::{
 use futures::StreamExt;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
+use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
@@ -18,7 +19,7 @@ use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::net::SocketAddr;
 use std::os::unix::fs::{PermissionsExt, symlink};
 use std::path::Path;
@@ -95,7 +96,7 @@ pub struct ComputeNodeParams {
    pub internal_http_port: u16,

    /// the address of extension storage proxy gateway
-    pub ext_remote_storage: Option<String>,
+    pub remote_ext_base_url: Option<String>,
 }

 /// Compute node info shared across several `compute_ctl` threads.
@@ -1896,9 +1897,9 @@ LIMIT 100",
        real_ext_name: String,
        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
-        let ext_remote_storage =
+        let remote_ext_base_url =
            self.params
-                .ext_remote_storage
+                .remote_ext_base_url
                .as_ref()
                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
                    "Remote extensions storage is not configured",
@@ -1960,7 +1961,7 @@ LIMIT 100",
        let download_size = extension_server::download_extension(
            &real_ext_name,
            &ext_path,
-            ext_remote_storage,
+            remote_ext_base_url,
            &self.params.pgbin,
        )
        .await
@@ -1995,23 +1996,40 @@ LIMIT 100",
        tokio::spawn(conn);

        // TODO: support other types of grants apart from schemas?
-        let query = format!(
-            "GRANT {} ON SCHEMA {} TO {}",
-            privileges
-                .iter()
-                // should not be quoted as it's part of the command.
-                // is already sanitized so it's ok
-                .map(|p| p.as_str())
-                .collect::<Vec<&'static str>>()
-                .join(", "),
-            // quote the schema and role name as identifiers to sanitize them.
-            schema_name.pg_quote(),
-            role_name.pg_quote(),
-        );
-        db_client
-            .simple_query(&query)
+
+        // check the role grants first - to gracefully handle read-replicas.
+        let select = "SELECT privilege_type
+            FROM pg_namespace
+                JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
+                JOIN pg_user users ON acl.grantee = users.usesysid
+            WHERE users.usename = $1
+                AND nspname = $2";
+        let rows = db_client
+            .query(select, &[role_name, schema_name])
            .await
-            .with_context(|| format!("Failed to execute query: {}", query))?;
+            .with_context(|| format!("Failed to execute query: {select}"))?;
+
+        let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
+
+        let grants = privileges
+            .iter()
+            .filter(|p| !already_granted.contains(p.as_str()))
+            // should not be quoted as it's part of the command.
+            // is already sanitized so it's ok
+            .map(|p| p.as_str())
+            .join(", ");
+
+        if !grants.is_empty() {
+            // quote the schema and role name as identifiers to sanitize them.
+            let schema_name = schema_name.pg_quote();
+            let role_name = role_name.pg_quote();
+
+            let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
+            db_client
+                .simple_query(&query)
+                .await
+                .with_context(|| format!("Failed to execute query: {}", query))?;
+        }

        Ok(())
    }
@@ -2069,7 +2087,7 @@ LIMIT 100",
        &self,
        spec: &ComputeSpec,
    ) -> Result<RemoteExtensionMetrics> {
-        if self.params.ext_remote_storage.is_none() {
+        if self.params.remote_ext_base_url.is_none() {
            return Ok(RemoteExtensionMetrics {
                num_ext_downloaded: 0,
                largest_ext_size: 0,
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -224,7 +224,10 @@ pub fn write_postgres_conf(
            writeln!(file, "pgaudit.log_rotation_age=5")?;

            // Enable audit logs for pg_session_jwt extension
-            writeln!(file, "pg_session_jwt.audit_log=on")?;
+            // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as
+            // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863
+            //
+            // writeln!(file, "pg_session_jwt.audit_log=on")?;

            // Add audit shared_preload_libraries, if they are not present.
            //
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    ext_remote_storage: &str,
+    remote_ext_base_url: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);

    // TODO add retry logic
    let download_buffer =
-        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+        match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
            Ok(buffer) => buffer,
            Err(error_message) => {
                return Err(anyhow::anyhow!(
@@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 // Do request to extension storage proxy, e.g.,
 // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
 // using HTTP GET and return the response body as bytes.
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
-    let uri = format!("{}/{}", ext_remote_storage, ext_path);
+async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", remote_ext_base_url, ext_path);
    let filename = Path::new(ext_path)
        .file_name()
        .unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension(
    State(compute): State<Arc<ComputeNode>>,
 ) -> Response {
    // Don't even try to download extensions if no remote storage is configured
-    if compute.params.ext_remote_storage.is_none() {
+    if compute.params.remote_ext_base_url.is_none() {
        return JsonResponse::error(
            StatusCode::PRECONDITION_FAILED,
            "remote storage is not configured",
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -644,9 +644,10 @@ struct EndpointStartCmdArgs {

    #[clap(
        long,
-        help = "Configure the remote extensions storage proxy gateway to request for extensions."
+        help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
+        alias = "remote-ext-config"
    )]
-    remote_ext_config: Option<String>,
+    remote_ext_base_url: Option<String>,

    #[clap(
        long,
@@ -1414,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
        EndpointCmd::Start(args) => {
            let endpoint_id = &args.endpoint_id;
            let pageserver_id = args.endpoint_pageserver_id;
-            let remote_ext_config = &args.remote_ext_config;
+            let remote_ext_base_url = &args.remote_ext_base_url;

            let default_generation = env
                .storage_controller
@@ -1517,7 +1518,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    safekeepers_generation,
                    safekeepers,
                    pageservers,
-                    remote_ext_config.as_ref(),
+                    remote_ext_base_url.as_ref(),
                    stripe_size.0 as usize,
                    args.create_test_user,
                    args.start_timeout,
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -655,7 +655,7 @@ impl Endpoint {
        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
        pageservers: Vec<(Host, u16)>,
-        remote_ext_config: Option<&String>,
+        remote_ext_base_url: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
        start_timeout: Duration,
@@ -825,8 +825,8 @@ impl Endpoint {
        .stderr(logfile.try_clone()?)
        .stdout(logfile);

-        if let Some(remote_ext_config) = remote_ext_config {
-            cmd.args(["--remote-ext-config", remote_ext_config]);
+        if let Some(remote_ext_base_url) = remote_ext_base_url {
+            cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
        }

        let child = cmd.spawn()?;
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898'
 remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
 control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
 control_plane_emergency_mode=true
+virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -7,6 +7,8 @@ Author: Christian Schwarz

 A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.

+**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link).
+
 # Motivation

 During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
--- a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md
+++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md
@@ -0,0 +1,362 @@
+# Direct IO For Pageserver
+
+Date: Apr 30, 2025
+
+## Summary
+
+This document is a retroactive RFC. It
+- provides some background on what direct IO is,
+- motivates why Pageserver should be using it for its IO, and
+- describes how we changed Pageserver to use it.
+
+The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR.
+
+People primarily involved in this project were:
+- Yuchen Liang <yuchen@neon.tech>
+- Vlad Lazar <vlad@neon.tech>
+- Christian Schwarz <christian@neon.tech>
+
+## Timeline
+
+For posterity, here is the rough timeline of the development work that got us to where we are today.
+
+- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API
+- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode
+- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks
+  - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests
+  - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users
+- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go.
+- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376))
+- Feb to March 2025: roll out batching, then concurrent+direct IO => read path and InMemoryLayer are now direct IO
+- Apr 2025: develop & roll out direct IO for the write path
+
+## Background: Terminology & Glossary
+
+**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents.
+The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k).
+The cache lives in kernel memory and is not directly accessible through userspace.
+
+**Buffered IO**: an application's read/write system calls go through the kernel page cache.
+For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents
+at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict
+a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes
+from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps
+track of the fact that the page is now "dirty" in some ancillary structure.
+
+**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications
+made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel
+asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant
+ones are a) explicit request by userspace (`fsync`) and b) memory pressure.
+
+**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity.
+If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations.
+Before reusing a page like that, the page has to be written back (writeback, see above).
+The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only
+way to get that memory is by eviction & re-using a dirty page cache page.
+Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`.
+I refer to this effect as the "malloc latency backscatter" caused by buffered IO.
+
+**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem
+is still involved because it is ultimately in charge of mapping the concept of files & offsets within them
+to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers
+and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155).
+The IO operations will fail at runtime with EINVAL if the alignment requirements are not met.
+
+**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and
+fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers,
+kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by
+the application.
+It takes more effort by the application to program with direct instead of buffered IO.
+The return is precise control over and a clear distinction between consumption/modification of memory vs disk.
+
+**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache").
+Its caching unit is 8KiB blocks of the layer files written by Pageserver.
+A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer.
+The default size is tiny (64MiB), very much like Postgres's `shared_buffers`.
+We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year.
+
+**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name.
+Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux.
+However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of
+IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`).
+
+## Background: History Of Caching In Pageserver
+
+For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO.
+It performed write-back to the kernel using buffered IO.
+
+We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994).
+
+The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers.
+The `PageCache` pages are usable as owned IO buffers.
+
+We then started bypassing PageCache for user data blocks.
+Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets.
+The disk btree embedded in delta & image layers remains `PageCache`'d.
+Epics for that work were:
+- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright.
+- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks:
+  - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice)
+  - InMemoryLayer
+  - Compaction
+
+The outcome of the above:
+1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache).
+2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`.
+
+In production we size the PS `PageCache` to be 2GiB.
+This drives the hit rate up to ~99.95% and the eviction / replacement rate down to less than 200/second on a 1-minute average, on the busiest machines.
+High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS).
+The response to this is to migrate tenants away, or increase PS `PageCache` size.
+It is currently manual but could be automated, e.g., in Storage Controller.
+
+In the future, we may eliminate the `PageCache` even for indirect blocks.
+For example with an LRU cache that has as unit the entire disk btree content
+instead of individual blocks.
+
+## High-Level Design
+
+So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache.
+We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem.
+This achieves the following system properties:
+
+**Predictable VirtualFile latencies**
+* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss.
+* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure.
+* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe.
+  But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting us to some degree.
+* By switching to direct IO, above operations will have the (predictable) device latency -- always.
+  Reads and appends always go to disk.
+  And malloc will not have to write back dirty data.
+
+**Explicitness & Tangibility of resource usage**
+* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant.
+* By using direct IO, we become explicit about the resources *disk IOPs*  and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control.
+* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?").
+* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that.
+
+**CPU Efficiency**
+* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path.
+* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements.
+
+The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are:
+- read latency improvements for repeat reads of the same data ("locality of reference")
+  - asterisk: only if that state is still cache-resident by time of next access
+- write throughput by having kernel page cache batch small VFS writes into bigger disk writes
+  - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback
+
+We are **happy to make this trade-off**:
+- Because of the advantages listed above.
+- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache.
+  (At just 2GiB PS PageCache size, we average a 99.95% hit rate).
+  So, the latency of going to disk is only for data block reads, not the index traversal.
+- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance).
+  And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it.
+  (See the appendix for a more detailed explanation why this is).
+- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before.
+
+### Desired End State
+
+The desired end state of the project is as follows, and with some asterisks, we have achieved it.
+
+All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache.
+
+In particular, the "data path" includes
+- the wal ingest path
+- compaction
+- anything on the `Timeline::get` / `Timeline::get_vectored` path.
+
+The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache.
+Hit rate target is 99.95%.
+
+There are no regressions to ingest latency.
+
+The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`.
+We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO.
+Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO).
+
+The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request.
+We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call.
+(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth).
+
+## Design & Implementation
+
+### Prerequisites
+
+A lot of prerequisite work had to happen to enable use of direct IO.
+
+To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path:
+- page_service level server-side batching (config field `page_service_pipelining`)
+- concurrent IO (config field `get_vectored_concurrent_io`)
+The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376).
+Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799).
+The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`.
+The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC.
+
+For the write path, and especially WAL ingest, we need to hide write latency.
+We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled
+buffer happen in a sidecar tokio task while new writes fill a new buffer.
+We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`.
+The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558).
+
+### Ensuring Adherence to Alignment Requirements
+
+Direct IO puts requirements on
+- memory buffer alignment
+- io size (=memory buffer size)
+- file offset alignment
+
+The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!).
+
+In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe).
+Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple.
+We made this decision because:
+- a) it is compatible with all the environments we need to run in
+- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that need to be read are far apart)
+- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower).
+- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO.
+
+This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD).
+
+The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements.
+All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits.
+Implementors of the marker traits are:
+- `IoBuffer` / `IoBufferMut`: used for most reads and writes
+- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!)
+
+The alignment requirement is infectious; it permeates bottom-up throughout the code base.
+We stop the infection at roughly the same layers in the code base where we stopped permeating the
+use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing
+a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap.
+The places where we currently stop permeating are sort of arbitrary. For example, it would probably
+make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s.
+
+The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors:
+- non-adherence to file offset alignment requirements
+- non-adherence to io size requirements
+
+The following higher-level constructs ensure we meet the requirements:
+- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples.
+- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment.
+
+Note that these types are used always, regardless of whether direct IO is enabled or not.
+There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512).
+But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO.
+
+### Configuration / Feature Flagging
+
+In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements.
+To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations.
+
+We set `O_DIRECT` based on:
+- the VirtualFile API used to create/open the VirtualFile instance
+- the `virtual_file_io_mode` configuration flag
+- the OpenOptions `read` and/or `write` flags.
+
+The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in the above list.
+Other APIs never use `O_DIRECT`.
+(The name is bad and should really be `_maybe_direct_io`.)
+
+The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path).
+At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available.
+
+The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags.
+The result is the following runtime behavior:
+
+|what|OpenOptions|`v_f_io_mode`<br/>=`buffered`|`v_f_io_mode`<br/>=`direct`|`v_f_io_mode`<br/>=`direct-rw`|
+|-|-|-|-|-|
+|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT|
+|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT|
+|`InMemoryLayer`|read + write|()|()*|O_DIRECT|
+|`DeltaLayerWriter`| write | () | () |  O_DIRECT |
+|`ImageLayerWriter`| write | () | () |  O_DIRECT |
+|`download_layer_file`|write |()|()|O_DIRECT|
+
+The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`.
+That period was when we implemented and shipped the first version of `BufferedWriter`.
+We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`.
+The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later,
+in https://github.com/neondatabase/neon/pull/11558.
+
+Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction.
+For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set.
+
+## Correctness Validation
+
+The correctness risks with this project were:
+- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation.
+  These types expose an API that is largely identical to that of the `bytes` crate and/or Vec.
+- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path.
+
+We sadly do not have infrastructure to run pageserver under `cargo miri`.
+So for memory safety issues, we relied on careful peer review.
+
+We do assert the production-like alignment requirements in testing builds.
+However, these asserts were added retroactively.
+The actual validation before rollout happened in staging and pre-prod.
+We eventually enabled  `=direct`/`=direct-rw` for Rust unit tests and the regression test suite.
+I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements.
+Evidently developer testing was good enough.
+
+## Performance Validation
+
+The read path went through a lot of iterations of benchmarking in staging and pre-prod.
+The benchmarks in those environments demonstrated performance regressions early in the implementation.
+It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions.
+
+The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns.
+
+## Future Work
+
+There is minor and major follow-up work that can be considered in the future.
+Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list.
+
+Read Path:
+- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally.
+  Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size
+  and potentially also use that to drive placement decisions of shards from StorageController
+  https://github.com/neondatabase/neon/issues/9288
+- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache.
+  But even then, an estimation of the working set would be helpful to figure out caching strategy.
+
+Write Path:
+- BlobWriter and its users could switch back to a borrowed API  https://github.com/neondatabase/neon/issues/10129
+- ... unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101
+- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692
+- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676
+
+Both:
+- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster.
+  This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts.
+  However, padding latencies at microsecond scale is non-trivial.
+
+Misc:
+- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write.
+  Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use
+  APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string`
+  are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809
+
+# Appendix
+
+## Why Kernel Page Cache Is Ineffective At Tenant High Density
+
+In the Motivation section, we stated:
+
+> - **The kernel page cache ineffective** at high tenant density anyways (#tenants/pageserver instance).
+
+The reason is that the Pageserver workload sent from Computes is whatever misses the Compute cache(s).
+That's either sequential scans or random reads.
+A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available.
+It is a complete waste to have the kernel page cache cache data blocks in this case.
+Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space.
+In such cases, the WAL records of those updates likely sit on the same delta layer block.
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages.
+When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit.
+This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching.
+We can either add a small per-connection LRU cache for such delta layer blocks.
+Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice.
+This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32).
+
+There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these
+1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation)
+2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching).
--- a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
+++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
@@ -0,0 +1,251 @@
+# Concurrent IO for Pageserver Read Path
+
+Date: May 6, 2025
+
+## Summary
+
+This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025.
+
+The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files
+_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete.
+
+Assuming good PS PageCache hit rates on the index blocks during traversal, this drives the "wait-for-disk" time
+contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
+
+The motivation for why this work had to happen when it happened was the switch of Pageserver to
+- not cache user data blocks in PS PageCache and
+- switch to use direct IO.
+More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`.
+
+### Refs
+
+- Epic: https://github.com/neondatabase/neon/issues/9378
+- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002
+- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378
+
+Design and implementation by:
+- Vlad Lazar <vlad@neon.tech>
+- Christian Schwarz <christian@neon.tech>
+
+## Background & Motivation
+
+The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps:
+- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`).
+- Pass these values to walredo to reconstruct the page images.
+
+The read path used to be single-key but has been made multi-key some time ago.
+([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link))
+However, for simplicity, most of this doc will explain things in terms of a single key being requested.
+
+The `Value` retrieval step above can be broken down into the following functions:
+- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction.
+- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk.
+  The main job here is to coalesce the small value reads into larger filesystem-level read operations.
+  This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.)
+  Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done.
+- **Perform the read IO** using `tokio-epoll-uring`.
+
+Before this project, above functions were sequentially interleaved, meaning:
+1. we would advance traversal, ...
+2. discover, that we need to read a value, ...
+3. read it from disk using `tokio-epoll-uring`, ...
+4. goto 1 unless we're done.
+
+This meant that if N `Value`s need to be read to reconstruct a page,
+the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`.
+
+## Design
+
+The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before.
+But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution.
+After the last read from the last layer is submitted, we wait for the IOs to complete.
+
+Assuming the filesystem / disk is able to actually process the submitted IOs without queuing,
+we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`.
+
+Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe.
+Traversal will stall on on-demand layer download if a layer is not yet resident.
+It cannot proceed without the layer being resident because its next step depends on the contents of the layer index.
+
+### Avoiding Waiting For IO During Traversal
+
+The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized.
+
+Before this project, traversal needed to perform IOs for the following:
+1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks.
+2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key,
+   to determine whether the `Value::will_init` the page and therefore traversal can stop for this key.
+
+The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%.
+(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
+
+The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately
+already encode this bit of information since the introduction of the current storage/layer format.
+
+### Concurrent IOs, Submission & Completion
+
+To separate IO submission from waiting for its completion,
+we introduce the notion of an `IoConcurrency` struct through which IOs are issued.
+
+An IO is an opaque future that
+- captures the `tx` side of a `oneshot` channel
+- performs the read IO by calling `VirtualFile::read_exact_at().await`
+- sending the result into the `tx`
+
+Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct.
+
+The traversal code that submits the IO stores the corresponding `oneshot::Receiver`
+in the `VectoredValueReconstructState`, in the place where we previously stored
+the sequentially read `img` and `records` fields.
+
+When we're done with traversal, we wait for all submitted IOs:
+for each key, there is a future that awaits all the `oneshot::Receiver`s
+for that key, and then calls into walredo to reconstruct the page image.
+Walredo is now invoked concurrently for each value instead of sequentially.
+Walredo itself remains unchanged.
+
+The spawned IO futures are driven to completion by a sidecar tokio task that
+is separate from the task that performs all the layer visiting and spawning of IOs.
+That task receives the IO futures via an unbounded mpsc channel and
+drives them to completion inside a `FuturesUnordered`.
+
+### Error handling, Panics, Cancellation-Safety
+
+There are two error classes during reconstruct data retrieval:
+* traversal errors: index lookup, move to next layer, and the like
+* value read IO errors
+
+A traversal error fails the entire `get_vectored` request, as before this PR.
+A value read error only fails reconstruction of that value.
+
+Panics and dropping of the `get_vectored` future before it completes
+leaves the sidecar task running and does not cancel submitted IOs
+(see next section for details on sidecar task lifecycle).
+All of this is safe, but, today's preference in the team is to close out
+all resource usage explicitly if possible, rather than cancelling + forgetting
+about it on drop. So, there is a warning if we drop a
+`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs.
+
+### Sidecar Task Lifecycle
+
+The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct.
+The `IoConcurrency` object acts as a handle through which IO futures are submitted.
+
+The spawned tokio task holds the `Timeline::gate` open.
+It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped.
+
+Once the `IoConcurrency` struct is dropped, no new IO futures can come in
+but already submitted IO futures will be driven to completion regardless.
+We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe.
+But the underlying kernel and hardware resources are not magically freed up by that.
+So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete.
+Under normal conditions, this should be in the low hundreds of microseconds.
+
+It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of
+tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to
+the (short-lived) functions/scope where we issue the IOs.
+We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)).
+For now, we just add another argument to the relevant code paths.
+
+### Feature Gating
+
+The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`.
+
+The behavior from before this project is available through `IoConcurrency::Sequential`,
+which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.
+
+The `get_vectored_concurrent_io` pageserver config variable determines the runtime value,
+**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object.
+
+### Alternatives Explored & Caveats Encountered
+
+A few words on the rationale behind having a sidecar *task* and what
+alternatives were considered but abandoned.
+
+#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work
+
+We explored to not have a sidecar task, and instead have a `FuturesUnordered` per
+`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the
+first time after traversal is complete (i.e., at `collect_pending_ios`).
+
+The obvious disadvantage, but not showstopper, is that we wouldn't be submitting
+IOs until traversal is complete.
+
+The showstopper however, is that deadlocks happen if we don't drive the
+IO futures to completion independently of the traversal task.
+The reason is that both the IO futures and the traversal task may hold _some_,
+_and_ try to acquire _more_, shared limited resources.
+For example, both the traversal task and IO future may try to acquire
+* a `VirtualFile` file descriptor cache slot async mutex (observed during impl)
+* a `tokio-epoll-uring` submission slot (observed during impl)
+* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future)
+
+#### Why We Don't Do `tokio::task`-per-IO-future
+
+Another option is to spawn a short-lived `tokio::task` for each IO future.
+We implemented and benchmarked it during development, but found little
+throughput improvement and moderate mean & tail latency degradation.
+Concerns about pressure on the tokio scheduler led us to abandon this variant.
+
+## Future Work
+
+In addition to what is listed here, also check the "Punted" list in the epic:
+https://github.com/neondatabase/neon/issues/9378
+
+### Enable `Timeline::get`
+
+The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`.
+The impact is that roughly the following parts of pageserver do not benefit yet:
+- parts of basebackup
+- reads performed by the ingest path
+- most internal operations that read metadata keys (e.g. `collect_keyspace`!)
+
+The solution is to propagate `IoConcurrency` via `RequestContext`:https://github.com/neondatabase/neon/issues/10460
+
+The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext).
+
+Also, propagation via `RequestContext` makes it harder to tell during development whether a given
+piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the
+place that puts the `IoConcurrency` into the `RequestContext`.
+We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some
+observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`.
+
+### Concurrent On-Demand Downloads enabled by Detached Indices
+
+As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index.
+Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695)
+we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example:
+- Move the `Layer::get_or_maybe_download().await` inside the IO futures.
+  This goes in the opposite direction of the next "future work" item below, but it's easy to do.
+- Serve the IO future directly from object storage and dispatch the layer download
+  to some other actor, e.g., an actor that is responsible for both downloads & eviction.
+
+### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion
+
+Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API
+that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission,
+and then wait for completion.
+
+The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`.
+
+A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full).
+While avoiding spending of CPU cycles on processing of completions while we're still traversing.
+
+The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing.
+So, the submission part of the split API needs to process completions if squeue is full.
+
+In any case, this split API is a precondition for the bigger issue with the design presented here,
+which we discuss in the next section.
+
+### Opaque Futures Are Brittle
+
+The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
+However, we take on **brittleness** because callers must guarantee that the submitted futures are independent.
+In our experience, it is non-trivial to identify or rule out the interdependencies.
+See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details.
+
+The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer")
+and get back a means to wait for completion.
+The subsystem can thereby reason on its own about how operations may be related;
+unlike today, where the submitted opaque future can do just about anything.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1832,6 +1832,7 @@ pub mod virtual_file {
        Eq,
        Hash,
        strum_macros::EnumString,
+        strum_macros::EnumIter,
        strum_macros::Display,
        serde_with::DeserializeFromStr,
        serde_with::SerializeDisplay,
@@ -1843,10 +1844,8 @@ pub mod virtual_file {
        /// Uses buffered IO.
        Buffered,
        /// Uses direct IO for reads only.
-        #[cfg(target_os = "linux")]
        Direct,
        /// Use direct IO for reads and writes.
-        #[cfg(target_os = "linux")]
        DirectRw,
    }

@@ -1854,26 +1853,13 @@ pub mod virtual_file {
        pub fn preferred() -> Self {
            // The default behavior when running Rust unit tests without any further
            // flags is to use the newest behavior (DirectRw).
-            // The CI uses the following environment variable to unit tests for all
-            // different modes.
+            // The CI uses the environment variable to run unit tests for all different modes.
            // NB: the Python regression & perf tests have their own defaults management
            // that writes pageserver.toml; they do not use this variable.
-            if cfg!(test) {
-                static CACHED: LazyLock<IoMode> = LazyLock::new(|| {
-                    utils::env::var_serde_json_string(
-                        "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
-                    )
-                    .unwrap_or(
-                        #[cfg(target_os = "linux")]
-                        IoMode::DirectRw,
-                        #[cfg(not(target_os = "linux"))]
-                        IoMode::Buffered,
-                    )
-                });
-                *CACHED
-            } else {
-                IoMode::Buffered
-            }
+            static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
+                utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
+            });
+            ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
        }
    }

@@ -1883,9 +1869,7 @@ pub mod virtual_file {
        fn try_from(value: u8) -> Result<Self, Self::Error> {
            Ok(match value {
                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
-                #[cfg(target_os = "linux")]
                v if v == (IoMode::Direct as u8) => IoMode::Direct,
-                #[cfg(target_os = "linux")]
                v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw,
                x => return Err(x),
            })
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -299,6 +299,7 @@ pub struct PullTimelineRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub http_hosts: Vec<String>,
+    pub ignore_tombstone: Option<bool>,
 }

 #[derive(Debug, Serialize, Deserialize)]
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -14,6 +14,7 @@ use pageserver_api::key::Key;
 use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    ];
    let exploded_parameters = {
        let mut out = Vec::new();
-        for io_mode in [
-            IoMode::Buffered,
-            #[cfg(target_os = "linux")]
-            IoMode::Direct,
-            #[cfg(target_os = "linux")]
-            IoMode::DirectRw,
-        ] {
+        for io_mode in IoMode::iter() {
            for param in expect.clone() {
                let HandPickedParameters {
                    volume_mib,
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1277,6 +1277,8 @@ impl Timeline {
            return Ok(CompactionOutcome::YieldForL0);
        }

+        let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
+
        // 2. Repartition and create image layers if necessary
        match self
            .repartition(
@@ -1287,7 +1289,7 @@ impl Timeline {
            )
            .await
        {
-            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+            Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                let image_ctx = RequestContextBuilder::from(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -1341,6 +1343,10 @@ impl Timeline {
                }
            }

+            Ok(_) => {
+                info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
+            }
+
            // Suppress errors when cancelled.
            Err(_) if self.cancel.is_cancelled() => {}
            Err(err) if err.is_cancel() => {}
@@ -3606,6 +3612,13 @@ impl Timeline {
                    last_key = Some(key);
                }
                accumulated_values.push((key, lsn, val));
+
+                if accumulated_values.len() >= 65536 {
+                    // Assume all of them are images, that would be 512MB of data in memory for a single key.
+                    return Err(CompactionError::Other(anyhow!(
+                        "too many values for a single key, giving up gc-compaction"
+                    )));
+                }
            } else {
                let last_key: &mut Key = last_key.as_mut().unwrap();
                stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -74,6 +74,8 @@ pub struct VirtualFile {

 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
+    ///
+    /// Insensitive to `virtual_file_io_mode` setting.
    pub async fn open<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
@@ -95,31 +97,20 @@ impl VirtualFile {
        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

+    /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`.
    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
        path: P,
-        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions,
+        mut open_options: OpenOptions,
        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let mode = get_io_mode();
-        let set_o_direct = match (mode, open_options.is_write()) {
+        let direct = match (mode, open_options.is_write()) {
            (IoMode::Buffered, _) => false,
-            #[cfg(target_os = "linux")]
            (IoMode::Direct, false) => true,
-            #[cfg(target_os = "linux")]
            (IoMode::Direct, true) => false,
-            #[cfg(target_os = "linux")]
            (IoMode::DirectRw, _) => true,
        };
-        if set_o_direct {
-            #[cfg(target_os = "linux")]
-            {
-                open_options = open_options.custom_flags(nix::libc::O_DIRECT);
-            }
-            #[cfg(not(target_os = "linux"))]
-            unreachable!(
-                "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined"
-            );
-        }
+        open_options = open_options.direct(direct);
        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
        Ok(VirtualFile { inner, _mode: mode })
    }
@@ -791,6 +782,12 @@ impl VirtualFileInner {
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
    {
+        self.validate_direct_io(
+            Slice::stable_ptr(&buf).addr(),
+            Slice::bytes_total(&buf),
+            offset,
+        );
+
        let file_guard = match self
            .lock_file()
            .await
@@ -816,6 +813,8 @@ impl VirtualFileInner {
        offset: u64,
        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
+        self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset);
+
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
            Err(e) => return (buf, Err(e)),
@@ -830,6 +829,64 @@ impl VirtualFileInner {
            (buf, result)
        })
    }
+
+    /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems.
+    ///
+    /// Validating it in userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use.
+    ///
+    /// This is a no-op unless the build has the `testing` feature enabled or is a test build,
+    /// and `self.open_options.is_direct()` returns true.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `addr`, `offset`, or `size` is not a multiple of 512.
+    fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) {
+        // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod.
+        if !(cfg!(feature = "testing") || cfg!(test)) {
+            return;
+        }
+        // Nothing to validate for files whose open options are not marked as direct.
+        if !self.open_options.is_direct() {
+            return;
+        }
+
+        // Validate buffer memory alignment.
+        //
+        // What practically matters as of Linux 6.1 is bdev_dma_alignment()
+        // which is practically between 512 and 4096.
+        // On our production systems, the value is 512.
+        // The IoBuffer/IoBufferMut hard-code that value.
+        //
+        // Because the allocator might return _more_ aligned addresses than requested,
+        // there is a chance that testing would not catch violations of a runtime requirement stricter than 512.
+        {
+            let requirement = 512;
+            let remainder = addr % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+
+        // Validate offset alignment.
+        //
+        // We hard-code 512 throughout the code base.
+        // So enforce just that and not anything more restrictive.
+        // Even the shallowest testing will expose more restrictive requirements if those ever arise.
+        {
+            let requirement = 512;
+            let remainder = offset % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+
+        // Validate buffer size multiple requirement.
+        //
+        // The requirement in Linux 6.1 is bdev_logical_block_size().
+        // On our production systems, that is 512.
+        {
+            let requirement = 512;
+            let remainder = size % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+    }
 }

 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
@@ -1218,7 +1275,6 @@ mod tests {
    use std::sync::Arc;

    use owned_buffers_io::io_buf_ext::IoBufExt;
-    use owned_buffers_io::slice::SliceMutExt;
    use rand::seq::SliceRandom;
    use rand::{Rng, thread_rng};

@@ -1226,162 +1282,38 @@ mod tests {
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;

-    enum MaybeVirtualFile {
-        VirtualFile(VirtualFile),
-        File(File),
-    }
-
-    impl From<VirtualFile> for MaybeVirtualFile {
-        fn from(vf: VirtualFile) -> Self {
-            MaybeVirtualFile::VirtualFile(vf)
-        }
-    }
-
-    impl MaybeVirtualFile {
-        async fn read_exact_at(
-            &self,
-            mut slice: tokio_epoll_uring::Slice<IoBufferMut>,
-            offset: u64,
-            ctx: &RequestContext,
-        ) -> Result<tokio_epoll_uring::Slice<IoBufferMut>, Error> {
-            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
-                MaybeVirtualFile::File(file) => {
-                    let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed();
-                    file.read_exact_at(rust_slice, offset).map(|()| slice)
-                }
-            }
-        }
-        async fn write_all_at<Buf: IoBufAligned + Send>(
-            &self,
-            buf: FullSlice<Buf>,
-            offset: u64,
-            ctx: &RequestContext,
-        ) -> Result<(), Error> {
-            match self {
-                MaybeVirtualFile::VirtualFile(file) => {
-                    let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
-                    res
-                }
-                MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset),
-            }
-        }
-
-        // Helper function to slurp a portion of a file into a string
-        async fn read_string_at(
-            &mut self,
-            pos: u64,
-            len: usize,
-            ctx: &RequestContext,
-        ) -> Result<String, Error> {
-            let slice = IoBufferMut::with_capacity(len).slice_full();
-            assert_eq!(slice.bytes_total(), len);
-            let slice = self.read_exact_at(slice, pos, ctx).await?;
-            let buf = slice.into_inner();
-            assert_eq!(buf.len(), len);
-
-            Ok(String::from_utf8(buf.to_vec()).unwrap())
-        }
-    }
-
    #[tokio::test]
    async fn test_virtual_files() -> anyhow::Result<()> {
-        // The real work is done in the test_files() helper function. This
-        // allows us to run the same set of tests against a native File, and
-        // VirtualFile. We trust the native Files and wouldn't need to test them,
-        // but this allows us to verify that the operations return the same
-        // results with VirtualFiles as with native Files. (Except that with
-        // native files, you will run out of file descriptors if the ulimit
-        // is low enough.)
-        struct A;
-
-        impl Adapter for A {
-            async fn open(
-                path: Utf8PathBuf,
-                opts: OpenOptions,
-                ctx: &RequestContext,
-            ) -> Result<MaybeVirtualFile, anyhow::Error> {
-                let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?;
-                Ok(MaybeVirtualFile::VirtualFile(vf))
-            }
-        }
-        test_files::<A>("virtual_files").await
-    }
-
-    #[tokio::test]
-    async fn test_physical_files() -> anyhow::Result<()> {
-        struct B;
-
-        impl Adapter for B {
-            async fn open(
-                path: Utf8PathBuf,
-                opts: OpenOptions,
-                _ctx: &RequestContext,
-            ) -> Result<MaybeVirtualFile, anyhow::Error> {
-                Ok(MaybeVirtualFile::File({
-                    let owned_fd = opts.open(path.as_std_path()).await?;
-                    File::from(owned_fd)
-                }))
-            }
-        }
-
-        test_files::<B>("physical_files").await
-    }
-
-    /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
-    /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function
-    /// in trait which benefits from the new lifetime capture rules already.
-    trait Adapter {
-        async fn open(
-            path: Utf8PathBuf,
-            opts: OpenOptions,
-            ctx: &RequestContext,
-        ) -> Result<MaybeVirtualFile, anyhow::Error>;
-    }
-
-    async fn test_files<A>(testname: &str) -> anyhow::Result<()>
-    where
-        A: Adapter,
-    {
        let ctx =
            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
-        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files");
        std::fs::create_dir_all(&testdir)?;

+        let zeropad512 = |content: &[u8]| {
+            let mut buf = IoBufferMut::with_capacity_zeroed(512);
+            buf[..content.len()].copy_from_slice(content);
+            buf.freeze().slice_len()
+        };
+
        let path_a = testdir.join("file_a");
-        let mut file_a = A::open(
+        let file_a = VirtualFile::open_with_options_v2(
            path_a.clone(),
            OpenOptions::new()
+                .read(true)
                .write(true)
+                // set create & truncate flags to ensure when we trigger a reopen later in this test,
+                // the reopen_options must have masked out those flags; if they don't, then
+                // after the reopen we will fail to read the `content_a` that we write here.
                .create(true)
-                .truncate(true)
-                .to_owned(),
+                .truncate(true),
            &ctx,
        )
        .await?;
+        let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await;
+        res?;

-        file_a
-            .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx)
-            .await?;
-
-        // cannot read from a file opened in write-only mode
-        let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err();
-
-        // Close the file and re-open for reading
-        let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?;
-
-        // cannot write to a file opened in read-only mode
-        let _ = file_a
-            .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx)
-            .await
-            .unwrap_err();
-
-        // Try simple read
-        assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?);
-
-        // Create another test file, and try FileExt functions on it.
        let path_b = testdir.join("file_b");
-        let mut file_b = A::open(
+        let file_b = VirtualFile::open_with_options_v2(
            path_b.clone(),
            OpenOptions::new()
                .read(true)
@@ -1391,37 +1323,44 @@ mod tests {
            &ctx,
        )
        .await?;
-        file_b
-            .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
-            .await?;
-        file_b
-            .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
-            .await?;
+        let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await;
+        res?;

-        assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
+        let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| {
+            let buf = vfile
+                .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx)
+                .await
+                .unwrap();
+            assert_eq!(&buf[..], &zeropad512(expect)[..]);
+        };

-        // Open a lot of files, enough to cause some evictions. (Or to be precise,
-        // open the same file many times. The effect is the same.)
+        // Open a lot of file descriptors / VirtualFile instances.
+        // Enough to cause some evictions in the fd cache.

-        let mut vfiles = Vec::new();
+        let mut file_b_dupes = Vec::new();
        for _ in 0..100 {
-            let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?;
-            assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?);
-            vfiles.push(vfile);
+            let vfile = VirtualFile::open_with_options_v2(
+                path_b.clone(),
+                OpenOptions::new().read(true),
+                &ctx,
+            )
+            .await?;
+            assert_first_512_eq(&vfile, b"content_b").await;
+            file_b_dupes.push(vfile);
        }

        // make sure we opened enough files to definitely cause evictions.
-        assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
+        assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2);

        // The underlying file descriptor for 'file_a' should be closed now. Try to read
-        // from it again.
-        assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?);
+        // from it again. The VirtualFile reopens the file internally.
+        assert_first_512_eq(&file_a, b"content_a").await;

        // Check that all the other FDs still work too. Use them in random order for
        // good measure.
-        vfiles.as_mut_slice().shuffle(&mut thread_rng());
-        for vfile in vfiles.iter_mut() {
-            assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
+        file_b_dupes.as_mut_slice().shuffle(&mut thread_rng());
+        for vfile in file_b_dupes.iter_mut() {
+            assert_first_512_eq(vfile, b"content_b").await;
        }

        Ok(())
@@ -1452,7 +1391,7 @@ mod tests {
        // Open the file many times.
        let mut files = Vec::new();
        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFileInner::open_with_options(
+            let f = VirtualFile::open_with_options_v2(
                &test_file_path,
                OpenOptions::new().read(true),
                &ctx,
@@ -1497,8 +1436,6 @@ mod tests {

    #[tokio::test]
    async fn test_atomic_overwrite_basic() {
-        let ctx =
-            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
        std::fs::create_dir_all(&testdir).unwrap();

@@ -1508,26 +1445,22 @@ mod tests {
        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+
+        let post = std::fs::read_to_string(&path).unwrap();
        assert_eq!(post, "foo");
        assert!(!tmp_path.exists());
-        drop(file);

        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+
+        let post = std::fs::read_to_string(&path).unwrap();
        assert_eq!(post, "bar");
        assert!(!tmp_path.exists());
-        drop(file);
    }

    #[tokio::test]
    async fn test_atomic_overwrite_preexisting_tmp() {
-        let ctx =
-            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir =
            crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
        std::fs::create_dir_all(&testdir).unwrap();
@@ -1542,10 +1475,8 @@ mod tests {
            .await
            .unwrap();

-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+        let post = std::fs::read_to_string(&path).unwrap();
        assert_eq!(post, "foo");
        assert!(!tmp_path.exists());
-        drop(file);
    }
 }
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -8,7 +8,13 @@ use super::io_engine::IoEngine;

 #[derive(Debug, Clone)]
 pub struct OpenOptions {
+    /// We keep a copy of the write() flag we pass to the `inner` `OpenOptions`
+    /// to support [`Self::is_write`].
    write: bool,
+    /// We don't expose + pass through a raw `custom_flags()` style API.
+    /// The only custom flag we support is `O_DIRECT`, which we track here
+    /// and map to `custom_flags()` in the [`Self::open`] method.
+    direct: bool,
    inner: Inner,
 }
 #[derive(Debug, Clone)]
@@ -30,6 +36,7 @@ impl Default for OpenOptions {
        };
        Self {
            write: false,
+            direct: false,
            inner,
        }
    }
@@ -44,6 +51,10 @@ impl OpenOptions {
        self.write
    }

+    pub(super) fn is_direct(&self) -> bool {
+        self.direct
+    }
+
    pub fn read(mut self, read: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
@@ -116,13 +127,38 @@ impl OpenOptions {
    }

    pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
-        match &self.inner {
-            Inner::StdFs(x) => x.open(path).map(|file| file.into()),
+        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
+        let mut custom_flags = 0;
+        if self.direct {
            #[cfg(target_os = "linux")]
-            Inner::TokioEpollUring(x) => {
+            {
+                custom_flags |= nix::libc::O_DIRECT;
+            }
+            #[cfg(not(target_os = "linux"))]
+            {
+                // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!).
+                // Just don't set the flag; to catch alignment bugs typical for O_DIRECT,
+                // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`.
+                static WARNING: std::sync::Once = std::sync::Once::new();
+                WARNING.call_once(|| {
+                    let span = tracing::info_span!(parent: None, "open_options");
+                    let _enter = span.enter();
+                    tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process");
+                });
+            }
+        }
+
+        match self.inner.clone() {
+            Inner::StdFs(mut x) => x
+                .custom_flags(custom_flags)
+                .open(path)
+                .map(|file| file.into()),
+            #[cfg(target_os = "linux")]
+            Inner::TokioEpollUring(mut x) => {
+                x.custom_flags(custom_flags);
                let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
                let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async {
-                    let res = system.open(path, x).await;
+                    let res = system.open(path, &x).await;
                    ((), res)
                })
                .await;
@@ -144,19 +180,8 @@ impl OpenOptions {
        self
    }

-    pub fn custom_flags(mut self, flags: i32) -> Self {
-        if flags & nix::libc::O_APPEND != 0 {
-            super::io_engine::panic_operation_must_be_idempotent();
-        }
-        match &mut self.inner {
-            Inner::StdFs(x) => {
-                let _ = x.custom_flags(flags);
-            }
-            #[cfg(target_os = "linux")]
-            Inner::TokioEpollUring(x) => {
-                let _ = x.custom_flags(flags);
-            }
-        }
+    pub fn direct(mut self, direct: bool) -> Self {
+        self.direct = direct;
        self
    }
 }
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -425,15 +425,12 @@ compact_prefetch_buffers(void)
 * point inside and outside PostgreSQL.
 *
 * This still does throw errors when it receives malformed responses from PS.
- *
- * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
- * IsHandlingInterrupts) we also report we've ended prefetch receive work,
- * just in case state tracking was lost due to an error in the sync getPage
- * response code.
 */
 void
-communicator_prefetch_pump_state(bool IsHandlingInterrupts)
+communicator_prefetch_pump_state(void)
 {
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive != MyPState->ring_flush)
 	{
 		NeonResponse   *response;
@@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts)
 		}
 	}

-	/* We never pump the prefetch state while handling other pages */
-	if (!IsHandlingInterrupts)
-		END_PREFETCH_RECEIVE_WORK();
+	END_PREFETCH_RECEIVE_WORK();

 	communicator_reconfigure_timeout_if_needed();
 }
@@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index)

 	Assert(MyPState->ring_unused > ring_index);

+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive <= ring_index)
 	{
-		START_PREFETCH_RECEIVE_WORK();
 		entry = GetPrfSlot(MyPState->ring_receive);

 		Assert(entry->status == PRFS_REQUESTED);
@@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index)
 			result = false;
 			break;
 		}
-
-		END_PREFETCH_RECEIVE_WORK();
 		CHECK_FOR_INTERRUPTS();
 	}
+
 	if (result)
 	{
 		/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
 		PrefetchRequest *slot = GetPrfSlot(ring_index);
-		return slot->status == PRFS_RECEIVED;
+		result = slot->status == PRFS_RECEIVED;
 	}
-	return false;
+	END_PREFETCH_RECEIVE_WORK();
+
+	return result;
 ;
 }

@@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);
+	Assert(readpage_reentrant_guard);

 	if (slot->status != PRFS_REQUESTED ||
 		slot->response != NULL ||
@@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
 	PrfHashEntry *entry;
 	PrefetchRequest hashkey;

+	Assert(readpage_reentrant_guard);
 	hashkey.buftag = tag;
 	entry = prfh_lookup(MyPState->prf_hash, &hashkey);
 	if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
@@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag)
 void
 prefetch_on_ps_disconnect(void)
 {
+	bool save_readpage_reentrant_guard = readpage_reentrant_guard;
 	MyPState->ring_flush = MyPState->ring_unused;

+	/* Prohibit calling of prefetch_pump_state */
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void)
 		MyNeonCounters->getpage_prefetch_discards_total += 1;
 	}

+	/* Restore guard */
+	readpage_reentrant_guard = save_readpage_reentrant_guard;
+
 	/*
 	 * We can have gone into retry due to network error, so update stats with
 	 * the latest available
@@ -2509,7 +2515,7 @@ communicator_processinterrupts(void)
 	if (timeout_signaled)
 	{
 		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
-			communicator_prefetch_pump_state(true);
+			communicator_prefetch_pump_state();

 		timeout_signaled = false;
 		communicator_reconfigure_timeout_if_needed();
--- a/pgxn/neon/communicator.h
+++ b/pgxn/neon/communicator.h
@@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
 										  void *buffer);

 extern void communicator_reconfigure_timeout_if_needed(void);
-extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+extern void communicator_prefetch_pump_state(void);


 #endif
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel)

 		now = GetCurrentTimestamp();
 		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
-		shard->last_reconnect_time = now;

 		/*
 		 * Make sure we don't do exponential backoff with a constant multiplier
@@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		/*
 		 * If we did other tasks between reconnect attempts, then we won't
 		 * need to wait as long as a full delay.
+		 *
+		 * This is a loop to protect against interrupted sleeps.
 		 */
-		if (us_since_last_attempt < shard->delay_us)
+		while (us_since_last_attempt < shard->delay_us)
 		{
 			pg_usleep(shard->delay_us - us_since_last_attempt);
+
+			/* At least we should handle cancellations here */
+			CHECK_FOR_INTERRUPTS();
+
+			now = GetCurrentTimestamp();
+			us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
 		}

 		/* update the delay metric */
 		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
+		shard->last_reconnect_time = now;

 		/*
 		 * Connect using the connection string we got from the
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	return false;
 }
@@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

 	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	return false;
 }
@@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);

@@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
@@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo

 	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

 	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -468,12 +468,15 @@ pub async fn handle_request(
    assert!(status.tenant_id == request.tenant_id);
    assert!(status.timeline_id == request.timeline_id);

+    let check_tombstone = !request.ignore_tombstone.unwrap_or_default();
+
    match pull_timeline(
        status,
        safekeeper_host,
        sk_auth_token,
        http_client,
        global_timelines,
+        check_tombstone,
    )
    .await
    {
@@ -499,6 +502,7 @@ async fn pull_timeline(
    sk_auth_token: Option<SecretString>,
    http_client: reqwest::Client,
    global_timelines: Arc<GlobalTimelines>,
+    check_tombstone: bool,
 ) -> Result<PullTimelineResponse> {
    let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
    info!(
@@ -570,7 +574,7 @@ async fn pull_timeline(

    // Finally, load the timeline.
    let _tli = global_timelines
-        .load_temp_timeline(ttid, &tli_dir_path, false)
+        .load_temp_timeline(ttid, &tli_dir_path, check_tombstone)
        .await?;

    Ok(PullTimelineResponse {
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3886,10 +3886,10 @@ impl Service {

            None
        } else if safekeepers {
-            // Note that we do not support creating the timeline on the safekeepers
-            // for imported timelines. The `start_lsn` of the timeline is not known
-            // until the import finshes.
-            // https://github.com/neondatabase/neon/issues/11569
+            // Note that for imported timelines, we do not create the timeline on the safekeepers
+            // straight away. Instead, we do it once the import is finalized such that we know what
+            // start LSN to provide for the safekeepers. This is done in
+            // [`Self::finalize_timeline_import`].
            let res = self
                .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
                .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id))
@@ -3966,11 +3966,22 @@ impl Service {
                let active = self.timeline_active_on_all_shards(&import).await?;

                match active {
-                    true => {
+                    Some(timeline_info) => {
                        tracing::info!("Timeline became active on all shards");
+
+                        if self.config.timelines_onto_safekeepers {
+                            // Now that we know the start LSN of this timeline, create it on the
+                            // safekeepers.
+                            self.tenant_timeline_create_safekeepers_until_success(
+                                import.tenant_id,
+                                timeline_info,
+                            )
+                            .await?;
+                        }
+
                        break;
                    }
-                    false => {
+                    None => {
                        tracing::info!("Timeline not active on all shards yet");

                        tokio::select! {
@@ -4004,9 +4015,6 @@ impl Service {
            .range_mut(TenantShardId::tenant_range(import.tenant_id))
            .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);

-        // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn,
-        // so we can't create the timeline on the safekeepers. Fix by moving creation here.
-        // https://github.com/neondatabase/neon/issues/11569
        tracing::info!(%import_failed, "Timeline import complete");

        Ok(())
@@ -4021,10 +4029,16 @@ impl Service {
        .await;
    }

+    /// If the timeline is active on all shards, returns the [`TimelineInfo`]
+    /// collected from shard 0.
+    ///
+    /// An error is returned if the shard layout has changed during the import.
+    /// This is guarded against within the storage controller and the pageserver,
+    /// and, therefore, unexpected.
    async fn timeline_active_on_all_shards(
        self: &Arc<Self>,
        import: &TimelineImport,
-    ) -> anyhow::Result<bool> {
+    ) -> anyhow::Result<Option<TimelineInfo>> {
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -4048,13 +4062,17 @@ impl Service {
                        .expect("Pageservers may not be deleted while referenced");
                    targets.push((*tenant_shard_id, node.clone()));
                } else {
-                    return Ok(false);
+                    return Ok(None);
                }
            }

            targets
        };

+        if targets.is_empty() {
+            anyhow::bail!("No shards found to finalize import for");
+        }
+
        let results = self
            .tenant_for_shards_api(
                targets,
@@ -4070,10 +4088,17 @@ impl Service {
            )
            .await;

-        Ok(results.into_iter().all(|res| match res {
+        let all_active = results.iter().all(|res| match res {
            Ok(info) => info.state == TimelineState::Active,
            Err(_) => false,
-        }))
+        });
+
+        if all_active {
+            // Both unwraps are validated above
+            Ok(Some(results.into_iter().next().unwrap().unwrap()))
+        } else {
+            Ok(None)
+        }
    }

    pub(crate) async fn tenant_timeline_archival_config(
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -1,4 +1,9 @@
-use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap,
+    str::FromStr,
+    sync::{Arc, atomic::AtomicU64},
+    time::Duration,
+};

 use clashmap::{ClashMap, Entry};
 use safekeeper_api::models::PullTimelineRequest;
@@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest {
    pub(crate) kind: SafekeeperTimelineOpKind,
 }

+/// A way to keep ongoing/queued reconcile requests apart
+#[derive(Copy, Clone, PartialEq, Eq)]
+struct TokenId(u64);
+
+type OngoingTokens = ClashMap<(TenantId, Option<TimelineId>), (CancellationToken, TokenId)>;
+
 /// Handle to per safekeeper reconciler.
 struct ReconcilerHandle {
-    tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
-    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
+    tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>,
+    ongoing_tokens: Arc<OngoingTokens>,
+    token_id_counter: AtomicU64,
    cancel: CancellationToken,
 }

@@ -185,24 +197,28 @@ impl ReconcilerHandle {
        &self,
        tenant_id: TenantId,
        timeline_id: Option<TimelineId>,
-    ) -> CancellationToken {
+    ) -> (CancellationToken, TokenId) {
+        let token_id = self
+            .token_id_counter
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let token_id = TokenId(token_id);
        let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
        if let Entry::Occupied(entry) = &entry {
-            let cancel: &CancellationToken = entry.get();
+            let (cancel, _) = entry.get();
            cancel.cancel();
        }
-        entry.insert(self.cancel.child_token()).clone()
+        entry.insert((self.cancel.child_token(), token_id)).clone()
    }
    /// Cancel an ongoing reconciliation
    fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
-        if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
+        if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
            cancel.cancel();
        }
    }
    fn schedule_reconcile(&self, req: ScheduleRequest) {
-        let cancel = self.new_token_slot(req.tenant_id, req.timeline_id);
+        let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
        let hostname = req.safekeeper.skp.host.clone();
-        if let Err(err) = self.tx.send((req, cancel)) {
+        if let Err(err) = self.tx.send((req, cancel, token_id)) {
            tracing::info!("scheduling request onto {hostname} returned error: {err}");
        }
    }
@@ -211,13 +227,14 @@ impl ReconcilerHandle {
 pub(crate) struct SafekeeperReconciler {
    inner: SafekeeperReconcilerInner,
    concurrency_limiter: Arc<Semaphore>,
-    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
+    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>,
    cancel: CancellationToken,
 }

 /// Thin wrapper over `Service` to not clutter its inherent functions
 #[derive(Clone)]
 struct SafekeeperReconcilerInner {
+    ongoing_tokens: Arc<OngoingTokens>,
    service: Arc<Service>,
 }

@@ -226,15 +243,20 @@ impl SafekeeperReconciler {
        // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
        let (tx, rx) = mpsc::unbounded_channel();
        let concurrency = service.config.safekeeper_reconciler_concurrency;
+        let ongoing_tokens = Arc::new(ClashMap::new());
        let mut reconciler = SafekeeperReconciler {
-            inner: SafekeeperReconcilerInner { service },
+            inner: SafekeeperReconcilerInner {
+                service,
+                ongoing_tokens: ongoing_tokens.clone(),
+            },
            rx,
            concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
            cancel: cancel.clone(),
        };
        let handle = ReconcilerHandle {
            tx,
-            ongoing_tokens: Arc::new(ClashMap::new()),
+            ongoing_tokens,
+            token_id_counter: AtomicU64::new(0),
            cancel,
        };
        tokio::spawn(async move { reconciler.run().await });
@@ -246,7 +268,9 @@ impl SafekeeperReconciler {
                req = self.rx.recv() => req,
                _ = self.cancel.cancelled() => break,
            };
-            let Some((req, req_cancel)) = req else { break };
+            let Some((req, req_cancel, req_token_id)) = req else {
+                break;
+            };

            let permit_res = tokio::select! {
                req = self.concurrency_limiter.clone().acquire_owned() => req,
@@ -265,7 +289,7 @@ impl SafekeeperReconciler {
                let timeline_id = req.timeline_id;
                let node_id = req.safekeeper.skp.id;
                inner
-                    .reconcile_one(req, req_cancel)
+                    .reconcile_one(req, req_cancel, req_token_id)
                    .instrument(tracing::info_span!(
                        "reconcile_one",
                        ?kind,
@@ -280,8 +304,14 @@ impl SafekeeperReconciler {
 }

 impl SafekeeperReconcilerInner {
-    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
+    async fn reconcile_one(
+        &self,
+        req: ScheduleRequest,
+        req_cancel: CancellationToken,
+        req_token_id: TokenId,
+    ) {
        let req_host = req.safekeeper.skp.host.clone();
+        let success;
        match req.kind {
            SafekeeperTimelineOpKind::Pull => {
                let Some(timeline_id) = req.timeline_id else {
@@ -301,20 +331,24 @@ impl SafekeeperReconcilerInner {
                    http_hosts,
                    tenant_id: req.tenant_id,
                    timeline_id,
+                    ignore_tombstone: Some(false),
                };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.pull_timeline(&pull_req).await,
-                    |resp| {
-                        if let Some(host) = resp.safekeeper_host {
-                            tracing::info!("pulled timeline from {host} onto {req_host}");
-                        } else {
-                            tracing::info!("timeline already present on safekeeper on {req_host}");
-                        }
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.pull_timeline(&pull_req).await,
+                        |resp| {
+                            if let Some(host) = resp.safekeeper_host {
+                                tracing::info!("pulled timeline from {host} onto {req_host}");
+                            } else {
+                                tracing::info!(
+                                    "timeline already present on safekeeper on {req_host}"
+                                );
+                            }
+                        },
+                        req_cancel,
+                    )
+                    .await;
            }
            SafekeeperTimelineOpKind::Exclude => {
                // TODO actually exclude instead of delete here
@@ -325,22 +359,23 @@ impl SafekeeperReconcilerInner {
                    );
                    return;
                };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.delete_timeline(tenant_id, timeline_id).await,
-                    |_resp| {
-                        tracing::info!("deleted timeline from {req_host}");
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.delete_timeline(tenant_id, timeline_id).await,
+                        |_resp| {
+                            tracing::info!("deleted timeline from {req_host}");
+                        },
+                        req_cancel,
+                    )
+                    .await;
            }
            SafekeeperTimelineOpKind::Delete => {
                let tenant_id = req.tenant_id;
                if let Some(timeline_id) = req.timeline_id {
-                    let deleted = self
+                    success = self
                        .reconcile_inner(
-                            req,
+                            &req,
                            async |client| client.delete_timeline(tenant_id, timeline_id).await,
                            |_resp| {
                                tracing::info!("deleted timeline from {req_host}");
@@ -348,13 +383,13 @@ impl SafekeeperReconcilerInner {
                            req_cancel,
                        )
                        .await;
-                    if deleted {
+                    if success {
                        self.delete_timeline_from_db(tenant_id, timeline_id).await;
                    }
                } else {
-                    let deleted = self
+                    success = self
                        .reconcile_inner(
-                            req,
+                            &req,
                            async |client| client.delete_tenant(tenant_id).await,
                            |_resp| {
                                tracing::info!(%tenant_id, "deleted tenant from {req_host}");
@@ -362,12 +397,21 @@ impl SafekeeperReconcilerInner {
                            req_cancel,
                        )
                        .await;
-                    if deleted {
+                    if success {
                        self.delete_tenant_timelines_from_db(tenant_id).await;
                    }
                }
            }
        }
+        if success {
+            self.ongoing_tokens.remove_if(
+                &(req.tenant_id, req.timeline_id),
+                |_ttid, (_cancel, token_id)| {
+                    // Ensure that this request is indeed the request we just finished and not a new one
+                    req_token_id == *token_id
+                },
+            );
+        }
    }
    async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
        match self
@@ -421,10 +465,10 @@ impl SafekeeperReconcilerInner {
            self.delete_timeline_from_db(tenant_id, timeline_id).await;
        }
    }
-    /// Returns whether the reconciliation happened successfully
+    /// Returns whether the reconciliation happened successfully (or we got cancelled)
    async fn reconcile_inner<T, F, U>(
        &self,
-        req: ScheduleRequest,
+        req: &ScheduleRequest,
        closure: impl Fn(SafekeeperClient) -> F,
        log_success: impl FnOnce(T) -> U,
        req_cancel: CancellationToken,
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -323,6 +323,42 @@ impl Service {
        })
    }

+    pub(crate) async fn tenant_timeline_create_safekeepers_until_success(
+        self: &Arc<Self>,
+        tenant_id: TenantId,
+        timeline_info: TimelineInfo,
+    ) -> anyhow::Result<()> {
+        const BACKOFF: Duration = Duration::from_secs(5);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                anyhow::bail!("Shut down requested while finalizing import");
+            }
+
+            let res = self
+                .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
+                .await;
+
+            match res {
+                Ok(_) => {
+                    tracing::info!("Timeline created on safekeepers");
+                    break;
+                }
+                Err(err) => {
+                    tracing::error!("Failed to create timeline on safekeepers: {err}");
+                    tokio::select! {
+                        _ = self.cancel.cancelled() => {
+                            anyhow::bail!("Shut down requested while finalizing import");
+                        },
+                        _ = tokio::time::sleep(BACKOFF) => {}
+                    };
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    /// Directly insert the timeline into the database without reconciling it with safekeepers.
    ///
    /// Useful if the timeline already exists on the specified safekeepers,
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -355,6 +355,7 @@ pub(crate) async fn list_timeline_blobs(
    match res {
        ListTimelineBlobsResult::Ready(data) => Ok(data),
        ListTimelineBlobsResult::MissingIndexPart(_) => {
+            tracing::warn!("listing raced with removal of an index, retrying");
            // Retry if listing raced with removal of an index
            let data = list_timeline_blobs_impl(remote_client, id, root_target)
                .await?
@@ -441,7 +442,7 @@ async fn list_timeline_blobs_impl(
    }

    if index_part_keys.is_empty() && s3_layers.is_empty() {
-        tracing::debug!("Timeline is empty: expected post-deletion state.");
+        tracing::info!("Timeline is empty: expected post-deletion state.");
        if initdb_archive {
            tracing::info!("Timeline is post deletion but initdb archive is still present.");
        }
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -593,6 +593,7 @@ async fn gc_timeline(
            index_part_snapshot_time: _,
        } => (index_part, *index_part_generation, data.unused_index_keys),
        BlobDataParseResult::Relic => {
+            tracing::info!("Skipping timeline {ttid}, it is a relic");
            // Post-deletion tenant location: don't try and GC it.
            return Ok(summary);
        }
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli):
        endpoint_id: str,
        safekeepers_generation: int | None = None,
        safekeepers: list[int] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        allow_multiple: bool = False,
        create_test_user: bool = False,
@@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli):
        extra_env_vars = env or {}
        if basebackup_request_tries is not None:
            extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
-        if remote_ext_config is not None:
-            args.extend(["--remote-ext-config", remote_ext_config])
+        if remote_ext_base_url is not None:
+            args.extend(["--remote-ext-base-url", remote_ext_base_url])

        if safekeepers_generation is not None:
            args.extend(["--safekeepers-generation", str(safekeepers_generation)])
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1274,6 +1274,8 @@ class NeonEnv:

            if self.pageserver_virtual_file_io_engine is not None:
                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+            if self.pageserver_virtual_file_io_mode is not None:
+                ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
            if config.pageserver_default_tenant_config_compaction_algorithm is not None:
                tenant_config = ps_cfg.setdefault("tenant_config", {})
                tenant_config["compaction_algorithm"] = (
@@ -4195,7 +4197,7 @@ class Endpoint(PgProtocol, LogUtils):

    def start(
        self,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        safekeeper_generation: int | None = None,
        safekeepers: list[int] | None = None,
@@ -4221,7 +4223,7 @@ class Endpoint(PgProtocol, LogUtils):
            self.endpoint_id,
            safekeepers_generation=safekeeper_generation,
            safekeepers=self.active_safekeepers,
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
            create_test_user=create_test_user,
@@ -4436,7 +4438,7 @@ class Endpoint(PgProtocol, LogUtils):
        hot_standby: bool = False,
        lsn: Lsn | None = None,
        config_lines: list[str] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        allow_multiple: bool = False,
        basebackup_request_tries: int | None = None,
@@ -4455,7 +4457,7 @@ class Endpoint(PgProtocol, LogUtils):
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
        ).start(
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
            basebackup_request_tries=basebackup_request_tries,
@@ -4539,7 +4541,7 @@ class EndpointFactory:
        lsn: Lsn | None = None,
        hot_standby: bool = False,
        config_lines: list[str] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        basebackup_request_tries: int | None = None,
    ) -> Endpoint:
@@ -4559,7 +4561,7 @@ class EndpointFactory:
            hot_standby=hot_standby,
            config_lines=config_lines,
            lsn=lsn,
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
            pageserver_id=pageserver_id,
            basebackup_request_tries=basebackup_request_tries,
        )
@@ -4613,7 +4615,10 @@ class EndpointFactory:
        return self

    def new_replica(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
    ):
        branch_name = origin.branch_name
        assert origin in self.endpoints
@@ -4629,7 +4634,10 @@ class EndpointFactory:
        )

    def new_replica_start(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
    ):
        branch_name = origin.branch_name
        assert origin in self.endpoints
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    ".*stalling layer flushes for compaction backpressure.*",
    ".*layer roll waiting for flush due to compaction backpressure.*",
    ".*BatchSpanProcessor.*",
+    *(
+        [
+            r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"
+        ]
+        if sys.platform != "linux"
+        else []
+    ),
 )


--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -221,7 +221,7 @@ def test_remote_extensions(

    endpoint.create_remote_extension_spec(spec)

-    endpoint.start(remote_ext_config=extensions_endpoint)
+    endpoint.start(remote_ext_base_url=extensions_endpoint)

    with endpoint.connect() as conn:
        with conn.cursor() as cur:
@@ -249,7 +249,7 @@ def test_remote_extensions(
    # Remove the extension files to force a redownload of the extension.
    extension.remove(test_output_dir, pg_version)

-    endpoint.start(remote_ext_config=extensions_endpoint)
+    endpoint.start(remote_ext_base_url=extensions_endpoint)

    # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions.
    with endpoint.connect() as conn:
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -24,6 +24,7 @@ from fixtures.utils import (
    skip_in_debug_build,
    wait_until,
 )
+from fixtures.workload import Workload
 from mypy_boto3_kms import KMSClient
 from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
 from mypy_boto3_s3 import S3Client
@@ -97,6 +98,10 @@ def test_pgdata_import_smoke(
        f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
    )

+    if neon_env_builder.storage_controller_config is None:
+        neon_env_builder.storage_controller_config = {}
+    neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True
+
    env = neon_env_builder.init_start()

    # The test needs LocalFs support, which is only built in testing mode.
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke(
    #
    # validate that we can write
    #
-    rw_endpoint = env.endpoints.create_start(
-        branch_name=import_branch_name,
-        endpoint_id="rw",
-        tenant_id=tenant_id,
-        config_lines=ep_config,
-    )
-    rw_endpoint.safe_psql("create table othertable(values text)")
-    rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
+    workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name)
+    workload.init()
+    workload.write_rows(64)
+    workload.validate()

-    # TODO: consider using `class Workload` here
-    # to do compaction and whatnot?
+    rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()"))

    #
    # validate that we can branch (important use case)
    #

    # ... at the tip
-    _ = env.create_branch(
+    child_timeline_id = env.create_branch(
        new_branch_name="br-tip",
        ancestor_branch_name=import_branch_name,
        tenant_id=tenant_id,
        ancestor_start_lsn=rw_lsn,
    )
-    br_tip_endpoint = env.endpoints.create_start(
-        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
-    )
-    validate_vanilla_equivalence(br_tip_endpoint)
-    br_tip_endpoint.safe_psql("select * from othertable")
+    child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
+    child_workload.validate()
+
+    validate_vanilla_equivalence(child_workload.endpoint())

    # ... at the initdb lsn
    _ = env.create_branch(
@@ -330,7 +329,7 @@ def test_pgdata_import_smoke(
    )
    validate_vanilla_equivalence(br_initdb_endpoint)
    with pytest.raises(psycopg2.errors.UndefinedTable):
-        br_initdb_endpoint.safe_psql("select * from othertable")
+        br_initdb_endpoint.safe_psql(f"select * from {workload.table}")


@run_only_on_default_postgres(reason="PG version is irrelevant here")
--- a/test_runner/regress/test_role_grants.py
+++ b/test_runner/regress/test_role_grants.py
@@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv):
        res = cur.fetchall()

        assert res == [(1,)], "select should not succeed"
+
+    # confirm that replicas can also ensure the grants are correctly set.
+    replica = env.endpoints.new_replica_start(endpoint)
+    replica_client = replica.http_client()
+    replica_client.set_role_grants(
+        "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
+    )
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -5,7 +5,7 @@
  ],
  "v16": [
    "16.8",
-    "05ddf212e2e07b788b5c8b88bdcf98630941f6ae"
+    "d72d76f2cdee4194dd052ce099e9784aca7c794a"
  ],
  "v15": [
    "15.12",