Revert "work around copy_to_user failure on reads"

Not necessary as of 271b19bd3e Reason why that commit fixes the issues is https://neondb.slack.com/archives/C08SXUSNFBP/p1747837985950309 > implemented a fix in the kernel module, no need for userspace to > pre-fault mmapped pages in; the reason for the EFAULT was that we were > holding a spinlock during copy_to_user, which disables preemption, which > in turn makes the page fault handler give up and cause copy_to_user to > fail; commit to be used is 271b19bd3e2de7777770ac6b8b1b1c94bb33830b > (edited) This reverts commit 7b818f8d64.
demote log levels to avoid flooding the logs
2026-05-16 12:40:36 +00:00 · 2025-05-23 12:11:01 +02:00 · 2025-05-23 12:10:53 +02:00 · 2025-05-21 16:33:52 +03:00 · 2025-05-21 01:33:58 +03:00 · 2025-05-21 01:33:43 +03:00
27 changed files with 722 additions and 90 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4434,6 +4434,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.3",
+ "tonic",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
+    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -15,6 +15,10 @@ commands:
    user: root
    sysvInitAction: sysinit
    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
+  - name: chmod-clockcache_dev
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 777 /dev/clockcache_dev' # FIXME: not very secure
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -213,8 +213,10 @@ impl Escaping for PgIdent {

        // Find the first suitable tag that is not present in the string.
        // Postgres' max role/DB name length is 63 bytes, so even in the
-        // worst case it won't take long.
-        while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
+        // worst case it won't take long. Outer tag is always `tag + "x"`,
+        // so if `tag` is not present in the string, `outer_tag` is not
+        // present in the string either.
+        while self.contains(&tag.to_string()) {
            tag += "x";
            outer_tag = tag.clone() + "x";
        }
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -71,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
            ("name$$$", ("$x$name$$$$x$", "xx")),
            ("name$$$$", ("$x$name$$$$$x$", "xx")),
            ("name$x$", ("$xx$name$x$$xx$", "xxx")),
+            ("x", ("$xx$x$xx$", "xxx")),
+            ("xx", ("$xxx$xx$xxx$", "xxxx")),
+            ("$x", ("$xx$$x$xx$", "xxx")),
+            ("x$", ("$xx$x$$xx$", "xxx")),
+            ("$x$", ("$xx$$x$$xx$", "xxx")),
+            ("xx$", ("$xxx$xx$$xxx$", "xxxx")),
+            ("$xx", ("$xxx$$xx$xxx$", "xxxx")),
+            ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
        ];

        for (input, expected) in test_cases {
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        if var(REAL_S3_ENV).is_ok() {
            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
        }
+
+        #[cfg(target_os = "linux")]
        assert!(body.contains("process_threads"));
    }

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -235,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
    ScatteredLsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,13 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Generates Rust code from .proto Protobuf schemas, along with a binary file
+/// descriptor set for Protobuf schema reflection.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
+    tonic_build::configure()
+        .bytes(["."])
+        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
+        .compile_protos(&["proto/page_service.proto"], &["proto"])
+        .map_err(|err| err.into())
+}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,233 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// The service can be accessed via e.g. grpcurl:
+//
+//    ```
+//    grpcurl \
+//      -plaintext \
+//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
+//      -H "neon-shard-id: 0b10" \
+//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
+//      -H "authorization: Bearer $JWT" \
+//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
+//      localhost:51051 page_api.PageService/CheckRelExists
+//    ```
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_api;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+    uint32 spc_oid = 1;
+    uint32 db_oid = 2;
+    uint32 rel_number = 3;
+    uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currenly rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,19 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * proto: auto-generated Protobuf types for gRPC.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated by protobuf.
+pub mod proto {
+    tonic::include_proto!("page_api");
+
+    /// File descriptor set for Protobuf schema reflection. This allows using
+    /// e.g. grpcurl with the API.
+    pub const FILE_DESCRIPTOR_SET: &[u8] =
+        tonic::include_file_descriptor_set!("page_api_descriptor");
+
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -144,7 +144,7 @@ where
        replica,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf,
+            timeline.conf.get_vectored_concurrent_io,
            timeline
                .gate
                .enter()
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3199,7 +3199,7 @@ async fn list_aux_files(
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
-        state.conf,
+        state.conf.get_vectored_concurrent_io,
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
-    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
@@ -331,10 +331,10 @@ async fn page_service_conn_main(
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(
-        conf,
        tenant_manager,
        auth,
        pipelining_config,
+        conf.get_vectored_concurrent_io,
        perf_span_fields,
        connection_ctx,
        cancel.clone(),
@@ -371,7 +371,6 @@ async fn page_service_conn_main(
 }

 struct PageServerHandler {
-    conf: &'static PageServerConf,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -389,6 +388,7 @@ struct PageServerHandler {
    timeline_handles: Option<TimelineHandles>,

    pipelining_config: PageServicePipeliningConfig,
+    get_vectored_concurrent_io: GetVectoredConcurrentIo,

    gate_guard: GateGuard,
 }
@@ -844,17 +844,16 @@ impl BatchedFeMessage {
 impl PageServerHandler {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
-        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
    ) -> Self {
        PageServerHandler {
-            conf,
            auth,
            claims: None,
            connection_ctx,
@@ -862,6 +861,7 @@ impl PageServerHandler {
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
+            get_vectored_concurrent_io,
            gate_guard,
        }
    }
@@ -1623,7 +1623,7 @@ impl PageServerHandler {
        }

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.get_vectored_concurrent_io,
            match self.gate_guard.try_clone() {
                Ok(guard) => guard,
                Err(_) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -586,7 +586,7 @@ impl Timeline {
        // scan directory listing (new), merge with the old results
        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -645,7 +645,7 @@ impl Timeline {
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -885,7 +885,7 @@ impl Timeline {
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| PageReconstructError::Cancelled)?,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -8596,8 +8596,10 @@ mod tests {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let io_concurrency =
-            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            tline.conf.get_vectored_concurrent_io,
+            tline.gate.enter().unwrap(),
+        );
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -31,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer;
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
+use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
@@ -43,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
-use crate::config::PageServerConf;
 use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
@@ -318,11 +318,10 @@ impl IoConcurrency {
    }

    pub(crate) fn spawn_from_conf(
-        conf: &'static PageServerConf,
+        conf: GetVectoredConcurrentIo,
        gate_guard: GateGuard,
    ) -> IoConcurrency {
-        use pageserver_api::config::GetVectoredConcurrentIo;
-        let selected = match conf.get_vectored_concurrent_io {
+        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3530,7 +3530,7 @@ impl Timeline {
                };

                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    self_ref.conf,
+                    self_ref.conf.get_vectored_concurrent_io,
                    self_ref
                        .gate
                        .enter()
@@ -5559,7 +5559,7 @@ impl Timeline {
            });

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
    );
    let io_concurrency = IoConcurrency::spawn_from_conf(
-        detached.conf,
+        detached.conf.get_vectored_concurrent_io,
        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
    );
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -12,6 +12,7 @@
 #include "postgres.h"

 #include <sys/file.h>
+#include <sys/ioctl.h>
 #include <unistd.h>
 #include <fcntl.h>

@@ -52,6 +53,10 @@
 #include "pagestore_client.h"
 #include "communicator.h"

+/* For the kernel module */
+#include "neon_pagecache.h"
+#define CLOCKCACHE_DEV_PATH "/dev/clockcache_dev"
+
 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

 /*
@@ -159,6 +164,13 @@ typedef struct FileCacheControl
 	uint64		time_write;		/* time spent writing (us) */
 	uint64		resizes;        /* number of LFC resizes   */
 	uint64		evicted_pages;	/* number of evicted pages */
+
+	/* FIXME: should make these atomic, they're not protected by any locks */
+	uint64		kernel_module_read_hits;	/* success returns from read ioctl */
+	uint64		kernel_module_read_misses;	/* ENOENT returns from read ioctl */
+	uint64		kernel_module_write_hits;	/* success returns from write ioctl */
+	uint64		kernel_module_write_misses;	/* ENOMEM returns from write ioctl */
+
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
@@ -183,6 +195,7 @@ typedef struct FileCacheControl
 static HTAB *lfc_hash;
 static int	lfc_desc = -1;
 static LWLockId lfc_lock;
+
 static int	lfc_max_size;
 static int	lfc_size_limit;
 static int	lfc_prewarm_limit;
@@ -190,6 +203,8 @@ static int	lfc_prewarm_batch;
 static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
 static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
 static char *lfc_path;
+static bool lfc_use_kernel_module;
+
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
 static bool lfc_do_prewarm;
@@ -203,6 +218,9 @@ bool lfc_prewarm_update_ws_estimation;

 #define LFC_ENABLED() (lfc_ctl->limit != 0)

+static int pread_with_ioctl(void *buffer, uint64 blkno);
+static int pwrite_with_ioctl(const void *buffer, uint64 blkno);
+
 /*
 * Close LFC file if opened.
 * All backends should close their LFC files once LFC is disabled.
@@ -251,14 +269,19 @@ lfc_switch_off(void)
 		/*
 		 * We need to use unlink to to avoid races in LFC write, because it is not
 		 * protected by lock
+		 *
+		 * FIXME: how to clean up the kernel module device on trouble?
 		 */
-		unlink(lfc_path);
+		if (!lfc_use_kernel_module)
+		{
+			unlink(lfc_path);

-		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
-		if (fd < 0)
-			elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
-		else
-			close(fd);
+			fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
+			if (fd < 0)
+				elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
+			else
+				close(fd);
+		}

 		/* Wakeup waiting backends */
 		for (int i = 0; i < N_COND_VARS; i++)
@@ -270,7 +293,8 @@ lfc_switch_off(void)
 static void
 lfc_disable(char const *op)
 {
-	elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
+	elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache",
+		 op, lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
 	lfc_switch_off();
@@ -301,7 +325,9 @@ lfc_ensure_opened(void)
 	/* Open cache file if not done yet */
 	if (lfc_desc < 0)
 	{
-		lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
+		lfc_desc = BasicOpenFile(
+			lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path,
+			O_RDWR);

 		if (lfc_desc < 0)
 		{
@@ -351,10 +377,16 @@ lfc_shmem_startup(void)
 		initSHLL(&lfc_ctl->wss_estimation);

 		/* Recreate file cache on restart */
-		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
+		if (lfc_use_kernel_module)
+			fd = BasicOpenFile(CLOCKCACHE_DEV_PATH, O_RDWR);
+		else
+			fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
 		{
-			elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
+			if (lfc_use_kernel_module)
+				elog(WARNING, "LFC: failed to open " CLOCKCACHE_DEV_PATH ": %m");
+			else
+				elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
 			lfc_ctl->limit = 0;
 		}
 		else
@@ -613,6 +645,15 @@ lfc_init(void)
 							NULL,
 							NULL);

+	DefineCustomBoolVariable("neon.use_kernel_module",
+							 "Use neon_pagecache kernel module instead of a regular file (EXPERIMENTAL)",
+							 NULL,
+							 &lfc_use_kernel_module,
+							 true,
+							 PGC_POSTMASTER,
+							 0,
+							 NULL, NULL, NULL);
+
 	if (lfc_max_size == 0)
 		return;

@@ -1297,27 +1338,57 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			/* offset of first IOV */
 			first_read_offset += chunk_offs + first_block_in_chunk_read;

-			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
-
 			/* Read only the blocks we're interested in, limiting */
-			rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
-						nwrite, first_read_offset * BLCKSZ);
-			pgstat_report_wait_end();
-
-			if (rc != (BLCKSZ * nwrite))
+			if (lfc_use_kernel_module)
 			{
-				lfc_disable("read");
-				return -1;
+				pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
+				for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+				{
+					if (!BITMAP_ISSET(chunk_mask, i))
+						continue;
+					Assert(iov[i].iov_len == BLCKSZ);
+					rc = pread_with_ioctl(iov[i].iov_base, first_read_offset + i - first_block_in_chunk_read);
+					if (rc < 0 && errno == ENOENT)
+					{
+						/* The kernel module evicted the page */
+						elog(DEBUG1, "kernel module had evicted block");
+					}
+					else if (rc < 0)
+					{
+						pgstat_report_wait_end();
+						lfc_disable("ioctl read");
+						return -1;
+					}
+					else
+					{
+						/* success! */
+						BITMAP_SET(mask, buf_offset + i);
+					}
+				}
+				pgstat_report_wait_end();
 			}
-
-			/*
-			 * We successfully read the pages we know were valid when we
-			 * started reading; now mark those pages as read
-			 */
-			for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+			else
 			{
-				if (BITMAP_ISSET(chunk_mask, i))
-					BITMAP_SET(mask, buf_offset + i);
+				pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
+				rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
+							nwrite, first_read_offset * BLCKSZ);
+				pgstat_report_wait_end();
+
+				if (rc != (BLCKSZ * nwrite))
+				{
+					lfc_disable("read");
+					return -1;
+				}
+
+				/*
+				 * We successfully read the pages we know were valid when we
+				 * started reading; now mark those pages as read
+				 */
+				for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+				{
+					if (BITMAP_ISSET(chunk_mask, i))
+						BITMAP_SET(mask, buf_offset + i);
+				}
 			}
 		}

@@ -1364,6 +1435,65 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return blocks_read;
 }

+static int
+pread_with_ioctl(void *buffer, uint64 blkno)
+{
+	struct neon_rw_args args = {
+		.key = {
+			.key_hi = 0,
+			.key_lo = blkno
+		},
+		.offset = 0,
+		.length = POSTGRES_PAGE_SIZE,
+		.buffer = (__u64)(uintptr_t) buffer
+	};
+	int			rc;
+
+	errno = 0;
+
+	elog(LOG, "calling ioctl read for blk %lu with buffer=%p (shared_buffers is at %p-%p)",
+		 blkno,
+		 buffer,
+		 BufferBlocks,
+		 BufferBlocks + BLCKSZ * NBuffers);
+	rc = ioctl(lfc_desc, NEON_IOCTL_READ, &args);
+	if (rc >= 0)
+		lfc_ctl->kernel_module_read_hits++;
+	else if (rc < 0 && errno == ENOENT)
+		lfc_ctl->kernel_module_read_misses++;
+	else
+		elog(LOG, "ioctl read failed for blk %lu with buffer=%p: %m", blkno, buffer);
+	return rc;
+}
+
+static int
+pwrite_with_ioctl(const void *buffer, uint64 blkno)
+{
+	struct neon_rw_args args = {
+		.key = {
+			.key_hi = 0,
+			.key_lo = blkno
+		},
+		.offset = 0,
+		.length = POSTGRES_PAGE_SIZE,
+		.buffer = (__u64)(uintptr_t) buffer
+	};
+	int			rc;
+
+	elog(LOG, "calling ioctl write for blk %lu with buffer=%p (shared_buffers is at %p-%p)",
+		 blkno,
+		 buffer,
+		 BufferBlocks,
+		 BufferBlocks + BLCKSZ * NBuffers);
+
+	rc = ioctl(lfc_desc, NEON_IOCTL_WRITE, &args);
+	if (rc >= 0)
+		lfc_ctl->kernel_module_write_hits++;
+	else if (rc < 0 && errno == ENOMEM)
+		lfc_ctl->kernel_module_write_misses++;
+	return rc;
+}
+
 /*
 * Initialize new LFC hash entry, perform eviction if needed.
 * Returns false if there are no unpinned entries and chunk can not be added.
@@ -1484,7 +1614,6 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
-	ssize_t		rc;
 	bool		found;
 	uint32		hash;
 	uint64		generation;
@@ -1493,6 +1622,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	ConditionVariable* cv;
 	FileCacheBlockState state;
 	XLogRecPtr lwlsn;
+	bool		success;

 	int		chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);

@@ -1571,16 +1701,60 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,

 	LWLockRelease(lfc_lock);

-	pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
-	INSTR_TIME_SET_CURRENT(io_start);
-	rc = pwrite(lfc_desc, buffer, BLCKSZ,
-				((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
-	INSTR_TIME_SET_CURRENT(io_end);
-	pgstat_report_wait_end();
-
-	if (rc != BLCKSZ)
+	if (lfc_use_kernel_module)
 	{
-		lfc_disable("write");
+		int			rc;
+
+		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
+		INSTR_TIME_SET_CURRENT(io_start);
+		rc = pwrite_with_ioctl(buffer,
+							   entry_offset * lfc_blocks_per_chunk + chunk_offs);
+		INSTR_TIME_SET_CURRENT(io_end);
+		pgstat_report_wait_end();
+
+		if (rc < 0 && errno == ENOMEM)
+		{
+			/*
+			 * Write was wasted.
+			 *
+			 * FIXME: We could mark the page in the chunk as UNAVAILABLE,
+			 * since we know it was not actually present in the kernel
+			 * cache. Any subsequent read on it will inevitably fail with
+			 * ENOENT. That's not a correctness issue however, assuming that
+			 * the call never returns ENOMEM when the old version of the page
+			 * is still in the cache.
+			 */
+			success = true;
+		}
+		else if (rc < 0)
+		{
+			success = false;
+		}
+		else
+		{
+			/* successful write */
+			success = true;
+		}
+	}
+	else
+	{
+		ssize_t rc;
+
+		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
+		INSTR_TIME_SET_CURRENT(io_start);
+
+		rc = pwrite(lfc_desc, buffer, BLCKSZ,
+					((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
+
+		INSTR_TIME_SET_CURRENT(io_end);
+		pgstat_report_wait_end();
+
+		success = (rc == BLCKSZ);
+	}
+
+	if (!success)
+	{
+		lfc_disable(lfc_use_kernel_module ? "write ioctl" : "write");
 	}
 	else
 	{
@@ -1756,19 +1930,60 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		}
 		LWLockRelease(lfc_lock);

-		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
-		INSTR_TIME_SET_CURRENT(io_start);
-		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
-					 ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
-		INSTR_TIME_SET_CURRENT(io_end);
-		pgstat_report_wait_end();
-
-		if (rc != BLCKSZ * blocks_in_chunk)
+		/* Perform the write */
+		if (lfc_use_kernel_module)
 		{
-			lfc_disable("write");
-			return;
+			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
+			INSTR_TIME_SET_CURRENT(io_start);
+			for (int i = 0; i < blocks_in_chunk; i++)
+			{
+				int			rc;
+
+				rc = pwrite_with_ioctl(
+					iov[i].iov_base,
+					entry_offset * lfc_blocks_per_chunk + chunk_offs
+					);
+				if (rc < 0 && errno == ENOMEM)
+				{
+					/*
+					 * Write was wasted.
+					 *
+					 * FIXME: We could mark the page in the chunk as UNAVAILABLE,
+					 * since we know it was not actually present in the kernel
+					 * cache. Any subsequent read on it will inevitably fail with
+					 * ENOENT. That's not a correctness issue however, assuming that
+					 * the call never returns ENOMEM when the old version of the page
+					 * is still in the cache.
+					 */
+				}
+				else if (rc < 0)
+				{
+					/* other error, not expected */
+					INSTR_TIME_SET_CURRENT(io_end);
+					pgstat_report_wait_end();
+					lfc_disable("write ioctl");
+					return;
+				}
+			}
+			INSTR_TIME_SET_CURRENT(io_end);
+			pgstat_report_wait_end();
 		}
 		else
+		{
+			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
+			INSTR_TIME_SET_CURRENT(io_start);
+			rc = pwritev(lfc_desc, iov, blocks_in_chunk,
+						 ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
+			INSTR_TIME_SET_CURRENT(io_end);
+			pgstat_report_wait_end();
+			if (rc != BLCKSZ * blocks_in_chunk)
+			{
+				lfc_disable("write");
+				return;
+			}
+		}
+
+		/* success */
 		{
 			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

@@ -1922,6 +2137,26 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->pinned;
 			break;
+		case 10:
+			key = "file_cache_kernel_module_read_hits";
+			if (lfc_ctl)
+				value = lfc_ctl->kernel_module_read_hits;
+			break;
+		case 11:
+			key = "file_cache_kernel_module_read_misses";
+			if (lfc_ctl)
+				value = lfc_ctl->kernel_module_read_misses;
+			break;
+		case 12:
+			key = "file_cache_kernel_module_write_hits";
+			if (lfc_ctl)
+				value = lfc_ctl->kernel_module_write_hits;
+			break;
+		case 13:
+			key = "file_cache_kernel_module_write_misses";
+			if (lfc_ctl)
+				value = lfc_ctl->kernel_module_write_misses;
+			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
--- a/pgxn/neon/neon_pagecache.h
+++ b/pgxn/neon/neon_pagecache.h
@@ -0,0 +1,35 @@
+/*
+ * This is for the special ioctl in the neon_pagecache kernel module.
+ *
+ * DO NOT MODIFY! This header must agree with what the kernel module was
+ * compiled with!
+ */
+
+#ifndef NEON_PAGECACHE_H
+#define NEON_PAGECACHE_H
+
+#include <linux/types.h>
+
+
+#define POSTGRES_PAGE_SIZE 8192   // 8 KiB
+
+struct neon_key {
+    __u64 key_hi;     // Upper 64 bits of 128-bit key
+    __u64 key_lo;     // Lower 64 bits of 128-bit key
+};
+
+struct neon_rw_args {
+    struct neon_key key;
+    __u32 offset;     // Offset within page (0-8191)
+    __u32 length;     // Length to read/write
+    __u64 buffer;     // User buffer address
+};
+
+#define NEON_IOC_MAGIC 'N'
+
+#define NEON_IOCTL_READ    _IOWR(NEON_IOC_MAGIC, 1, struct neon_rw_args)
+#define NEON_IOCTL_WRITE   _IOWR(NEON_IOC_MAGIC, 2, struct neon_rw_args)
+
+
+
+#endif /* NEON_PAGECACHE_H */
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -161,8 +161,11 @@ struct ProxyCliArgs {
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
-    #[clap(long, default_value = "1024")]
+    #[clap(long, default_value_t = 1024)]
    cancellation_ch_size: usize,
+    /// Cancellation ops batch size for redis
+    #[clap(long, default_value_t = 8)]
+    cancellation_batch_size: usize,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
@@ -542,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+                    handle_cancel_messages(
+                        &mut redis_kv_client,
+                        rx_cancel,
+                        args.cancellation_batch_size,
+                    )
+                    .await?;

                    drop(redis_kv_client);

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
 type IpSubnetKey = IpNet;

 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
-const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
-const BATCH_SIZE: usize = 8;

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -231,12 +229,13 @@ impl CancelReplyOp {
 pub async fn handle_cancel_messages(
    client: &mut RedisKVClient,
    mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::with_capacity(BATCH_SIZE);
-    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);

    loop {
-        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
            warn!("shutting down cancellation queue");
            break Ok(());
        }
@@ -367,8 +366,7 @@ impl CancellationHandler {
            return Err(CancelError::InternalError);
        };

-        tx.send_timeout(op, REDIS_SEND_TIMEOUT)
-            .await
+        tx.try_send(op)
            .map_err(|e| {
                tracing::warn!("failed to send GetCancelData for {key}: {e}");
            })
@@ -570,7 +568,7 @@ impl Session {
    }

    // Send the store key op to the cancellation handler and set TTL for the key
-    pub(crate) async fn write_cancel_key(
+    pub(crate) fn write_cancel_key(
        &self,
        cancel_closure: CancelClosure,
    ) -> Result<(), CancelError> {
@@ -596,14 +594,14 @@ impl Session {
            expire: CANCEL_KEY_TTL,
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
        });
        Ok(())
    }

-    pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
@@ -619,7 +617,7 @@ impl Session {
                .guard(RedisMsgKind::HDel),
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
        });
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -244,9 +244,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -383,9 +383,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
        }

-        drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error

        res
    }
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -19,6 +19,16 @@ TEST_ROLE_NAMES = [
    {"name": "role$"},
    {"name": "role$$"},
    {"name": "role$x$"},
+    {"name": "x"},
+    {"name": "xx"},
+    {"name": "$x"},
+    {"name": "x$"},
+    {"name": "$x$"},
+    {"name": "xx$"},
+    {"name": "$xx"},
+    {"name": "$xx$"},
+    # 63 bytes is the limit for role/DB names in Postgres
+    {"name": "x" * 63},
 ]

 TEST_DB_NAMES = [
@@ -74,6 +84,43 @@ TEST_DB_NAMES = [
        "name": "db name$x$",
        "owner": "role$x$",
    },
+    {
+        "name": "x",
+        "owner": "x",
+    },
+    {
+        "name": "xx",
+        "owner": "xx",
+    },
+    {
+        "name": "$x",
+        "owner": "$x",
+    },
+    {
+        "name": "x$",
+        "owner": "x$",
+    },
+    {
+        "name": "$x$",
+        "owner": "$x$",
+    },
+    {
+        "name": "xx$",
+        "owner": "xx$",
+    },
+    {
+        "name": "$xx",
+        "owner": "$xx",
+    },
+    {
+        "name": "$xx$",
+        "owner": "$xx$",
+    },
+    # 63 bytes is the limit for role/DB names in Postgres
+    {
+        "name": "x" * 63,
+        "owner": "x" * 63,
+    },
 ]


@@ -146,6 +193,10 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
    """
    Test that compute_ctl can create and work with databases and roles
    with special characters (whitespaces, %, tabs, etc.) in the name.
+    Also use `drop_subscriptions_before_start: true`. We do not actually
+    have any subscriptions in this test, so it should be no-op, but it
+    i) simulates the case when we create a second dev branch together with
+    a new project creation, and ii) just generally stresses more code paths.
    """
    env = neon_simple_env

@@ -159,6 +210,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
+                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": TEST_ROLE_NAMES,
                    "databases": TEST_DB_NAMES,
@@ -202,6 +254,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
+                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": [],
                    "databases": [],
Author	SHA1	Message	Date
Christian Schwarz	7b5e3d6d40	Revert "work around copy_to_user failure on reads" Not necessary as of `271b19bd3e` Reason why that commit fixes the issues is https://neondb.slack.com/archives/C08SXUSNFBP/p1747837985950309 > implemented a fix in the kernel module, no need for userspace to > pre-fault mmapped pages in; the reason for the EFAULT was that we were > holding a spinlock during copy_to_user, which disables preemption, which > in turn makes the page fault handler give up and cause copy_to_user to > fail; commit to be used is 271b19bd3e2de7777770ac6b8b1b1c94bb33830b > (edited) This reverts commit `7b818f8d64`.	2025-05-23 12:11:01 +02:00
Christian Schwarz	e086568e21	demote log levels to avoid flooding the logs	2025-05-23 12:10:53 +02:00
Heikki Linnakangas	7b818f8d64	work around copy_to_user failure on reads	2025-05-21 16:33:52 +03:00
Heikki Linnakangas	14fefd261f	add separate counters for ioctl read misses, and some debugging LOG messages	2025-05-21 01:33:58 +03:00
Heikki Linnakangas	01abd4afc5	fix address argument again	2025-05-21 01:33:43 +03:00
Heikki Linnakangas	c8541ad29f	fix bogus pointer	2025-05-21 00:20:56 +03:00
Heikki Linnakangas	eaad1db9f0	Add bespoken metrics for kernel module cache misses	2025-05-21 00:20:39 +03:00
Heikki Linnakangas	6ddcf68829	use correct request code for writes	2025-05-20 23:53:56 +03:00
Heikki Linnakangas	d701f8285c	hack permissions on /dev/clockcache_dev to allow access to everyone	2025-05-20 23:52:58 +03:00
Heikki Linnakangas	77082a0f63	Implement using the kernel module Enabled when you set "lfc_use_kernel_module=on" in postgresql.conf XXX: This compiles, but is 100% untested	2025-05-20 17:45:40 +03:00
Alexey Kondratov	e94acbc816	fix(compute_ctl): Dollar escaping and tests (#11969 ) ## Problem In the escaping path we were checking that `${tag}$` or `${outer_tag}$` are present in the string, but that's not enough, as original string surrounded by `$` can also form a 'tag', like `$x$xx$x$`, which is fine on it's own, but cannot be used in the string escaped with `$xx$`. ## Summary of changes Remove `$` from the checks, just check if `{tag}` or `{outer_tag}` are present. Add more test cases and change the catalog test to stress the `drop_subscriptions_before_start: true` path as well. Fixes https://github.com/neondatabase/cloud/issues/29198	2025-05-20 09:03:36 +00:00
Erik Grinaker	f4150614d0	pageserver: don't pass config to `PageHandler` (#11973 ) ## Problem The gRPC page service API will require decoupling the `PageHandler` from the libpq protocol implementation. As preparation for this, avoid passing in the entire server config to `PageHandler`, and instead explicitly pass in the relevant fields. Touches https://github.com/neondatabase/neon/issues/11728. ## Summary of changes * Change `PageHandler` to take a `GetVectoredConcurrentIo` instead of the entire config. * Change `IoConcurrency::spawn_from_conf` to take a `GetVectoredConcurrentIo`.	2025-05-19 15:47:40 +00:00
Erik Grinaker	38dbc5f67f	pageserver/page_api: add binary Protobuf descriptor (#11968 ) ## Problem A binary Protobuf schema descriptor can be used to expose an API reflection service, which in turn allows convenient usage of e.g. `grpcurl` against the gRPC server. Touches #11728. ## Summary of changes * Generate a binary schema descriptor as `pageserver_page_api::proto::FILE_DESCRIPTOR_SET`. * Opportunistically rename the Protobuf package from `page_service` to `page_api`.	2025-05-19 11:17:45 +00:00
Folke Behrens	3685ad606d	endpoint_storage: Fix metrics test by excluding assertion on macos (#11952 )	2025-05-19 10:56:03 +00:00
Ivan Efremov	76a7d37f7e	proxy: Drop cancellation ops if they don't fit into the queue (#11950 ) Add a redis ops batch size argument for proxy and remove timeouts by using try_send()	2025-05-19 10:10:55 +00:00
Erik Grinaker	cdb6479c8a	pageserver: add gRPC page service schema (#11815 ) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. * Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging.	2025-05-19 09:03:06 +00:00