rewrite with custom json serializer

remove locking from extract, use `RefCell` instead
remove `lasso` from the JSON logger, use the field index for lookup
2026-02-09 05:30:37 +00:00 · 2025-05-18 13:41:43 +02:00 · 2025-05-17 22:14:26 +02:00 · 2025-05-17 22:14:26 +02:00 · 2025-05-17 22:14:26 +02:00 · 2025-05-17 22:14:26 +02:00
70 changed files with 1285 additions and 2010 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3898,16 +3898,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "nu-ansi-term"
-version = "0.46.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
-dependencies = [
- "overload",
- "winapi",
-]
-
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -4192,12 +4182,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

-[[package]]
-name = "overload"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
-
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -4302,7 +4286,6 @@ dependencies = [
 "enumset",
 "fail",
 "futures",
- "hashlink",
 "hex",
 "hex-literal",
 "http-utils",
@@ -4451,16 +4434,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "pageserver_page_api"
-version = "0.1.0"
-dependencies = [
- "prost 0.13.3",
- "tonic",
- "tonic-build",
- "workspace_hack",
-]
-
 [[package]]
 name = "papaya"
 version = "0.2.1"
@@ -5232,6 +5205,7 @@ dependencies = [
 "rustls 0.23.18",
 "rustls-native-certs 0.8.0",
 "rustls-pemfile 2.1.1",
+ "ryu",
 "scopeguard",
 "serde",
 "serde_json",
@@ -5255,7 +5229,6 @@ dependencies = [
 "tracing-log",
 "tracing-opentelemetry",
 "tracing-subscriber",
- "tracing-test",
 "tracing-utils",
 "try-lock",
 "typed-json",
@@ -7706,7 +7679,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
 "matchers",
- "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -7720,27 +7692,6 @@ dependencies = [
 "tracing-serde",
 ]

-[[package]]
-name = "tracing-test"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
-dependencies = [
- "tracing-core",
- "tracing-subscriber",
- "tracing-test-macro",
-]
-
-[[package]]
-name = "tracing-test-macro"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
-dependencies = [
- "quote",
- "syn 2.0.100",
-]
-
 [[package]]
 name = "tracing-utils"
 version = "0.1.0"
@@ -8593,7 +8544,6 @@ dependencies = [
 "tracing",
 "tracing-core",
 "tracing-log",
- "tracing-subscriber",
 "url",
 "uuid",
 "zeroize",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,6 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
-    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -253,7 +252,6 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
-pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
 			 RelationGetRelationName(index));
 
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(index));
+	smgr_start_unlogged_build(index->rd_smgr);
 +#endif
 +
 	initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
 	rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
 
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
 +#endif
 +
 	/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
 	}
 
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(RelationGetSmgr(index));
+	smgr_end_unlogged_build(index->rd_smgr);
 +#endif
 +
 	/*
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -213,10 +213,8 @@ impl Escaping for PgIdent {

        // Find the first suitable tag that is not present in the string.
        // Postgres' max role/DB name length is 63 bytes, so even in the
-        // worst case it won't take long. Outer tag is always `tag + "x"`,
-        // so if `tag` is not present in the string, `outer_tag` is not
-        // present in the string either.
-        while self.contains(&tag.to_string()) {
+        // worst case it won't take long.
+        while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
            tag += "x";
            outer_tag = tag.clone() + "x";
        }
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -71,14 +71,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
            ("name$$$", ("$x$name$$$$x$", "xx")),
            ("name$$$$", ("$x$name$$$$$x$", "xx")),
            ("name$x$", ("$xx$name$x$$xx$", "xxx")),
-            ("x", ("$xx$x$xx$", "xxx")),
-            ("xx", ("$xxx$xx$xxx$", "xxxx")),
-            ("$x", ("$xx$$x$xx$", "xxx")),
-            ("x$", ("$xx$x$$xx$", "xxx")),
-            ("$x$", ("$xx$$x$$xx$", "xxx")),
-            ("xx$", ("$xxx$xx$$xxx$", "xxxx")),
-            ("$xx", ("$xxx$$xx$xxx$", "xxxx")),
-            ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
        ];

        for (input, expected) in test_cases {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -546,11 +546,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Falied to parse 'sampling_ratio'")?,
-            relsize_snapshot_cache_capacity: settings
-                .remove("relsize snapshot cache capacity")
-                .map(|x| x.parse::<usize>())
-                .transpose()
-                .context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,8 +462,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        if var(REAL_S3_ENV).is_ok() {
            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
        }
-
-        #[cfg(target_os = "linux")]
        assert!(body.contains("process_threads"));
    }

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -235,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
    ScatteredLsn,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
@@ -491,8 +491,6 @@ pub struct TenantConfigToml {
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,
-    /// Capacity of relsize snapshot cache (used by replicas).
-    pub relsize_snapshot_cache_capacity: usize,
 }

 pub mod defaults {
@@ -732,7 +730,6 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
-    pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
 }

 impl Default for TenantConfigToml {
@@ -790,7 +787,6 @@ impl Default for TenantConfigToml {
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
            sampling_ratio: None,
-            relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
        }
    }
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -630,8 +630,6 @@ pub struct TenantConfigPatch {
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub sampling_ratio: FieldPatch<Option<Ratio>>,
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
 }

 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -761,9 +759,6 @@ pub struct TenantConfig {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_ratio: Option<Option<Ratio>>,
-
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub relsize_snapshot_cache_capacity: Option<usize>,
 }

 impl TenantConfig {
@@ -809,7 +804,6 @@ impl TenantConfig {
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
-            mut relsize_snapshot_cache_capacity,
        } = self;

        patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -911,9 +905,6 @@ impl TenantConfig {
            .gc_compaction_ratio_percent
            .apply(&mut gc_compaction_ratio_percent);
        patch.sampling_ratio.apply(&mut sampling_ratio);
-        patch
-            .relsize_snapshot_cache_capacity
-            .apply(&mut relsize_snapshot_cache_capacity);

        Ok(Self {
            checkpoint_distance,
@@ -953,7 +944,6 @@ impl TenantConfig {
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
-            relsize_snapshot_cache_capacity,
        })
    }

@@ -1062,9 +1052,6 @@ impl TenantConfig {
                .gc_compaction_ratio_percent
                .unwrap_or(global_conf.gc_compaction_ratio_percent),
            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
-            relsize_snapshot_cache_capacity: self
-                .relsize_snapshot_cache_capacity
-                .unwrap_or(global_conf.relsize_snapshot_cache_capacity),
        }
    }
 }
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -1,12 +1,14 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::net::IpAddr;
+use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::time::Duration;

 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{TryStreamExt, future, ready};
+use parking_lot::Mutex;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use serde::{Deserialize, Serialize};
@@ -14,6 +16,7 @@ use tokio::sync::mpsc;

 use crate::codec::{BackendMessages, FrontendMessage};
 use crate::config::{Host, SslMode};
+use crate::connection::{Request, RequestMessages};
 use crate::query::RowStream;
 use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, Type};
@@ -23,43 +26,19 @@ use crate::{
 };

 pub struct Responses {
-    /// new messages from conn
    receiver: mpsc::Receiver<BackendMessages>,
-    /// current batch of messages
    cur: BackendMessages,
-    /// number of total queries sent.
-    waiting: usize,
-    /// number of ReadyForQuery messages received.
-    received: usize,
 }

 impl Responses {
    pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
        loop {
-            // get the next saved message
-            if let Some(message) = self.cur.next().map_err(Error::parse)? {
-                let received = self.received;
-
-                // increase the query head if this is the last message.
-                if let Message::ReadyForQuery(_) = message {
-                    self.received += 1;
-                }
-
-                // check if the client has skipped this query.
-                if received + 1 < self.waiting {
-                    // grab the next message.
-                    continue;
-                }
-
-                // convenience: turn the error messaage into a proper error.
-                let res = match message {
-                    Message::ErrorResponse(body) => Err(Error::db(body)),
-                    message => Ok(message),
-                };
-                return Poll::Ready(res);
+            match self.cur.next().map_err(Error::parse)? {
+                Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))),
+                Some(message) => return Poll::Ready(Ok(message)),
+                None => {}
            }

-            // get the next back of messages.
            match ready!(self.receiver.poll_recv(cx)) {
                Some(messages) => self.cur = messages,
                None => return Poll::Ready(Err(Error::closed())),
@@ -86,28 +65,33 @@ pub(crate) struct CachedTypeInfo {
 }

 pub struct InnerClient {
-    sender: mpsc::UnboundedSender<FrontendMessage>,
-    responses: Responses,
+    sender: mpsc::UnboundedSender<Request>,

    /// A buffer to use when writing out postgres commands.
-    buffer: BytesMut,
+    buffer: Mutex<BytesMut>,
 }

 impl InnerClient {
-    pub fn send(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> {
-        self.sender.send(messages).map_err(|_| Error::closed())?;
-        self.responses.waiting += 1;
-        Ok(&mut self.responses)
+    pub fn send(&self, messages: RequestMessages) -> Result<Responses, Error> {
+        let (sender, receiver) = mpsc::channel(1);
+        let request = Request { messages, sender };
+        self.sender.send(request).map_err(|_| Error::closed())?;
+
+        Ok(Responses {
+            receiver,
+            cur: BackendMessages::empty(),
+        })
    }

    /// Call the given function with a buffer to be used when writing out
    /// postgres commands.
-    pub fn with_buf<F, R>(&mut self, f: F) -> R
+    pub fn with_buf<F, R>(&self, f: F) -> R
    where
        F: FnOnce(&mut BytesMut) -> R,
    {
-        let r = f(&mut self.buffer);
-        self.buffer.clear();
+        let mut buffer = self.buffer.lock();
+        let r = f(&mut buffer);
+        buffer.clear();
        r
    }
 }
@@ -125,7 +109,7 @@ pub struct SocketConfig {
 /// The client is one half of what is returned when a connection is established. Users interact with the database
 /// through this client object.
 pub struct Client {
-    inner: InnerClient,
+    inner: Arc<InnerClient>,
    cached_typeinfo: CachedTypeInfo,

    socket_config: SocketConfig,
@@ -134,39 +118,19 @@ pub struct Client {
    secret_key: i32,
 }

-impl Drop for Client {
-    fn drop(&mut self) {
-        if let Some(stmt) = self.cached_typeinfo.typeinfo.take() {
-            let buf = self.inner.with_buf(|buf| {
-                frontend::close(b'S', stmt.name(), buf).unwrap();
-                frontend::sync(buf);
-                buf.split().freeze()
-            });
-            let _ = self.inner.send(FrontendMessage::Raw(buf));
-        }
-    }
-}
-
 impl Client {
    pub(crate) fn new(
-        sender: mpsc::UnboundedSender<FrontendMessage>,
-        receiver: mpsc::Receiver<BackendMessages>,
+        sender: mpsc::UnboundedSender<Request>,
        socket_config: SocketConfig,
        ssl_mode: SslMode,
        process_id: i32,
        secret_key: i32,
    ) -> Client {
        Client {
-            inner: InnerClient {
+            inner: Arc::new(InnerClient {
                sender,
-                responses: Responses {
-                    receiver,
-                    cur: BackendMessages::empty(),
-                    waiting: 0,
-                    received: 0,
-                },
                buffer: Default::default(),
-            },
+            }),
            cached_typeinfo: Default::default(),

            socket_config,
@@ -181,23 +145,19 @@ impl Client {
        self.process_id
    }

-    pub(crate) fn inner(&mut self) -> &mut InnerClient {
-        &mut self.inner
+    pub(crate) fn inner(&self) -> &Arc<InnerClient> {
+        &self.inner
    }

    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
    /// to save a roundtrip
-    pub async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
        I::IntoIter: ExactSizeIterator,
    {
-        query::query_txt(&mut self.inner, statement, params).await
+        query::query_txt(&self.inner, statement, params).await
    }

    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
@@ -213,14 +173,11 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn simple_query(&mut self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
+    pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
        self.simple_query_raw(query).await?.try_collect().await
    }

-    pub(crate) async fn simple_query_raw(
-        &mut self,
-        query: &str,
-    ) -> Result<SimpleQueryStream, Error> {
+    pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
        simple_query::simple_query(self.inner(), query).await
    }

@@ -234,7 +191,7 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn batch_execute(&mut self, query: &str) -> Result<ReadyForQueryStatus, Error> {
+    pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
        simple_query::batch_execute(self.inner(), query).await
    }

@@ -251,7 +208,7 @@ impl Client {
    /// The transaction will roll back by default - use the `commit` method to commit it.
    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
        struct RollbackIfNotDone<'me> {
-            client: &'me mut Client,
+            client: &'me Client,
            done: bool,
        }

@@ -265,7 +222,10 @@ impl Client {
                    frontend::query("ROLLBACK", buf).unwrap();
                    buf.split().freeze()
                });
-                let _ = self.client.inner().send(FrontendMessage::Raw(buf));
+                let _ = self
+                    .client
+                    .inner()
+                    .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
            }
        }

@@ -279,7 +239,7 @@ impl Client {
                client: self,
                done: false,
            };
-            cleaner.client.batch_execute("BEGIN").await?;
+            self.batch_execute("BEGIN").await?;
            cleaner.done = true;
        }

@@ -307,7 +267,7 @@ impl Client {

    /// Query for type information
    pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
-        crate::prepare::get_type(&mut self.inner, &mut self.cached_typeinfo, oid).await
+        crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
    }

    /// Determines if the connection to the server has already closed.
--- a/libs/proxy/tokio-postgres2/src/codec.rs
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -1,16 +1,21 @@
 use std::io;

-use bytes::{Bytes, BytesMut};
+use bytes::{Buf, Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use postgres_protocol2::message::backend;
+use postgres_protocol2::message::frontend::CopyData;
 use tokio_util::codec::{Decoder, Encoder};

 pub enum FrontendMessage {
    Raw(Bytes),
+    CopyData(CopyData<Box<dyn Buf + Send>>),
 }

 pub enum BackendMessage {
-    Normal { messages: BackendMessages },
+    Normal {
+        messages: BackendMessages,
+        request_complete: bool,
+    },
    Async(backend::Message),
 }

@@ -39,6 +44,7 @@ impl Encoder<FrontendMessage> for PostgresCodec {
    fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
        match item {
            FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf),
+            FrontendMessage::CopyData(data) => data.write(dst),
        }

        Ok(())
@@ -51,6 +57,7 @@ impl Decoder for PostgresCodec {

    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<BackendMessage>, io::Error> {
        let mut idx = 0;
+        let mut request_complete = false;

        while let Some(header) = backend::Header::parse(&src[idx..])? {
            let len = header.len() as usize + 1;
@@ -75,6 +82,7 @@ impl Decoder for PostgresCodec {
            idx += len;

            if header.tag() == backend::READY_FOR_QUERY_TAG {
+                request_complete = true;
                break;
            }
        }
@@ -84,6 +92,7 @@ impl Decoder for PostgresCodec {
        } else {
            Ok(Some(BackendMessage::Normal {
                messages: BackendMessages(src.split_to(idx)),
+                request_complete,
            }))
        }
    }
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -59,11 +59,9 @@ where
        connect_timeout: config.connect_timeout,
    };

-    let (client_tx, conn_rx) = mpsc::unbounded_channel();
-    let (conn_tx, client_rx) = mpsc::channel(4);
+    let (sender, receiver) = mpsc::unbounded_channel();
    let client = Client::new(
-        client_tx,
-        client_rx,
+        sender,
        socket_config,
        config.ssl_mode,
        process_id,
@@ -76,7 +74,7 @@ where
        .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
        .collect();

-    let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx);
+    let connection = Connection::new(stream, delayed, parameters, receiver);

    Ok((client, connection))
 }
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -4,6 +4,7 @@ use std::pin::Pin;
 use std::task::{Context, Poll};

 use bytes::BytesMut;
+use fallible_iterator::FallibleIterator;
 use futures_util::{Sink, Stream, ready};
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
@@ -18,12 +19,30 @@ use crate::error::DbError;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::{AsyncMessage, Error, Notification};

+pub enum RequestMessages {
+    Single(FrontendMessage),
+}
+
+pub struct Request {
+    pub messages: RequestMessages,
+    pub sender: mpsc::Sender<BackendMessages>,
+}
+
+pub struct Response {
+    sender: PollSender<BackendMessages>,
+}
+
 #[derive(PartialEq, Debug)]
 enum State {
    Active,
    Closing,
 }

+enum WriteReady {
+    Terminating,
+    WaitingOnRead,
+}
+
 /// A connection to a PostgreSQL database.
 ///
 /// This is one half of what is returned when a new connection is established. It performs the actual IO with the
@@ -37,11 +56,9 @@ pub struct Connection<S, T> {
    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
    /// HACK: we need this in the Neon Proxy to forward params.
    pub parameters: HashMap<String, String>,
-
-    sender: PollSender<BackendMessages>,
-    receiver: mpsc::UnboundedReceiver<FrontendMessage>,
-
+    receiver: mpsc::UnboundedReceiver<Request>,
    pending_responses: VecDeque<BackendMessage>,
+    responses: VecDeque<Response>,
    state: State,
 }

@@ -54,15 +71,14 @@ where
        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
        pending_responses: VecDeque<BackendMessage>,
        parameters: HashMap<String, String>,
-        sender: mpsc::Sender<BackendMessages>,
-        receiver: mpsc::UnboundedReceiver<FrontendMessage>,
+        receiver: mpsc::UnboundedReceiver<Request>,
    ) -> Connection<S, T> {
        Connection {
            stream,
            parameters,
-            sender: PollSender::new(sender),
            receiver,
            pending_responses,
+            responses: VecDeque::new(),
            state: State::Active,
        }
    }
@@ -94,7 +110,7 @@ where
                }
            };

-            let messages = match message {
+            let (mut messages, request_complete) = match message {
                BackendMessage::Async(Message::NoticeResponse(body)) => {
                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
                    return Poll::Ready(Ok(AsyncMessage::Notice(error)));
@@ -115,19 +131,41 @@ where
                    continue;
                }
                BackendMessage::Async(_) => unreachable!(),
-                BackendMessage::Normal { messages } => messages,
+                BackendMessage::Normal {
+                    messages,
+                    request_complete,
+                } => (messages, request_complete),
            };

-            match self.sender.poll_reserve(cx) {
+            let mut response = match self.responses.pop_front() {
+                Some(response) => response,
+                None => match messages.next().map_err(Error::parse)? {
+                    Some(Message::ErrorResponse(error)) => {
+                        return Poll::Ready(Err(Error::db(error)));
+                    }
+                    _ => return Poll::Ready(Err(Error::unexpected_message())),
+                },
+            };
+
+            match response.sender.poll_reserve(cx) {
                Poll::Ready(Ok(())) => {
-                    let _ = self.sender.send_item(messages);
+                    let _ = response.sender.send_item(messages);
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
                }
                Poll::Ready(Err(_)) => {
-                    return Poll::Ready(Err(Error::closed()));
+                    // we need to keep paging through the rest of the messages even if the receiver's hung up
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
                }
                Poll::Pending => {
-                    self.pending_responses
-                        .push_back(BackendMessage::Normal { messages });
+                    self.responses.push_front(response);
+                    self.pending_responses.push_back(BackendMessage::Normal {
+                        messages,
+                        request_complete,
+                    });
                    trace!("poll_read: waiting on sender");
                    return Poll::Pending;
                }
@@ -136,7 +174,7 @@ where
    }

    /// Fetch the next client request and enqueue the response sender.
-    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<FrontendMessage>> {
+    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
        if self.receiver.is_closed() {
            return Poll::Ready(None);
        }
@@ -144,7 +182,10 @@ where
        match self.receiver.poll_recv(cx) {
            Poll::Ready(Some(request)) => {
                trace!("polled new request");
-                Poll::Ready(Some(request))
+                self.responses.push_back(Response {
+                    sender: PollSender::new(request.sender),
+                });
+                Poll::Ready(Some(request.messages))
            }
            Poll::Ready(None) => Poll::Ready(None),
            Poll::Pending => Poll::Pending,
@@ -153,7 +194,7 @@ where

    /// Process client requests and write them to the postgres connection, flushing if necessary.
    /// client -> postgres
-    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<WriteReady, Error>> {
        loop {
            if Pin::new(&mut self.stream)
                .poll_ready(cx)
@@ -168,14 +209,14 @@ where

            match self.poll_request(cx) {
                // send the message to postgres
-                Poll::Ready(Some(request)) => {
+                Poll::Ready(Some(RequestMessages::Single(request))) => {
                    Pin::new(&mut self.stream)
                        .start_send(request)
                        .map_err(Error::io)?;
                }
                // No more messages from the client, and no more responses to wait for.
                // Send a terminate message to postgres
-                Poll::Ready(None) => {
+                Poll::Ready(None) if self.responses.is_empty() => {
                    trace!("poll_write: at eof, terminating");
                    let mut request = BytesMut::new();
                    frontend::terminate(&mut request);
@@ -187,7 +228,16 @@ where

                    trace!("poll_write: sent eof, closing");
                    trace!("poll_write: done");
-                    return Poll::Ready(Ok(()));
+                    return Poll::Ready(Ok(WriteReady::Terminating));
+                }
+                // No more messages from the client, but there are still some responses to wait for.
+                Poll::Ready(None) => {
+                    trace!(
+                        "poll_write: at eof, pending responses {}",
+                        self.responses.len()
+                    );
+                    ready!(self.poll_flush(cx))?;
+                    return Poll::Ready(Ok(WriteReady::WaitingOnRead));
                }
                // Still waiting for a message from the client.
                Poll::Pending => {
@@ -248,7 +298,7 @@ where
            // if the state is still active, try read from and write to postgres.
            let message = self.poll_read(cx)?;
            let closing = self.poll_write(cx)?;
-            if let Poll::Ready(()) = closing {
+            if let Poll::Ready(WriteReady::Terminating) = closing {
                self.state = State::Closing;
            }

--- a/libs/proxy/tokio-postgres2/src/error/mod.rs
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -86,27 +86,6 @@ pub struct DbError {
 }

 impl DbError {
-    pub fn new_test_error(code: SqlState, message: String) -> Self {
-        DbError {
-            severity: "ERROR".to_string(),
-            parsed_severity: Some(Severity::Error),
-            code,
-            message,
-            detail: None,
-            hint: None,
-            position: None,
-            where_: None,
-            schema: None,
-            table: None,
-            column: None,
-            datatype: None,
-            constraint: None,
-            file: None,
-            line: None,
-            routine: None,
-        }
-    }
-
    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
        let mut severity = None;
        let mut parsed_severity = None;
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -15,7 +15,7 @@ mod private {
 /// This trait is "sealed", and cannot be implemented outside of this crate.
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -28,7 +28,7 @@ pub trait GenericClient: private::Sealed {
 impl private::Sealed for Client {}

 impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -46,7 +46,7 @@ impl GenericClient for Client {
 impl private::Sealed for Transaction<'_> {}

 impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,5 +1,6 @@
 use std::future::Future;
 use std::pin::Pin;
+use std::sync::Arc;

 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
@@ -10,6 +11,7 @@ use tracing::debug;

 use crate::client::{CachedTypeInfo, InnerClient};
 use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
 use crate::types::{Kind, Oid, Type};
 use crate::{Column, Error, Statement, query, slice_iter};

@@ -22,13 +24,13 @@ WHERE t.oid = $1
 ";

 async fn prepare_typecheck(
-    client: &mut InnerClient,
+    client: &Arc<InnerClient>,
    name: &'static str,
    query: &str,
    types: &[Type],
 ) -> Result<Statement, Error> {
    let buf = encode(client, name, query, types)?;
-    let responses = client.send(FrontendMessage::Raw(buf))?;
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    match responses.next().await? {
        Message::ParseComplete => {}
@@ -63,15 +65,10 @@ async fn prepare_typecheck(
        }
    }

-    Ok(Statement::new(name, parameters, columns))
+    Ok(Statement::new(client, name, parameters, columns))
 }

-fn encode(
-    client: &mut InnerClient,
-    name: &str,
-    query: &str,
-    types: &[Type],
-) -> Result<Bytes, Error> {
+fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
    if types.is_empty() {
        debug!("preparing query {}: {}", name, query);
    } else {
@@ -87,7 +84,7 @@ fn encode(
 }

 pub async fn get_type(
-    client: &mut InnerClient,
+    client: &Arc<InnerClient>,
    typecache: &mut CachedTypeInfo,
    oid: Oid,
 ) -> Result<Type, Error> {
@@ -142,7 +139,7 @@ pub async fn get_type(
 }

 fn get_type_rec<'a>(
-    client: &'a mut InnerClient,
+    client: &'a Arc<InnerClient>,
    typecache: &'a mut CachedTypeInfo,
    oid: Oid,
 ) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
@@ -150,7 +147,7 @@ fn get_type_rec<'a>(
 }

 async fn typeinfo_statement(
-    client: &mut InnerClient,
+    client: &Arc<InnerClient>,
    typecache: &mut CachedTypeInfo,
 ) -> Result<Statement, Error> {
    if let Some(stmt) = &typecache.typeinfo {
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -1,10 +1,13 @@
 use std::fmt;
+use std::marker::PhantomPinned;
 use std::pin::Pin;
+use std::sync::Arc;
 use std::task::{Context, Poll};

 use bytes::{BufMut, Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
+use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use postgres_types2::{Format, ToSql, Type};
@@ -12,6 +15,7 @@ use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
 use crate::types::IsNull;
 use crate::{Column, Error, ReadyForQueryStatus, Row, Statement};

@@ -24,7 +28,7 @@ impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
 }

 pub async fn query<'a, I>(
-    client: &mut InnerClient,
+    client: &InnerClient,
    statement: Statement,
    params: I,
 ) -> Result<RowStream, Error>
@@ -45,19 +49,20 @@ where
    };
    let responses = start(client, buf).await?;
    Ok(RowStream {
-        responses,
        statement,
+        responses,
        command_tag: None,
        status: ReadyForQueryStatus::Unknown,
        output_format: Format::Binary,
+        _p: PhantomPinned,
    })
 }

-pub async fn query_txt<'a, S, I>(
-    client: &'a mut InnerClient,
+pub async fn query_txt<S, I>(
+    client: &Arc<InnerClient>,
    query: &str,
    params: I,
-) -> Result<RowStream<'a>, Error>
+) -> Result<RowStream, Error>
 where
    S: AsRef<str>,
    I: IntoIterator<Item = Option<S>>,
@@ -104,7 +109,7 @@ where
    })?;

    // now read the responses
-    let responses = client.send(FrontendMessage::Raw(buf))?;
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    match responses.next().await? {
        Message::ParseComplete => {}
@@ -145,16 +150,17 @@ where
    }

    Ok(RowStream {
-        responses,
        statement: Statement::new_anonymous(parameters, columns),
+        responses,
        command_tag: None,
        status: ReadyForQueryStatus::Unknown,
        output_format: Format::Text,
+        _p: PhantomPinned,
    })
 }

-async fn start(client: &mut InnerClient, buf: Bytes) -> Result<&mut Responses, Error> {
-    let responses = client.send(FrontendMessage::Raw(buf))?;
+async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    match responses.next().await? {
        Message::BindComplete => {}
@@ -164,11 +170,7 @@ async fn start(client: &mut InnerClient, buf: Bytes) -> Result<&mut Responses, E
    Ok(responses)
 }

-pub fn encode<'a, I>(
-    client: &mut InnerClient,
-    statement: &Statement,
-    params: I,
-) -> Result<Bytes, Error>
+pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
 where
    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
    I::IntoIter: ExactSizeIterator,
@@ -232,37 +234,41 @@ where
    }
 }

-/// A stream of table rows.
-pub struct RowStream<'a> {
-    responses: &'a mut Responses,
-    output_format: Format,
-    pub statement: Statement,
-    pub command_tag: Option<String>,
-    pub status: ReadyForQueryStatus,
+pin_project! {
+    /// A stream of table rows.
+    pub struct RowStream {
+        statement: Statement,
+        responses: Responses,
+        command_tag: Option<String>,
+        output_format: Format,
+        status: ReadyForQueryStatus,
+        #[pin]
+        _p: PhantomPinned,
+    }
 }

-impl Stream for RowStream<'_> {
+impl Stream for RowStream {
    type Item = Result<Row, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        let this = self.get_mut();
+        let this = self.project();
        loop {
            match ready!(this.responses.poll_next(cx)?) {
                Message::DataRow(body) => {
                    return Poll::Ready(Some(Ok(Row::new(
                        this.statement.clone(),
                        body,
-                        this.output_format,
+                        *this.output_format,
                    )?)));
                }
                Message::EmptyQueryResponse | Message::PortalSuspended => {}
                Message::CommandComplete(body) => {
                    if let Ok(tag) = body.tag() {
-                        this.command_tag = Some(tag.to_string());
+                        *this.command_tag = Some(tag.to_string());
                    }
                }
                Message::ReadyForQuery(status) => {
-                    this.status = status.into();
+                    *this.status = status.into();
                    return Poll::Ready(None);
                }
                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
@@ -270,3 +276,24 @@ impl Stream for RowStream<'_> {
        }
    }
 }
+
+impl RowStream {
+    /// Returns information about the columns of data in the row.
+    pub fn columns(&self) -> &[Column] {
+        self.statement.columns()
+    }
+
+    /// Returns the command tag of this query.
+    ///
+    /// This is only available after the stream has been exhausted.
+    pub fn command_tag(&self) -> Option<String> {
+        self.command_tag.clone()
+    }
+
+    /// Returns the connection status reported by the server's `ReadyForQuery` message.
+    ///
+    /// This might be available only after the stream has been exhausted.
+    pub fn ready_status(&self) -> ReadyForQueryStatus {
+        self.status
+    }
+}
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -1,3 +1,4 @@
+use std::marker::PhantomPinned;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -12,6 +13,7 @@ use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
 use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};

 /// Information about a column of a single query row.
@@ -31,30 +33,28 @@ impl SimpleColumn {
    }
 }

-pub async fn simple_query<'a>(
-    client: &'a mut InnerClient,
-    query: &str,
-) -> Result<SimpleQueryStream<'a>, Error> {
+pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
    debug!("executing simple query: {}", query);

    let buf = encode(client, query)?;
-    let responses = client.send(FrontendMessage::Raw(buf))?;
+    let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    Ok(SimpleQueryStream {
        responses,
        columns: None,
        status: ReadyForQueryStatus::Unknown,
+        _p: PhantomPinned,
    })
 }

 pub async fn batch_execute(
-    client: &mut InnerClient,
+    client: &InnerClient,
    query: &str,
 ) -> Result<ReadyForQueryStatus, Error> {
    debug!("executing statement batch: {}", query);

    let buf = encode(client, query)?;
-    let responses = client.send(FrontendMessage::Raw(buf))?;
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    loop {
        match responses.next().await? {
@@ -68,7 +68,7 @@ pub async fn batch_execute(
    }
 }

-pub(crate) fn encode(client: &mut InnerClient, query: &str) -> Result<Bytes, Error> {
+pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
    client.with_buf(|buf| {
        frontend::query(query, buf).map_err(Error::encode)?;
        Ok(buf.split().freeze())
@@ -77,14 +77,16 @@ pub(crate) fn encode(client: &mut InnerClient, query: &str) -> Result<Bytes, Err

 pin_project! {
    /// A stream of simple query results.
-    pub struct SimpleQueryStream<'a> {
-        responses: &'a mut Responses,
+    pub struct SimpleQueryStream {
+        responses: Responses,
        columns: Option<Arc<[SimpleColumn]>>,
        status: ReadyForQueryStatus,
+        #[pin]
+        _p: PhantomPinned,
    }
 }

-impl SimpleQueryStream<'_> {
+impl SimpleQueryStream {
    /// Returns if the connection is ready for querying, with the status of the connection.
    ///
    /// This might be available only after the stream has been exhausted.
@@ -93,7 +95,7 @@ impl SimpleQueryStream<'_> {
    }
 }

-impl Stream for SimpleQueryStream<'_> {
+impl Stream for SimpleQueryStream {
    type Item = Result<SimpleQueryMessage, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -1,16 +1,35 @@
 use std::fmt;
-use std::sync::Arc;
+use std::sync::{Arc, Weak};

-use crate::types::Type;
 use postgres_protocol2::Oid;
 use postgres_protocol2::message::backend::Field;
+use postgres_protocol2::message::frontend;
+
+use crate::client::InnerClient;
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::types::Type;

 struct StatementInner {
+    client: Weak<InnerClient>,
    name: &'static str,
    params: Vec<Type>,
    columns: Vec<Column>,
 }

+impl Drop for StatementInner {
+    fn drop(&mut self) {
+        if let Some(client) = self.client.upgrade() {
+            let buf = client.with_buf(|buf| {
+                frontend::close(b'S', self.name, buf).unwrap();
+                frontend::sync(buf);
+                buf.split().freeze()
+            });
+            let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+        }
+    }
+}
+
 /// A prepared statement.
 ///
 /// Prepared statements can only be used with the connection that created them.
@@ -18,8 +37,14 @@ struct StatementInner {
 pub struct Statement(Arc<StatementInner>);

 impl Statement {
-    pub(crate) fn new(name: &'static str, params: Vec<Type>, columns: Vec<Column>) -> Statement {
+    pub(crate) fn new(
+        inner: &Arc<InnerClient>,
+        name: &'static str,
+        params: Vec<Type>,
+        columns: Vec<Column>,
+    ) -> Statement {
        Statement(Arc::new(StatementInner {
+            client: Arc::downgrade(inner),
            name,
            params,
            columns,
@@ -28,6 +53,7 @@ impl Statement {

    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
        Statement(Arc::new(StatementInner {
+            client: Weak::new(),
            name: "<anonymous>",
            params,
            columns,
--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -1,6 +1,7 @@
 use postgres_protocol2::message::frontend;

 use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
 use crate::query::RowStream;
 use crate::{CancelToken, Client, Error, ReadyForQueryStatus};

@@ -23,7 +24,10 @@ impl Drop for Transaction<'_> {
            frontend::query("ROLLBACK", buf).unwrap();
            buf.split().freeze()
        });
-        let _ = self.client.inner().send(FrontendMessage::Raw(buf));
+        let _ = self
+            .client
+            .inner()
+            .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
    }
 }

@@ -50,11 +54,7 @@ impl<'a> Transaction<'a> {
    }

    /// Like `Client::query_raw_txt`.
-    pub async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -30,7 +30,6 @@ crc32c.workspace = true
 either.workspace = true
 fail.workspace = true
 futures.workspace = true
-hashlink.workspace = true
 hex.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -1,13 +0,0 @@
-[package]
-name = "pageserver_page_api"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-prost.workspace = true
-tonic.workspace = true
-workspace_hack.workspace = true
-
-[build-dependencies]
-tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -1,13 +0,0 @@
-use std::env;
-use std::path::PathBuf;
-
-/// Generates Rust code from .proto Protobuf schemas, along with a binary file
-/// descriptor set for Protobuf schema reflection.
-fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
-    tonic_build::configure()
-        .bytes(["."])
-        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
-        .compile_protos(&["proto/page_service.proto"], &["proto"])
-        .map_err(|err| err.into())
-}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -1,233 +0,0 @@
-// Page service, presented by pageservers for computes.
-//
-// This is the compute read path. It primarily serves page versions at given
-// LSNs, but also base backups, SLRU segments, and relation metadata.
-//
-// EXPERIMENTAL: this is still under development and subject to change.
-//
-// Request metadata headers:
-// - authorization: JWT token ("Bearer <token>"), if auth is enabled
-// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
-// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
-// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
-//
-// The service can be accessed via e.g. grpcurl:
-//
-//    ```
-//    grpcurl \
-//      -plaintext \
-//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
-//      -H "neon-shard-id: 0b10" \
-//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
-//      -H "authorization: Bearer $JWT" \
-//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
-//      localhost:51051 page_api.PageService/CheckRelExists
-//    ```
-//
-// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
-// However, this will require reconnecting when changing modes.
-//
-// TODO: write implementation guidance on
-// - Health checks
-// - Tracing, OpenTelemetry
-// - Compression
-
-syntax = "proto3";
-package page_api;
-
-service PageService {
-  // Returns whether a relation exists.
-  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
-
-  // Fetches a base backup.
-  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
-
-  // Returns the total size of a database, as # of bytes.
-  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
-
-  // Fetches pages.
-  //
-  // This is implemented as a bidirectional streaming RPC for performance. Unary
-  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
-  // authentication, and so on -- with streaming, we only pay these costs during
-  // the initial stream setup. This ~doubles throughput in benchmarks. Other
-  // RPCs use regular unary requests, since they are not as frequent and
-  // performance-critical, and this simplifies implementation.
-  //
-  // NB: a status response (e.g. errors) will terminate the stream. The stream
-  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
-  // Most errors are therefore sent as GetPageResponse.status instead.
-  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
-
-  // Returns the size of a relation, as # of blocks.
-  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
-
-  // Fetches an SLRU segment.
-  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
-}
-
-// The LSN a request should read at.
-message ReadLsn {
-  // The request's read LSN. Required.
-  uint64 request_lsn = 1;
-  // If given, the caller guarantees that the page has not been modified since
-  // this LSN. Must be smaller than or equal to request_lsn. This allows the
-  // Pageserver to serve an old page without waiting for the request LSN to
-  // arrive. Valid for all request types.
-  //
-  // It is undefined behaviour to make a request such that the page was, in
-  // fact, modified between request_lsn and not_modified_since_lsn. The
-  // Pageserver might detect it and return an error, or it might return the old
-  // page version or the new page version. Setting not_modified_since_lsn equal
-  // to request_lsn is always safe, but can lead to unnecessary waiting.
-  uint64 not_modified_since_lsn = 2;
-}
-
-// A relation identifier.
-message RelTag {
-    uint32 spc_oid = 1;
-    uint32 db_oid = 2;
-    uint32 rel_number = 3;
-    uint32 fork_number = 4;
-}
-
-// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
-// other shards will error.
-message CheckRelExistsRequest {
-  ReadLsn read_lsn = 1;
-  RelTag rel = 2;
-}
-
-message CheckRelExistsResponse {
-  bool exists = 1;
-}
-
-// Requests a base backup at a given LSN.
-message GetBaseBackupRequest {
-  // The LSN to fetch a base backup at.
-  ReadLsn read_lsn = 1;
-  // If true, logical replication slots will not be created.
-  bool replica = 2;
-}
-
-// Base backup response chunk, returned as an ordered stream.
-message GetBaseBackupResponseChunk {
-  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
-  // gRPC message size limit.
-  bytes chunk = 1;
-}
-
-// Requests the size of a database, as # of bytes. Only valid on shard 0, other
-// shards will error.
-message GetDbSizeRequest {
-  ReadLsn read_lsn = 1;
-  uint32 db_oid = 2;
-}
-
-message GetDbSizeResponse {
-  uint64 num_bytes = 1;
-}
-
-// Requests one or more pages.
-message GetPageRequest {
-  // A request ID. Will be included in the response. Should be unique for
-  // in-flight requests on the stream.
-  uint64 request_id = 1;
-  // The request class.
-  GetPageClass request_class = 2;
-  // The LSN to read at.
-  ReadLsn read_lsn = 3;
-  // The relation to read from.
-  RelTag rel = 4;
-  // Page numbers to read. Must belong to the remote shard.
-  //
-  // Multiple pages will be executed as a single batch by the Pageserver,
-  // amortizing layer access costs and parallelizing them. This may increase the
-  // latency of any individual request, but improves the overall latency and
-  // throughput of the batch as a whole.
-  //
-  // TODO: this causes an allocation in the common single-block case. The sender
-  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
-  // into a heap-allocated Vec. Consider optimizing this.
-  //
-  // TODO: we might be able to avoid a sort or something if we mandate that these
-  // are always in order. But we can't currenly rely on this on the server, because
-  // of compatibility with the libpq protocol handler.
-  repeated uint32 block_number = 5;
-}
-
-// A GetPageRequest class. Primarily intended for observability, but may also be
-// used for prioritization in the future.
-enum GetPageClass {
-  // Unknown class. For forwards compatibility: used when the client sends a
-  // class that the server doesn't know about.
-  GET_PAGE_CLASS_UNKNOWN = 0;
-  // A normal request. This is the default.
-  GET_PAGE_CLASS_NORMAL = 1;
-  // A prefetch request. NB: can only be classified on pg < 18.
-  GET_PAGE_CLASS_PREFETCH = 2;
-  // A background request (e.g. vacuum).
-  GET_PAGE_CLASS_BACKGROUND = 3;
-}
-
-// A GetPage response.
-//
-// A batch response will contain all of the requested pages. We could eagerly
-// emit individual pages as soon as they are ready, but on a readv() Postgres
-// holds buffer pool locks on all pages in the batch and we'll only return once
-// the entire batch is ready, so no one can make use of the individual pages.
-message GetPageResponse {
-  // The original request's ID.
-  uint64 request_id = 1;
-  // The response status code.
-  GetPageStatus status = 2;
-  // A string describing the status, if any.
-  string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status != OK.
-  repeated bytes page_image = 4;
-}
-
-// A GetPageResponse status code. Since we use a bidirectional stream, we don't
-// want to send errors as gRPC statuses, since this would terminate the stream.
-enum GetPageStatus {
-  // Unknown status. For forwards compatibility: used when the server sends a
-  // status code that the client doesn't know about.
-  GET_PAGE_STATUS_UNKNOWN = 0;
-  // The request was successful.
-  GET_PAGE_STATUS_OK = 1;
-  // The page did not exist. The tenant/timeline/shard has already been
-  // validated during stream setup.
-  GET_PAGE_STATUS_NOT_FOUND = 2;
-  // The request was invalid.
-  GET_PAGE_STATUS_INVALID = 3;
-  // The tenant is rate limited. Slow down and retry later.
-  GET_PAGE_STATUS_SLOW_DOWN = 4;
-  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
-  // layer download. This could free up the server task to process other
-  // requests while the layer download is in progress.
-}
-
-// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
-// shard 0, other shards will error.
-message GetRelSizeRequest {
-  ReadLsn read_lsn = 1;
-  RelTag rel = 2;
-}
-
-message GetRelSizeResponse {
-  uint32 num_blocks = 1;
-}
-
-// Requests an SLRU segment. Only valid on shard 0, other shards will error.
-message GetSlruSegmentRequest {
-  ReadLsn read_lsn = 1;
-  uint32 kind = 2;
-  uint32 segno = 3;
-}
-
-// Returns an SLRU segment.
-//
-// These are up 32 pages (256 KB), so we can send them as a single response.
-message GetSlruSegmentResponse {
-  bytes segment = 1;
-}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -1,19 +0,0 @@
-//! This crate provides the Pageserver's page API. It contains:
-//!
-//! * proto/page_service.proto: the Protobuf schema for the page API.
-//! * proto: auto-generated Protobuf types for gRPC.
-//!
-//! This crate is used by both the client and the server. Try to keep it slim.
-
-// Code generated by protobuf.
-pub mod proto {
-    tonic::include_proto!("page_api");
-
-    /// File descriptor set for Protobuf schema reflection. This allows using
-    /// e.g. grpcurl with the API.
-    pub const FILE_DESCRIPTOR_SET: &[u8] =
-        tonic::include_file_descriptor_set!("page_api_descriptor");
-
-    pub use page_service_client::PageServiceClient;
-    pub use page_service_server::{PageService, PageServiceServer};
-}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -144,7 +144,7 @@ where
        replica,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf.get_vectored_concurrent_io,
+            timeline.conf,
            timeline
                .gate
                .enter()
@@ -343,7 +343,7 @@ where
            // Gather non-relational files from object storage pages.
            let slru_partitions = self
                .timeline
-                .get_slru_keyspace(Version::at(self.lsn), self.ctx)
+                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
                .await?
                .partition(
                    self.timeline.get_shard_identity(),
@@ -378,7 +378,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -517,7 +517,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::at(self.lsn), self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -577,7 +577,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;

            if img.len()
@@ -631,7 +631,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -544,23 +544,6 @@ impl PageServerConf {
                    ratio.numerator, ratio.denominator
                )
            );
-
-            let url = Url::parse(&tracing_config.export_config.endpoint)
-                .map_err(anyhow::Error::msg)
-                .with_context(|| {
-                    format!(
-                        "tracing endpoint URL is invalid : {}",
-                        tracing_config.export_config.endpoint
-                    )
-                })?;
-
-            ensure!(
-                url.scheme() == "http" || url.scheme() == "https",
-                format!(
-                    "tracing endpoint URL must start with http:// or https://: {}",
-                    tracing_config.export_config.endpoint
-                )
-            );
        }

        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
@@ -677,25 +660,4 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }
-
-    #[test]
-    fn test_config_tracing_endpoint_is_invalid() {
-        let input = r#"
-            control_plane_api = "http://localhost:6666"
-
-            [tracing]
-
-            sampling_ratio = { numerator = 1, denominator = 0 }
-
-            [tracing.export_config]
-            endpoint = "localhost:4317"
-            protocol = "http-binary"
-            timeout = "1ms"
-        "#;
-        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
-            .expect("config has valid fields");
-        let workdir = Utf8PathBuf::from("/nonexistent");
-        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
-            .expect_err("parse_and_validate should fail for endpoint without scheme");
-    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
    // actually trimmed data to), which can pass each other when PITR is changed.
    let min_readable_lsn = std::cmp::max(
-        timeline.get_gc_cutoff_lsn().unwrap_or_default(),
+        timeline.get_gc_cutoff_lsn(),
        *timeline.get_applied_gc_cutoff_lsn(),
    );

@@ -3199,7 +3199,7 @@ async fn list_aux_files(
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
-        state.conf.get_vectored_concurrent_io,
+        state.conf,
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -843,50 +843,23 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
-        "pageserver_relsize_latest_cache_entries",
-        "Number of entries in the latest relation size cache",
+        "pageserver_relsize_cache_entries",
+        "Number of entries in the relation size cache",
    )
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
+        .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
-        "pageserver_relsize_latest_cache_hits",
-        "Latest relation size cache hits",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_relsize_latest_cache_misses",
-        "Relation size latest cache misses",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_relsize_snapshot_cache_entries",
-        "Number of entries in the pitr relation size cache",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_relsize_snapshot_cache_hits",
-        "Pitr relation size cache hits",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_relsize_snapshot_cache_misses",
-        "Relation size snapshot cache misses",
+        "pageserver_relsize_cache_misses",
+        "Relation size cache misses",
    )
    .expect("failed to define a metric")
 });
@@ -1066,15 +1039,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });

-pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_tenant_offloaded_timelines",
-        "Number of offloaded timelines of a tenant",
-        &["tenant_id", "shard_id"]
-    )
-    .expect("Failed to register pageserver_tenant_offloaded_timelines metric")
-});
-
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_eviction_iteration_duration_seconds_global",
@@ -3560,14 +3524,11 @@ impl TimelineMetrics {
 }

 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
-    let tid = tenant_shard_id.tenant_id.to_string();
-    let shard_id = tenant_shard_id.shard_slug().to_string();
-
    // Only shard zero deals in synthetic sizes
    if tenant_shard_id.is_shard_zero() {
+        let tid = tenant_shard_id.tenant_id.to_string();
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }
-    let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);

    tenant_throttling::remove_tenant_metrics(tenant_shard_id);

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
-    GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
@@ -62,7 +62,7 @@ use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
    SmgrOpTimer, TimelineMetrics,
 };
-use crate::pgdatadir_mapping::{LsnRange, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -331,10 +331,10 @@ async fn page_service_conn_main(
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(
+        conf,
        tenant_manager,
        auth,
        pipelining_config,
-        conf.get_vectored_concurrent_io,
        perf_span_fields,
        connection_ctx,
        cancel.clone(),
@@ -371,6 +371,7 @@ async fn page_service_conn_main(
 }

 struct PageServerHandler {
+    conf: &'static PageServerConf,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -388,7 +389,6 @@ struct PageServerHandler {
    timeline_handles: Option<TimelineHandles>,

    pipelining_config: PageServicePipeliningConfig,
-    get_vectored_concurrent_io: GetVectoredConcurrentIo,

    gate_guard: GateGuard,
 }
@@ -642,7 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
    timer: SmgrOpTimer,
-    lsn_range: LsnRange,
+    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }

@@ -764,12 +764,12 @@ impl BatchedFeMessage {
                match batching_strategy {
                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
                        if let Some(last_in_batch) = accum_pages.last() {
-                            if last_in_batch.lsn_range.effective_lsn
-                                != this_pages[0].lsn_range.effective_lsn
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
                            {
                                trace!(
-                                    accum_lsn = %last_in_batch.lsn_range.effective_lsn,
-                                    this_lsn = %this_pages[0].lsn_range.effective_lsn,
+                                    accum_lsn = %last_in_batch.effective_request_lsn,
+                                    this_lsn = %this_pages[0].effective_request_lsn,
                                    "stopping batching because LSN changed"
                                );

@@ -784,15 +784,15 @@ impl BatchedFeMessage {
                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
                            batched.req.rel == this_pages[0].req.rel
                                && batched.req.blkno == this_pages[0].req.blkno
-                                && batched.lsn_range.effective_lsn
-                                    != this_pages[0].lsn_range.effective_lsn
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
                        });

                        if same_page_different_lsn {
                            trace!(
                                rel=%this_pages[0].req.rel,
                                blkno=%this_pages[0].req.blkno,
-                                lsn=%this_pages[0].lsn_range.effective_lsn,
+                                lsn=%this_pages[0].effective_request_lsn,
                                "stopping batching because same page was requested at different LSNs"
                            );

@@ -844,16 +844,17 @@ impl BatchedFeMessage {
 impl PageServerHandler {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
+        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
-        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
    ) -> Self {
        PageServerHandler {
+            conf,
            auth,
            claims: None,
            connection_ctx,
@@ -861,7 +862,6 @@ impl PageServerHandler {
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
-            get_vectored_concurrent_io,
            gate_guard,
        }
    }
@@ -1158,7 +1158,7 @@ impl PageServerHandler {
                .await?;

                // We're holding the Handle
-                let effective_lsn = match Self::effective_request_lsn(
+                let effective_request_lsn = match Self::effective_request_lsn(
                    &shard,
                    shard.get_last_record_lsn(),
                    req.hdr.request_lsn,
@@ -1177,10 +1177,7 @@ impl PageServerHandler {
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
                        timer,
-                        lsn_range: LsnRange {
-                            effective_lsn,
-                            request_lsn: req.hdr.request_lsn
-                        },
+                        effective_request_lsn,
                        ctx,
                    }],
                    // The executor grabs the batch when it becomes idle.
@@ -1281,7 +1278,7 @@ impl PageServerHandler {
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all)]
-    async fn pagestream_handle_batched_message<IO>(
+    async fn pagesteam_handle_batched_message<IO>(
        &mut self,
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
@@ -1626,7 +1623,7 @@ impl PageServerHandler {
        }

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.get_vectored_concurrent_io,
+            self.conf,
            match self.gate_guard.try_clone() {
                Ok(guard) => guard,
                Err(_) => {
@@ -1736,7 +1733,7 @@ impl PageServerHandler {
            };

            let result = self
-                .pagestream_handle_batched_message(
+                .pagesteam_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
@@ -1912,7 +1909,7 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    self.pagestream_handle_batched_message(
+                    self.pagesteam_handle_batched_message(
                        pgb_writer,
                        batch,
                        io_concurrency.clone(),
@@ -2130,14 +2127,7 @@ impl PageServerHandler {
        .await?;

        let exists = timeline
-            .get_rel_exists(
-                req.rel,
-                Version::LsnRange(LsnRange {
-                    effective_lsn: lsn,
-                    request_lsn: req.hdr.request_lsn,
-                }),
-                ctx,
-            )
+            .get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -2164,14 +2154,7 @@ impl PageServerHandler {
        .await?;

        let n_blocks = timeline
-            .get_rel_size(
-                req.rel,
-                Version::LsnRange(LsnRange {
-                    effective_lsn: lsn,
-                    request_lsn: req.hdr.request_lsn,
-                }),
-                ctx,
-            )
+            .get_rel_size(req.rel, Version::Lsn(lsn), ctx)
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -2198,15 +2181,7 @@ impl PageServerHandler {
        .await?;

        let total_blocks = timeline
-            .get_db_size(
-                DEFAULTTABLESPACE_OID,
-                req.dbnode,
-                Version::LsnRange(LsnRange {
-                    effective_lsn: lsn,
-                    request_lsn: req.hdr.request_lsn,
-                }),
-                ctx,
-            )
+            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -2239,7 +2214,7 @@ impl PageServerHandler {
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
-                    effective_lsn: batch.lsn_range.effective_lsn,
+                    effective_lsn: batch.effective_request_lsn,
                    time,
                });
            }
@@ -2254,7 +2229,7 @@ impl PageServerHandler {
                    perf_instrument = true;
                }

-                req.lsn_range.effective_lsn
+                req.effective_request_lsn
            })
            .max()
            .expect("batch is never empty");
@@ -2308,7 +2283,7 @@ impl PageServerHandler {
                    (
                        &p.req.rel,
                        &p.req.blkno,
-                        p.lsn_range,
+                        p.effective_request_lsn,
                        p.ctx.attached_child(),
                    )
                }),
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,9 +43,7 @@ use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
-    RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
-    RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
-    RELSIZE_SNAPSHOT_CACHE_MISSES,
+    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
 };
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
@@ -92,28 +90,6 @@ pub enum LsnForTimestamp {
    NoData(Lsn),
 }

-/// Each request to page server contains LSN range: `not_modified_since..request_lsn`.
-/// See comments libs/pageserver_api/src/models.rs.
-/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`.
-/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`.
-#[derive(Debug, Clone, Copy, Default)]
-pub struct LsnRange {
-    pub effective_lsn: Lsn,
-    pub request_lsn: Lsn,
-}
-
-impl LsnRange {
-    pub fn at(lsn: Lsn) -> LsnRange {
-        LsnRange {
-            effective_lsn: lsn,
-            request_lsn: lsn,
-        }
-    }
-    pub fn is_latest(&self) -> bool {
-        self.request_lsn == Lsn::MAX
-    }
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CalculateLogicalSizeError {
    #[error("cancelled")]
@@ -226,13 +202,13 @@ impl Timeline {
        io_concurrency: IoConcurrency,
    ) -> Result<Bytes, PageReconstructError> {
        match version {
-            Version::LsnRange(lsns) => {
+            Version::Lsn(effective_lsn) => {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages
-                            .iter()
-                            .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
+                        pages.iter().map(|(tag, blknum)| {
+                            (tag, blknum, effective_lsn, ctx.attached_child())
+                        }),
                        io_concurrency.clone(),
                        ctx,
                    )
@@ -270,7 +246,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -289,7 +265,7 @@ impl Timeline {
        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
            HashMap::with_capacity(pages.len());

-        for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
+        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -298,7 +274,7 @@ impl Timeline {
                slots_filled += 1;
                continue;
            }
-            let lsn = lsns.effective_lsn;
+
            let nblocks = {
                let ctx = RequestContextBuilder::from(&ctx)
                    .perf_span(|crnt_perf_span| {
@@ -313,7 +289,7 @@ impl Timeline {
                    .attached_child();

                match self
-                    .get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
+                    .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                    .await
                {
@@ -494,7 +470,7 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(nblocks);
        }

@@ -512,7 +488,7 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        self.update_cached_rel_size(tag, version, nblocks);
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);

        Ok(nblocks)
    }
@@ -534,7 +510,7 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(true);
        }
        // then check if the database was already initialized.
@@ -610,7 +586,7 @@ impl Timeline {
        // scan directory listing (new), merge with the old results
        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf.get_vectored_concurrent_io,
+            self.conf,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -656,7 +632,7 @@ impl Timeline {
    ) -> Result<Bytes, PageReconstructError> {
        assert!(self.tenant_shard_id.is_shard_zero());
        let n_blocks = self
-            .get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
+            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
            .await?;

        let keyspace = KeySpace::single(
@@ -669,7 +645,7 @@ impl Timeline {
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf.get_vectored_concurrent_io,
+            self.conf,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -891,11 +867,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                .await?;

            let keyspace = KeySpace::single(
@@ -909,7 +885,7 @@ impl Timeline {
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf.get_vectored_concurrent_io,
+                self.conf,
                self.gate
                    .enter()
                    .map_err(|_| PageReconstructError::Cancelled)?,
@@ -1161,7 +1137,7 @@ impl Timeline {
        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
+                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
                .await?
            {
                if self.cancel.is_cancelled() {
@@ -1236,7 +1212,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::at(lsn), ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
                .await?
                .into_iter()
                .collect();
@@ -1353,75 +1329,59 @@ impl Timeline {
        Ok((dense_keyspace, sparse_keyspace))
    }

-    /// Get cached size of relation. There are two caches: one for primary updates, it captures the latest state of
-    /// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size
-    /// at the particular LSN (snapshot).
-    pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
-        let lsn = version.get_lsn();
-        {
-            let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
-            if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
-                if lsn >= *cached_lsn {
-                    RELSIZE_LATEST_CACHE_HITS.inc();
-                    return Some(*nblocks);
-                }
-                RELSIZE_CACHE_MISSES_OLD.inc();
+    /// Get the cached size of a relation if it was not updated after the specified LSN
+    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
+        let rel_size_cache = self.rel_size_cache.read().unwrap();
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
+            if lsn >= *cached_lsn {
+                RELSIZE_CACHE_HITS.inc();
+                return Some(*nblocks);
            }
+            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-        {
-            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
-            if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
-                RELSIZE_SNAPSHOT_CACHE_HITS.inc();
-                return Some(*nblock);
-            }
-        }
-        if version.is_latest() {
-            RELSIZE_LATEST_CACHE_MISSES.inc();
-        } else {
-            RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
-        }
+        RELSIZE_CACHE_MISSES.inc();
        None
    }

    /// Update cached relation size if there is no more recent update
-    pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
-        let lsn = version.get_lsn();
-        if version.is_latest() {
-            let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
-            match rel_size_cache.entry(tag) {
-                hash_map::Entry::Occupied(mut entry) => {
-                    let cached_lsn = entry.get_mut();
-                    if lsn >= cached_lsn.0 {
-                        *cached_lsn = (lsn, nblocks);
-                    }
-                }
-                hash_map::Entry::Vacant(entry) => {
-                    entry.insert((lsn, nblocks));
-                    RELSIZE_LATEST_CACHE_ENTRIES.inc();
+    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
+            hash_map::Entry::Occupied(mut entry) => {
+                let cached_lsn = entry.get_mut();
+                if lsn >= cached_lsn.0 {
+                    *cached_lsn = (lsn, nblocks);
                }
            }
-        } else {
-            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
-            if rel_size_cache.capacity() != 0 {
-                rel_size_cache.insert((lsn, tag), nblocks);
-                RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
+            hash_map::Entry::Vacant(entry) => {
+                entry.insert((lsn, nblocks));
+                RELSIZE_CACHE_ENTRIES.inc();
            }
        }
    }

    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
-        if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
-            RELSIZE_LATEST_CACHE_ENTRIES.inc();
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_CACHE_ENTRIES.inc();
        }
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
-        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
-        if rel_size_cache.remove(tag).is_some() {
-            RELSIZE_LATEST_CACHE_ENTRIES.dec();
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.remove(tag).is_some() {
+            RELSIZE_CACHE_ENTRIES.dec();
        }
    }
 }
@@ -1625,10 +1585,7 @@ impl DatadirModification<'_> {
        //       check the cache too. This is because eagerly checking the cache results in
        //       less work overall and 10% better performance. It's more work on cache miss
        //       but cache miss is rare.
-        if let Some(nblocks) = self
-            .tline
-            .get_cached_rel_size(&rel, Version::Modified(self))
-        {
+        if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
            Ok(nblocks)
        } else if !self
            .tline
@@ -2710,7 +2667,7 @@ pub struct DatadirModificationStats {
 /// timeline to not miss the latest updates.
 #[derive(Clone, Copy)]
 pub enum Version<'a> {
-    LsnRange(LsnRange),
+    Lsn(Lsn),
    Modified(&'a DatadirModification<'a>),
 }

@@ -2722,7 +2679,7 @@ impl Version<'_> {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        match self {
-            Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
+            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
            Version::Modified(modification) => modification.get(key, ctx).await,
        }
    }
@@ -2744,26 +2701,12 @@ impl Version<'_> {
        }
    }

-    pub fn is_latest(&self) -> bool {
+    fn get_lsn(&self) -> Lsn {
        match self {
-            Version::LsnRange(lsns) => lsns.is_latest(),
-            Version::Modified(_) => true,
-        }
-    }
-
-    pub fn get_lsn(&self) -> Lsn {
-        match self {
-            Version::LsnRange(lsns) => lsns.effective_lsn,
+            Version::Lsn(lsn) => *lsn,
            Version::Modified(modification) => modification.lsn,
        }
    }
-
-    pub fn at(lsn: Lsn) -> Self {
-        Version::LsnRange(LsnRange {
-            effective_lsn: lsn,
-            request_lsn: lsn,
-        })
-    }
 }

 //--- Metadata structs stored in key-value pairs in the repository.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -86,8 +86,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
-    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
-    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
+    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -3348,13 +3348,6 @@ impl TenantShard {
                activated_timelines += 1;
            }

-            let tid = self.tenant_shard_id.tenant_id.to_string();
-            let shard_id = self.tenant_shard_id.shard_slug().to_string();
-            let offloaded_timeline_count = timelines_offloaded_accessor.len();
-            TENANT_OFFLOADED_TIMELINES
-                .with_label_values(&[&tid, &shard_id])
-                .set(offloaded_timeline_count as u64);
-
            self.state.send_modify(move |current_state| {
                assert!(
                    matches!(current_state, TenantState::Activating(_)),
@@ -4594,7 +4587,7 @@ impl TenantShard {

            target.cutoffs = GcCutoffs {
                space: space_cutoff,
-                time: None,
+                time: Lsn::INVALID,
            };
        }
    }
@@ -4678,7 +4671,7 @@ impl TenantShard {
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
                        target.within_ancestor_pitr =
-                            Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
+                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
                    }
                }

@@ -4691,15 +4684,13 @@ impl TenantShard {
                    } else {
                        0
                    });
-                if let Some(time_cutoff) = target.cutoffs.time {
-                    timeline.metrics.pitr_history_size.set(
-                        timeline
-                            .get_last_record_lsn()
-                            .checked_sub(time_cutoff)
-                            .unwrap_or_default()
-                            .0,
-                    );
-                }
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.time)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -4708,8 +4699,8 @@ impl TenantShard {
                    let original_cutoffs = target.cutoffs.clone();
                    // GC cutoffs should never go back
                    target.cutoffs = GcCutoffs {
-                        space: cutoffs.space.max(original_cutoffs.space),
-                        time: cutoffs.time.max(original_cutoffs.time),
+                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
+                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
                    }
                }
            }
@@ -5569,14 +5560,6 @@ impl TenantShard {
            }
        }

-        // Update metrics
-        let tid = self.tenant_shard_id.to_string();
-        let shard_id = self.tenant_shard_id.shard_slug().to_string();
-        let set_key = &[tid.as_str(), shard_id.as_str()][..];
-        TENANT_OFFLOADED_TIMELINES
-            .with_label_values(set_key)
-            .set(manifest.offloaded_timelines.len() as u64);
-
        // Upload the manifest. Remote storage does no retries internally, so retry here.
        match backoff::retry(
            || async {
@@ -8613,10 +8596,8 @@ mod tests {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let io_concurrency = IoConcurrency::spawn_from_conf(
-            tline.conf.get_vectored_concurrent_io,
-            tline.gate.enter().unwrap(),
-        );
+        let io_concurrency =
+            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
@@ -8954,7 +8935,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Some(Lsn(0x30));
+            guard.cutoffs.time = Lsn(0x30);
            guard.cutoffs.space = Lsn(0x30);
        }

@@ -9062,7 +9043,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Some(Lsn(0x40));
+            guard.cutoffs.time = Lsn(0x40);
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -9480,7 +9461,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -9564,7 +9545,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Some(Lsn(0x40));
+            guard.cutoffs.time = Lsn(0x40);
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -10035,7 +10016,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10098,7 +10079,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time.unwrap_or_default()
+                gc_info.cutoffs.time
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10176,7 +10157,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Some(Lsn(0x38));
+            guard.cutoffs.time = Lsn(0x38);
            guard.cutoffs.space = Lsn(0x38);
        }
        tline
@@ -10284,7 +10265,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10347,7 +10328,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time.unwrap_or_default()
+                gc_info.cutoffs.time
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10533,7 +10514,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x10)),
+                    time: Lsn(0x10),
                    space: Lsn(0x10),
                },
                leases: Default::default(),
@@ -10553,7 +10534,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x50)),
+                    time: Lsn(0x50),
                    space: Lsn(0x50),
                },
                leases: Default::default(),
@@ -11274,7 +11255,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11663,7 +11644,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11726,7 +11707,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time.unwrap_or_default()
+                gc_info.cutoffs.time
            };
            for idx in 0..10 {
                assert_eq!(
@@ -11915,7 +11896,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11978,7 +11959,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time.unwrap_or_default()
+                gc_info.cutoffs.time
            };
            for idx in 0..10 {
                assert_eq!(
@@ -12241,7 +12222,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Some(Lsn(0x30)),
+                    time: Lsn(0x30),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
        // than our internal space cutoff.  This means that if someone drops a database and waits for their
        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
        // the space cutoff.
-        let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None
+        let mut next_pitr_cutoff = gc_info.cutoffs.time;

        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -31,7 +31,6 @@ pub use inmemory_layer::InMemoryLayer;
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
-use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
@@ -44,6 +43,7 @@ use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
+use crate::config::PageServerConf;
 use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
@@ -318,10 +318,11 @@ impl IoConcurrency {
    }

    pub(crate) fn spawn_from_conf(
-        conf: GetVectoredConcurrentIo,
+        conf: &'static PageServerConf,
        gate_guard: GateGuard,
    ) -> IoConcurrency {
-        let selected = match conf {
+        use pageserver_api::config::GetVectoredConcurrentIo;
+        let selected = match conf.get_vectored_concurrent_io {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -63,28 +63,7 @@ pub struct InMemoryLayer {

    opened_at: Instant,

-    /// All versions of all pages in the layer are kept here. Indexed
-    /// by block number and LSN. The [`IndexEntry`] is an offset into the
-    /// ephemeral file where the page version is stored.
-    ///
-    /// We use a separate lock for the index to reduce the critical section
-    /// during which reads cannot be planned.
-    ///
-    /// If you need access to both the index and the underlying file at the same time,
-    /// respect the following locking order to avoid deadlocks:
-    /// 1. [`InMemoryLayer::inner`]
-    /// 2. [`InMemoryLayer::index`]
-    ///
-    /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
-    /// so it is not necessary to hold simultaneous locks on index.
-    /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
-    /// In particular:
-    /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
-    /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
-    index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
-
-    /// The above fields never change, except for `end_lsn`, which is only set once,
-    /// and `index` (see rationale there).
+    /// The above fields never change, except for `end_lsn`, which is only set once.
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,

@@ -102,6 +81,11 @@ impl std::fmt::Debug for InMemoryLayer {
 }

 pub struct InMemoryLayerInner {
+    /// All versions of all pages in the layer are kept here. Indexed
+    /// by block number and LSN. The [`IndexEntry`] is an offset into the
+    /// ephemeral file where the page version is stored.
+    index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
+
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
@@ -121,7 +105,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
    trailing_ones
 };

-/// See [`InMemoryLayer::index`].
+/// See [`InMemoryLayerInner::index`].
 ///
 /// For memory efficiency, the data is packed into a u64.
 ///
@@ -441,7 +425,7 @@ impl InMemoryLayer {
            .page_content_kind(PageContentKind::InMemoryLayer)
            .attached_child();

-        let index = self.index.read().await;
+        let inner = self.inner.read().await;

        struct ValueRead {
            entry_lsn: Lsn,
@@ -451,7 +435,10 @@ impl InMemoryLayer {
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
+            for (key, vec_map) in inner
+                .index
+                .range(range.start.to_compact()..range.end.to_compact())
+            {
                let key = Key::from_compact(*key);
                let slice = vec_map.slice_range(lsn_range.clone());

@@ -479,7 +466,7 @@ impl InMemoryLayer {
                }
            }
        }
-        drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
+        drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
@@ -586,8 +573,8 @@ impl InMemoryLayer {
            start_lsn,
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
-            index: RwLock::new(BTreeMap::new()),
            inner: RwLock::new(InMemoryLayerInner {
+                index: BTreeMap::new(),
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
@@ -605,39 +592,31 @@ impl InMemoryLayer {
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let (base_offset, metadata) = {
-            let mut inner = self.inner.write().await;
-            self.assert_writable();
+        let mut inner = self.inner.write().await;
+        self.assert_writable();

-            let base_offset = inner.file.len();
+        let base_offset = inner.file.len();

-            let SerializedValueBatch {
-                raw,
-                metadata,
-                max_lsn: _,
-                len: _,
-            } = serialized_batch;
+        let SerializedValueBatch {
+            raw,
+            metadata,
+            max_lsn: _,
+            len: _,
+        } = serialized_batch;

-            // Write the batch to the file
-            inner.file.write_raw(&raw, ctx).await?;
-            let new_size = inner.file.len();
+        // Write the batch to the file
+        inner.file.write_raw(&raw, ctx).await?;
+        let new_size = inner.file.len();

-            let expected_new_len = base_offset
-                .checked_add(raw.len().into_u64())
-                // write_raw would error if we were to overflow u64.
-                // also IndexEntry and higher levels in
-                //the code don't allow the file to grow that large
-                .unwrap();
-            assert_eq!(new_size, expected_new_len);
-
-            inner.resource_units.maybe_publish_size(new_size);
-
-            (base_offset, metadata)
-        };
+        let expected_new_len = base_offset
+            .checked_add(raw.len().into_u64())
+            // write_raw would error if we were to overflow u64.
+            // also IndexEntry and higher levels in
+            // the code don't allow the file to grow that large
+            .unwrap();
+        assert_eq!(new_size, expected_new_len);

        // Update the index with the new entries
-        let mut index = self.index.write().await;
-
        for meta in metadata {
            let SerializedValueMeta {
                key,
@@ -660,7 +639,7 @@ impl InMemoryLayer {
                will_init,
            })?;

-            let vec_map = index.entry(key).or_default();
+            let vec_map = inner.index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
                // This should not break anything, but is unexpected: ingestion code aims to filter out
@@ -679,6 +658,8 @@ impl InMemoryLayer {
            );
        }

+        inner.resource_units.maybe_publish_size(new_size);
+
        Ok(())
    }

@@ -699,18 +680,6 @@ impl InMemoryLayer {

    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    ///
-    /// A note on locking:
-    /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
-    /// writes while freezing the layer. This is enforced at a higher level via
-    /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
-    /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
-    ///    Timeline::write_lock for its lifetime. The rolling is handled in
-    ///    [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
-    ///    so can't be called from different threads.
-    /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
-    ///    This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
-    ///    hence there can be no concurrent writes
    pub async fn freeze(&self, end_lsn: Lsn) {
        assert!(
            self.start_lsn < end_lsn,
@@ -731,8 +700,8 @@ impl InMemoryLayer {

        #[cfg(debug_assertions)]
        {
-            let index = self.index.read().await;
-            for vec_map in index.values() {
+            let inner = self.inner.write().await;
+            for vec_map in inner.index.values() {
                for (lsn, _) in vec_map.as_slice() {
                    assert!(*lsn < end_lsn);
                }
@@ -755,11 +724,14 @@ impl InMemoryLayer {
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
-        // write lock on it, so we shouldn't block anyone. See the comment on
-        // [`InMemoryLayer::freeze`] to understand how locking between the append path
-        // and layer flushing works.
+        // write lock on it, so we shouldn't block anyone. There's one exception
+        // though: another thread might have grabbed a reference to this layer
+        // in `get_layer_for_write` just before the checkpointer called
+        // `freeze`, and then `write_to_disk` on it. When the thread gets the
+        // lock, it will see that it's not writeable anymore and retry, but it
+        // would have to wait until we release it. That race condition is very
+        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;
-        let index = self.index.read().await;

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
@@ -771,9 +743,13 @@ impl InMemoryLayer {
        let key_count = if let Some(key_range) = key_range {
            let key_range = key_range.start.to_compact()..key_range.end.to_compact();

-            index.iter().filter(|(k, _)| key_range.contains(k)).count()
+            inner
+                .index
+                .iter()
+                .filter(|(k, _)| key_range.contains(k))
+                .count()
        } else {
-            index.len()
+            inner.index.len()
        };
        if key_count == 0 {
            return Ok(None);
@@ -796,7 +772,7 @@ impl InMemoryLayer {
                let file_contents = inner.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

-                for (key, vec_map) in index.iter() {
+                for (key, vec_map) in inner.index.iter() {
                    // Write all page versions
                    for (lsn, entry) in vec_map
                        .as_slice()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,7 +14,6 @@ pub mod span;
 pub mod uninit;
 mod walreceiver;

-use hashlink::LruCache;
 use std::array;
 use std::cmp::{max, min};
 use std::collections::btree_map::Entry;
@@ -198,6 +197,16 @@ pub struct TimelineResources {
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

+/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
+/// ingestion considerably, because WAL ingestion needs to check on most records if the record
+/// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
+/// of the timeline (disk_consistent_lsn).  It's used on reads of relation sizes to check if the
+/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
+pub(crate) struct RelSizeCache {
+    pub(crate) complete_as_of: Lsn,
+    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+}
+
 pub struct Timeline {
    pub(crate) conf: &'static PageServerConf,
    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
@@ -356,8 +365,7 @@ pub struct Timeline {
    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
-    pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
-    pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,
+    pub(crate) rel_size_cache: RwLock<RelSizeCache>,

    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -529,24 +537,29 @@ impl GcInfo {
 /// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Clone, Debug, Default)]
+#[derive(Debug, Clone)]
 pub(crate) struct GcCutoffs {
    /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
    /// history we must keep to retain a specified number of bytes of WAL.
    pub(crate) space: Lsn,

-    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
-    /// how much history we must keep to enable reading back at least the PITR interval duration.
-    ///
-    /// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
-    /// Some(last_record_lsn).
-    pub(crate) time: Option<Lsn>,
+    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
+    /// history we must keep to enable reading back at least the PITR interval duration.
+    pub(crate) time: Lsn,
+}
+
+impl Default for GcCutoffs {
+    fn default() -> Self {
+        Self {
+            space: Lsn::INVALID,
+            time: Lsn::INVALID,
+        }
+    }
 }

 impl GcCutoffs {
    fn select_min(&self) -> Lsn {
-        // NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
-        self.space.min(self.time.unwrap_or_default())
+        std::cmp::min(self.space, self.time)
    }
 }

@@ -1083,14 +1096,11 @@ impl Timeline {
    /// Get the bytes written since the PITR cutoff on this branch, and
    /// whether this branch's ancestor_lsn is within its parent's PITR.
    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
-        // TODO: for backwards compatibility, we return the full history back to 0 when the PITR
-        // cutoff has not yet been initialized. This should return None instead, but this is exposed
-        // in external HTTP APIs and callers may not handle a null value.
        let gc_info = self.gc_info.read().unwrap();
        let history = self
            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.time.unwrap_or_default())
-            .unwrap_or_default()
+            .checked_sub(gc_info.cutoffs.time)
+            .unwrap_or(Lsn(0))
            .0;
        (history, gc_info.within_ancestor_pitr)
    }
@@ -1100,10 +1110,9 @@ impl Timeline {
        self.applied_gc_cutoff_lsn.read()
    }

-    /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
-    /// to read (based on configured PITR), even if physically we have more history. Returns None
-    /// if the PITR cutoff has not yet been initialized.
-    pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
+    /// Read timeline's planned GC cutoff: this is the logical end of history that users
+    /// are allowed to read (based on configured PITR), even if physically we have more history.
+    pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
        self.gc_info.read().unwrap().cutoffs.time
    }

@@ -2811,13 +2820,6 @@ impl Timeline {

            self.remote_client.update_config(&new_conf.location);

-            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
-            if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
-                if new_capacity != rel_size_cache.capacity() {
-                    rel_size_cache.set_capacity(new_capacity);
-                }
-            }
-
            self.metrics
                .evictions_with_low_residence_duration
                .write()
@@ -2876,14 +2878,6 @@ impl Timeline {
            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
        }

-        let relsize_snapshot_cache_capacity = {
-            let loaded_tenant_conf = tenant_conf.load();
-            loaded_tenant_conf
-                .tenant_conf
-                .relsize_snapshot_cache_capacity
-                .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
-        };
-
        Arc::new_cyclic(|myself| {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
@@ -2975,8 +2969,10 @@ impl Timeline {
                last_image_layer_creation_check_instant: Mutex::new(None),

                last_received_wal: Mutex::new(None),
-                rel_size_latest_cache: RwLock::new(HashMap::new()),
-                rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
+                rel_size_cache: RwLock::new(RelSizeCache {
+                    complete_as_of: disk_consistent_lsn,
+                    map: HashMap::new(),
+                }),

                download_all_remote_layers_task_info: RwLock::new(None),

@@ -3534,7 +3530,7 @@ impl Timeline {
                };

                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    self_ref.conf.get_vectored_concurrent_io,
+                    self_ref.conf,
                    self_ref
                        .gate
                        .enter()
@@ -5563,7 +5559,7 @@ impl Timeline {
            });

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf.get_vectored_concurrent_io,
+                self.conf,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
@@ -6234,12 +6230,14 @@ impl Timeline {

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

-        if cfg!(test) && pitr == Duration::ZERO {
+        if cfg!(test) {
            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
-            return Ok(GcCutoffs {
-                time: Some(self.get_last_record_lsn()),
-                space: space_cutoff,
-            });
+            if pitr == Duration::ZERO {
+                return Ok(GcCutoffs {
+                    time: self.get_last_record_lsn(),
+                    space: space_cutoff,
+                });
+            }
        }

        // Calculate a time-based limit on how much to retain:
@@ -6253,14 +6251,14 @@ impl Timeline {
                // PITR is not set. Retain the size-based limit, or the default time retention,
                // whichever requires less data.
                GcCutoffs {
-                    time: Some(self.get_last_record_lsn()),
+                    time: self.get_last_record_lsn(),
                    space: std::cmp::max(time_cutoff, space_cutoff),
                }
            }
            (Duration::ZERO, None) => {
                // PITR is not set, and time lookup failed
                GcCutoffs {
-                    time: Some(self.get_last_record_lsn()),
+                    time: self.get_last_record_lsn(),
                    space: space_cutoff,
                }
            }
@@ -6268,7 +6266,7 @@ impl Timeline {
                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
                // cannot advance beyond what was already GC'd, and respect space-based retention
                GcCutoffs {
-                    time: Some(*self.get_applied_gc_cutoff_lsn()),
+                    time: *self.get_applied_gc_cutoff_lsn(),
                    space: space_cutoff,
                }
            }
@@ -6276,7 +6274,7 @@ impl Timeline {
                // PITR interval is set and we looked up timestamp successfully.  Ignore
                // size based retention and make time cutoff authoritative
                GcCutoffs {
-                    time: Some(time_cutoff),
+                    time: time_cutoff,
                    space: time_cutoff,
                }
            }
@@ -6329,7 +6327,7 @@ impl Timeline {
            )
        };

-        let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
+        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
        let standby_horizon = self.standby_horizon.load();
        // Hold GC for the standby, but as a safety guard do it only within some
        // reasonable lag.
@@ -6378,7 +6376,7 @@ impl Timeline {
    async fn gc_timeline(
        &self,
        space_cutoff: Lsn,
-        time_cutoff: Option<Lsn>, // None if uninitialized
+        time_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
@@ -6397,12 +6395,6 @@ impl Timeline {
            return Ok(result);
        }

-        let Some(time_cutoff) = time_cutoff else {
-            // The GC cutoff should have been computed by now, but let's be defensive.
-            info!("Nothing to GC: time_cutoff not yet computed");
-            return Ok(result);
-        };
-
        // We need to ensure that no one tries to read page versions or create
        // branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
        // for details. This will block until the old value is no longer in use.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1526,7 +1526,7 @@ impl Timeline {
        info!(
            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
                checked {layers_checked}/{layers_total} layers \
-                (latest_gc_cutoff={} pitr_cutoff={:?})",
+                (latest_gc_cutoff={} pitr_cutoff={})",
            layers_to_rewrite.len(),
            drop_layers.len(),
            *latest_gc_cutoff,
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
    );
    let io_concurrency = IoConcurrency::spawn_from_conf(
-        detached.conf.get_vectored_concurrent_io,
+        detached.conf,
        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
    );
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1684,31 +1684,31 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1719,7 +1719,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x20)),
+                    Version::Lsn(Lsn(0x20)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1733,7 +1733,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x30)),
+                    Version::Lsn(Lsn(0x30)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1747,7 +1747,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x40)),
+                    Version::Lsn(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1760,7 +1760,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::at(Lsn(0x40)),
+                    Version::Lsn(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1774,7 +1774,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x50)),
+                    Version::Lsn(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1787,7 +1787,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::at(Lsn(0x50)),
+                    Version::Lsn(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1800,7 +1800,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::at(Lsn(0x50)),
+                    Version::Lsn(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1820,7 +1820,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
                .await?,
            2
        );
@@ -1829,7 +1829,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x60)),
+                    Version::Lsn(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1842,7 +1842,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::at(Lsn(0x60)),
+                    Version::Lsn(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1854,7 +1854,7 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1863,7 +1863,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::at(Lsn(0x50)),
+                    Version::Lsn(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1880,7 +1880,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
                .await?,
            0
        );
@@ -1893,7 +1893,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
                .await?,
            2
        );
@@ -1902,7 +1902,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::at(Lsn(0x70)),
+                    Version::Lsn(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1915,7 +1915,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::at(Lsn(0x70)),
+                    Version::Lsn(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1932,7 +1932,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
                .await?,
            1501
        );
@@ -1942,7 +1942,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blk,
-                        Version::at(Lsn(0x80)),
+                        Version::Lsn(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -1956,7 +1956,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1500,
-                    Version::at(Lsn(0x80)),
+                    Version::Lsn(Lsn(0x80)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1990,13 +1990,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            1
        );
@@ -2011,7 +2011,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
                .await?,
            false
        );
@@ -2029,13 +2029,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
                .await?,
            1
        );
@@ -2077,26 +2077,26 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
                .await?,
            relsize
        );
@@ -2110,7 +2110,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::at(lsn),
+                        Version::Lsn(lsn),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2131,7 +2131,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
                .await?,
            1
        );
@@ -2144,7 +2144,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::at(Lsn(0x60)),
+                        Version::Lsn(Lsn(0x60)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2157,7 +2157,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
                .await?,
            relsize
        );
@@ -2169,7 +2169,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::at(Lsn(0x50)),
+                        Version::Lsn(Lsn(0x50)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2193,13 +2193,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
                .await?,
            relsize
        );
@@ -2212,7 +2212,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::at(Lsn(0x80)),
+                        Version::Lsn(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2250,7 +2250,7 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE + 1
        );
@@ -2264,7 +2264,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE
        );
@@ -2279,7 +2279,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE - 1
        );
@@ -2297,7 +2297,7 @@ mod tests {
            m.commit(&ctx).await?;
            assert_eq!(
                tline
-                    .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
+                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
                    .await?,
                size as BlockNumber
            );
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

 #define InvalidRelFileNumber InvalidOid

-#define SMgrRelGetRelInfo(reln)				\
+#define SMgrRelGetRelInfo(reln) \
 	(reln->smgr_rnode.node)

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,12 +148,6 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

-#define NRelFileInfoInvalidate(rinfo) do { \
-		NInfoGetSpcOid(rinfo) = InvalidOid; \
-		NInfoGetDbOid(rinfo) = InvalidOid; \
-		NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
-	} while (0)
-
 #if PG_MAJORVERSION_NUM < 17
 #define ProcNumber BackendId
 #define INVALID_PROC_NUMBER InvalidBackendId
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -108,7 +108,7 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;

-static NRelFileInfo unlogged_build_rel_info;
+static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,14 +912,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
-			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdextend(reln, forkNum, blkno, buffer, skipFsync);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1006,14 +1000,8 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
-			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1388,14 +1376,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
-			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdread(reln, forkNum, blkno, buffer);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1481,14 +1463,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
-			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdreadv(reln, forknum, blocknum, buffers, nblocks);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1621,15 +1597,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-#if PG_MAJORVERSION_NUM >= 17
-				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
-#else
-				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
-#endif
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1699,11 +1666,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1744,10 +1706,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				return mdnblocks(reln, forknum);
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1817,11 +1775,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
-			{
-				mdtruncate(reln, forknum, old_blocks, nblocks);
-				return;
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1960,6 +1913,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
+	Assert(unlogged_build_rel == NULL);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1976,7 +1930,7 @@ neon_start_unlogged_build(SMgrRelation reln)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel_info = InfoFromSMgrRel(reln);
+			unlogged_build_rel = reln;
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 #ifdef DEBUG_COMPARE_LOCAL
 			if (!IsParallelWorker())
@@ -1997,9 +1951,12 @@ neon_start_unlogged_build(SMgrRelation reln)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
 #endif

-	unlogged_build_rel_info = InfoFromSMgrRel(reln);
+	unlogged_build_rel = reln;
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;

+	/* Make the relation look like it's unlogged */
+	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
+
 	/*
 	 * Create the local file. In a parallel build, the leader is expected to
 	 * call this first and do it.
@@ -2026,16 +1983,17 @@ neon_start_unlogged_build(SMgrRelation reln)
 static void
 neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
-	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
+	Assert(unlogged_build_rel == reln);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt((unlogged_build_rel_info)))));
+					RelFileInfoFmt(InfoFromSMgrRel(reln)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;

 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
+	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 	/*
 	 * In a parallel build, (only) the leader process performs the 2nd
@@ -2043,7 +2001,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		NRelFileInfoInvalidate(unlogged_build_rel_info);
+		unlogged_build_rel = NULL;
 		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 	}
 	else
@@ -2064,11 +2022,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);

-	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
+	Assert(unlogged_build_rel == reln);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(unlogged_build_rel_info))));
+					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2076,6 +2034,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		BlockNumber nblocks;

 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
+		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 		/*
 		 * Update the last-written LSN cache.
@@ -2096,6 +2055,9 @@ neon_end_unlogged_build(SMgrRelation reln)
 								InfoFromNInfoB(rinfob),
 								MAIN_FORKNUM);

+		/* Make the relation look permanent again */
+		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
+
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
@@ -2116,7 +2078,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 	}
-	NRelFileInfoInvalidate(unlogged_build_rel_info);
+	unlogged_build_rel = NULL;
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }

@@ -2189,7 +2151,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			NRelFileInfoInvalidate(unlogged_build_rel_info);
+			unlogged_build_rel = NULL;
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 			break;

@@ -2201,7 +2163,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				NRelFileInfoInvalidate(unlogged_build_rel_info);
+				unlogged_build_rel = NULL;
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1145,19 +1145,18 @@ dotenv = ["python-dotenv"]

 [[package]]
 name = "flask-cors"
-version = "6.0.0"
-description = "A Flask extension simplifying CORS support"
+version = "5.0.0"
+description = "A Flask extension adding a decorator for CORS support"
 optional = false
-python-versions = "<4.0,>=3.9"
+python-versions = "*"
 groups = ["main"]
 files = [
-    {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
-    {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
+    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
+    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
 ]

 [package.dependencies]
-flask = ">=0.9"
-Werkzeug = ">=0.7"
+Flask = ">=0.9"

 [[package]]
 name = "frozenlist"
@@ -3170,24 +3169,19 @@ pbr = "*"

 [[package]]
 name = "setuptools"
-version = "78.1.1"
+version = "70.0.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
-python-versions = ">=3.9"
+python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
-    {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
+    {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
+    {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
 ]

 [package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
-core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
-type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]

 [[package]]
 name = "six"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -73,6 +73,7 @@ rustc-hash.workspace = true
 rustls.workspace = true
 rustls-native-certs.workspace = true
 rustls-pemfile.workspace = true
+ryu = "1"
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
@@ -127,4 +128,3 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
-tracing-test = "0.2"
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -80,22 +80,10 @@ impl std::fmt::Display for Backend<'_, ()> {
                    .field(&endpoint.url())
                    .finish(),
                #[cfg(any(test, feature = "testing"))]
-                ControlPlaneClient::PostgresMock(endpoint) => {
-                    let url = endpoint.url();
-                    match url::Url::parse(url) {
-                        Ok(mut url) => {
-                            let _ = url.set_password(Some("_redacted_"));
-                            let url = url.as_str();
-                            fmt.debug_tuple("ControlPlane::PostgresMock")
-                                .field(&url)
-                                .finish()
-                        }
-                        Err(_) => fmt
-                            .debug_tuple("ControlPlane::PostgresMock")
-                            .field(&url)
-                            .finish(),
-                    }
-                }
+                ControlPlaneClient::PostgresMock(endpoint) => fmt
+                    .debug_tuple("ControlPlane::PostgresMock")
+                    .field(&endpoint.url())
+                    .finish(),
                #[cfg(test)]
                ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
            },
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -1,13 +1,9 @@
-#[cfg(any(test, feature = "testing"))]
-use std::env;
 use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::pin::pin;
 use std::sync::Arc;
 use std::time::Duration;

-#[cfg(any(test, feature = "testing"))]
-use anyhow::Context;
 use anyhow::{bail, ensure};
 use arc_swap::ArcSwapOption;
 use futures::future::Either;
@@ -39,8 +35,6 @@ use crate::scram::threadpool::ThreadPool;
 use crate::serverless::GlobalConnPoolOptions;
 use crate::serverless::cancel_set::CancelSet;
 use crate::tls::client_config::compute_client_config_with_root_certs;
-#[cfg(any(test, feature = "testing"))]
-use crate::url::ApiUrl;
 use crate::{auth, control_plane, http, serverless, usage_metrics};

 project_git_version!(GIT_VERSION);
@@ -167,11 +161,8 @@ struct ProxyCliArgs {
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
-    #[clap(long, default_value_t = 1024)]
+    #[clap(long, default_value = "1024")]
    cancellation_ch_size: usize,
-    /// Cancellation ops batch size for redis
-    #[clap(long, default_value_t = 8)]
-    cancellation_batch_size: usize,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
@@ -551,12 +542,7 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(
-                        &mut redis_kv_client,
-                        rx_cancel,
-                        args.cancellation_batch_size,
-                    )
-                    .await?;
+                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;

                    drop(redis_kv_client);

@@ -783,13 +769,7 @@ fn build_auth_backend(

        #[cfg(any(test, feature = "testing"))]
        AuthBackendType::Postgres => {
-            let mut url: ApiUrl = args.auth_endpoint.parse()?;
-            if url.password().is_none() {
-                let password = env::var("PGPASSWORD")
-                    .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
-                url.set_password(Some(&password))
-                    .expect("Failed to set password");
-            }
+            let url = args.auth_endpoint.parse()?;
            let api = control_plane::client::mock::MockControlPlane::new(
                url,
                !args.is_private_access_proxy,
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -30,6 +30,8 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
 type IpSubnetKey = IpNet;

 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
+const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
+const BATCH_SIZE: usize = 8;

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -229,13 +231,12 @@ impl CancelReplyOp {
 pub async fn handle_cancel_messages(
    client: &mut RedisKVClient,
    mut rx: mpsc::Receiver<CancelKeyOp>,
-    batch_size: usize,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::with_capacity(batch_size);
-    let mut pipeline = Pipeline::with_capacity(batch_size);
+    let mut batch = Vec::with_capacity(BATCH_SIZE);
+    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);

    loop {
-        if rx.recv_many(&mut batch, batch_size).await == 0 {
+        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
            warn!("shutting down cancellation queue");
            break Ok(());
        }
@@ -366,7 +367,8 @@ impl CancellationHandler {
            return Err(CancelError::InternalError);
        };

-        tx.try_send(op)
+        tx.send_timeout(op, REDIS_SEND_TIMEOUT)
+            .await
            .map_err(|e| {
                tracing::warn!("failed to send GetCancelData for {key}: {e}");
            })
@@ -568,7 +570,7 @@ impl Session {
    }

    // Send the store key op to the cancellation handler and set TTL for the key
-    pub(crate) fn write_cancel_key(
+    pub(crate) async fn write_cancel_key(
        &self,
        cancel_closure: CancelClosure,
    ) -> Result<(), CancelError> {
@@ -594,14 +596,14 @@ impl Session {
            expire: CANCEL_KEY_TTL,
        };

-        let _ = tx.try_send(op).map_err(|e| {
+        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
        });
        Ok(())
    }

-    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
+    pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
@@ -617,7 +619,7 @@ impl Session {
                .guard(RedisMsgKind::HDel),
        };

-        let _ = tx.try_send(op).map_err(|e| {
+        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
        });
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -244,7 +244,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session.write_cancel_key(node.cancel_closure.clone())?;
+    session
+        .write_cancel_key(node.cancel_closure.clone())
+        .await?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/logging/json.rs
+++ b/proxy/src/logging/json.rs
@@ -0,0 +1,356 @@
+//! Vendoring of serde_json's string escaping code.
+//!
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
+//! Licensed by David Tolnay under MIT or Apache-2.0.
+//!
+//! With modifications by Conrad Ludgate on behalf of Neon.
+
+use std::fmt::{self, Write};
+
+use serde_json::ser::CharEscape;
+
+/// Single-use writer for one JSON value, appending to a borrowed byte buffer.
+///
+/// Every writing method consumes `self`, so each slot is written at most once.
+#[must_use]
+pub struct ValueSer<'buf> {
+    buf: &'buf mut Vec<u8>,
+}
+
+impl<'buf> ValueSer<'buf> {
+    /// Creates a value slot that appends to `buf`.
+    pub fn new(buf: &'buf mut Vec<u8>) -> Self {
+        ValueSer { buf }
+    }
+
+    /// Appends the bytes of an already-encoded value unchanged.
+    #[inline]
+    pub fn serialize(self, value: &SerializedValue) {
+        let SerializedValue(encoded) = value;
+        self.buf.extend_from_slice(encoded);
+    }
+
+    /// Writes `s` as a quoted, escaped JSON string.
+    #[inline]
+    pub fn str(self, s: &str) {
+        let Self { buf } = self;
+        format_escaped_str(buf, s);
+    }
+
+    /// Writes the formatted output of `s` as a quoted, escaped JSON string.
+    #[inline]
+    pub fn str_args(self, s: fmt::Arguments) {
+        let Self { buf } = self;
+        format_escaped_display(buf, s);
+    }
+
+    /// Writes `s` as a JSON string holding its `{:x?}` (lower-hex debug) rendering.
+    #[inline]
+    pub fn bytes_hex(self, s: &[u8]) {
+        format_escaped_display(self.buf, format_args!("{s:x?}"));
+    }
+
+    /// Writes an integer in decimal.
+    #[inline]
+    pub fn int(self, x: impl itoa::Integer) {
+        let Self { buf } = self;
+        write_int(x, buf);
+    }
+
+    /// Writes a floating point number via `write_float`.
+    #[inline]
+    pub fn float(self, x: impl ryu::Float) {
+        let Self { buf } = self;
+        write_float(x, buf);
+    }
+
+    /// Writes the literal `true` or `false`.
+    #[inline]
+    pub fn bool(self, x: bool) {
+        let literal = match x {
+            true => "true",
+            false => "false",
+        };
+        self.buf.extend_from_slice(literal.as_bytes());
+    }
+
+    /// Opens a JSON object in this slot; it is closed when the `MapSer` drops.
+    #[inline]
+    pub fn map(self) -> MapSer<'buf> {
+        let Self { buf } = self;
+        MapSer::new(buf)
+    }
+
+    /// Opens a JSON array in this slot; it is closed when the `ListSer` drops.
+    #[inline]
+    #[expect(unused)]
+    pub fn list(self) -> ListSer<'buf> {
+        let Self { buf } = self;
+        ListSer::new(buf)
+    }
+}
+
+/// Writer for a JSON object. `{` is pushed on construction and `}` on drop.
+pub struct MapSer<'buf> {
+    buf: &'buf mut Vec<u8>,
+    // true until the first entry has been started; controls comma placement.
+    first: bool,
+}
+
+impl<'buf> MapSer<'buf> {
+    /// Opens the object by pushing `{`.
+    #[inline]
+    fn new(buf: &'buf mut Vec<u8>) -> Self {
+        buf.push(b'{');
+        MapSer { buf, first: true }
+    }
+
+    /// Starts an entry whose key is known to need no escaping.
+    #[inline]
+    pub fn entry(&mut self, key: Escaped) -> ValueSer<'_> {
+        self.entry_inner(move |out| key.write(out))
+    }
+
+    /// Starts an entry, escaping `key` as a JSON string.
+    #[inline]
+    pub fn entry_escape(&mut self, key: &str) -> ValueSer<'_> {
+        self.entry_inner(move |out| format_escaped_str(out, key))
+    }
+
+    /// Starts an entry whose key is the escaped output of `key`.
+    #[inline]
+    pub fn entry_escape_args(&mut self, key: fmt::Arguments) -> ValueSer<'_> {
+        self.entry_inner(move |out| format_escaped_display(out, key))
+    }
+
+    /// Shared entry logic: separator, key (written by `f`), `:`, then the value slot.
+    #[inline]
+    fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
+        // Every entry except the first is preceded by a comma.
+        let was_first = std::mem::replace(&mut self.first, false);
+        if !was_first {
+            self.buf.push(b',');
+        }
+
+        f(self.buf);
+        self.buf.push(b':');
+
+        ValueSer { buf: self.buf }
+    }
+}
+
+impl Drop for MapSer<'_> {
+    /// Closes the object by pushing `}`.
+    fn drop(&mut self) {
+        self.buf.push(b'}');
+    }
+}
+
+/// Writer for a JSON array. `[` is pushed on construction and `]` on drop.
+pub struct ListSer<'buf> {
+    buf: &'buf mut Vec<u8>,
+    // true until the first element has been started; controls comma placement.
+    first: bool,
+}
+
+impl<'buf> ListSer<'buf> {
+    /// Opens the array by pushing `[`.
+    #[inline]
+    fn new(buf: &'buf mut Vec<u8>) -> Self {
+        buf.push(b'[');
+        ListSer { buf, first: true }
+    }
+
+    /// Starts the next element, pushing a comma before all but the first.
+    #[expect(unused)]
+    #[inline]
+    fn entry(&mut self) -> ValueSer<'_> {
+        let was_first = std::mem::replace(&mut self.first, false);
+        if !was_first {
+            self.buf.push(b',');
+        }
+        ValueSer { buf: self.buf }
+    }
+}
+
+impl Drop for ListSer<'_> {
+    /// Closes the array by pushing `]`.
+    fn drop(&mut self) {
+        self.buf.push(b']');
+    }
+}
+
+/// An owned, already-encoded JSON value.
+///
+/// The bytes are copied verbatim into the output by `ValueSer::serialize`.
+#[derive(Clone)]
+pub struct SerializedValue(Box<[u8]>);
+
+impl SerializedValue {
+    /// Encodes `s` as a quoted, escaped JSON string.
+    #[inline]
+    pub fn str(s: &str) -> Self {
+        let mut v = vec![];
+        // Two quotes plus the payload; escapes may still force a regrow.
+        v.reserve_exact(2 + s.len());
+        format_escaped_str(&mut v, s);
+        Self(v.into_boxed_slice())
+    }
+
+    /// Encodes the formatted output of `s` as a quoted, escaped JSON string.
+    #[inline]
+    pub fn str_args(s: fmt::Arguments) -> Self {
+        // A plain literal needs no formatting machinery, and its length is known.
+        if let Some(s) = s.as_str() {
+            return Self::str(s);
+        }
+
+        let mut v = vec![];
+        format_escaped_display(&mut v, s);
+        Self(v.into_boxed_slice())
+    }
+
+    /// Encodes `s` as a JSON string holding its `{:x?}` (lower-hex debug) rendering.
+    #[inline]
+    pub fn bytes_hex(s: &[u8]) -> Self {
+        Self::str_args(format_args!("{s:x?}"))
+    }
+
+    /// Encodes an integer in decimal.
+    #[inline]
+    pub fn int(x: impl itoa::Integer) -> Self {
+        Self(itoa::Buffer::new().format(x).as_bytes().into())
+    }
+
+    /// Encodes a floating point number; non-finite values become `null`.
+    #[inline]
+    pub fn float(x: impl ryu::Float) -> Self {
+        let mut scratch = ryu::Buffer::new();
+        // ryu renders non-finite values as `NaN`, `inf` and `-inf`, none of
+        // which is a valid JSON token. Emit `null` for those instead.
+        let text = match scratch.format(x) {
+            "NaN" | "inf" | "-inf" => "null",
+            finite => finite,
+        };
+        Self(text.as_bytes().into())
+    }
+
+    /// Encodes the literal `true` or `false`.
+    #[inline]
+    pub fn bool(x: bool) -> Self {
+        let bool = if x { "true" } else { "false" };
+        Self(bool.as_bytes().into())
+    }
+}
+
+/// A static string verified to contain no byte that JSON requires escaping,
+/// so it can be emitted between quotes as-is.
+#[derive(Clone, Copy)]
+pub struct Escaped(&'static str);
+
+impl Escaped {
+    /// Wraps `s`, panicking if any byte has a non-zero entry in `ESCAPE`.
+    ///
+    /// As a `const fn`, this check fails the build when evaluated in a const context.
+    pub const fn new(s: &'static str) -> Self {
+        let bytes = s.as_bytes();
+        let mut idx = 0;
+        while idx < bytes.len() {
+            assert!(
+                ESCAPE[bytes[idx] as usize] == 0,
+                "const json string should not need escaping"
+            );
+            idx += 1;
+        }
+
+        Escaped(s)
+    }
+
+    /// Returns the wrapped string without quotes.
+    pub fn as_str(self) -> &'static str {
+        let Escaped(raw) = self;
+        raw
+    }
+
+    /// Appends the string to `buf`, surrounded by double quotes.
+    fn write(self, buf: &mut Vec<u8>) {
+        let Escaped(raw) = self;
+        buf.push(b'"');
+        buf.extend_from_slice(raw.as_bytes());
+        buf.push(b'"');
+    }
+}
+
+/// Appends the decimal representation of `x` to `b`.
+fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
+    let mut scratch = itoa::Buffer::new();
+    let digits = scratch.format(x);
+    b.extend_from_slice(digits.as_bytes());
+}
+
+/// Appends `x` to `b` as a JSON number, or as `null` when `x` is not finite.
+fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
+    let mut scratch = ryu::Buffer::new();
+    // ryu renders non-finite values as `NaN`, `inf` and `-inf`, none of which
+    // is a valid JSON token. Emit `null` for those instead.
+    let text = match scratch.format(x) {
+        "NaN" | "inf" | "-inf" => "null",
+        finite => finite,
+    };
+    b.extend_from_slice(text.as_bytes());
+}
+
+/// Maps a non-zero `ESCAPE` table entry to its `CharEscape` variant.
+///
+/// `byte` is the original input byte; only the `UU` arm uses it.
+/// Panics via `unreachable!` for any other `escape` value.
+#[inline]
+fn char_escape_from_escape_table(escape: u8, byte: u8) -> CharEscape {
+    match escape {
+        self::QU => CharEscape::Quote,
+        self::BS => CharEscape::ReverseSolidus,
+        self::NN => CharEscape::LineFeed,
+        self::RR => CharEscape::CarriageReturn,
+        self::TT => CharEscape::Tab,
+        self::BB => CharEscape::Backspace,
+        self::FF => CharEscape::FormFeed,
+        self::UU => CharEscape::AsciiControl(byte),
+        _ => unreachable!(),
+    }
+}
+
+/// Appends `value` to `writer` as a double-quoted, escaped JSON string.
+fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
+    writer.push(b'"');
+
+    // The helper writes everything up to the last escape and hands back the
+    // unescaped tail, which still has to be copied.
+    let tail = format_escaped_str_contents(writer, value);
+    writer.extend_from_slice(tail);
+
+    writer.push(b'"');
+}
+
+/// Appends the formatted output of `args` to `writer` as a double-quoted,
+/// escaped JSON string.
+fn format_escaped_display(writer: &mut Vec<u8>, args: fmt::Arguments) {
+    writer.push(b'"');
+
+    match args.as_str() {
+        // A plain literal: escape it directly without the formatter.
+        Some(literal) => {
+            let tail = format_escaped_str_contents(writer, literal);
+            writer.extend_from_slice(tail);
+        }
+        // Otherwise escape each fragment as the formatter emits it.
+        None => {
+            let mut sink = Collect { buf: writer };
+            sink.write_fmt(args).expect("formatting should not error");
+        }
+    }
+
+    writer.push(b'"');
+}
+
+/// `fmt::Write` adapter that JSON-escapes each fragment into a byte buffer.
+struct Collect<'buf> {
+    buf: &'buf mut Vec<u8>,
+}
+
+impl fmt::Write for Collect<'_> {
+    /// Escapes `s` into the buffer; always returns `Ok`.
+    fn write_str(&mut self, s: &str) -> fmt::Result {
+        let tail = format_escaped_str_contents(self.buf, s);
+        self.buf.extend_from_slice(tail);
+        Ok(())
+    }
+}
+
+// Copies `value` into `writer` up to and including its last escaped byte,
+// replacing each such byte with its escape sequence. Returns the trailing
+// run that needed no escaping; the caller is expected to append it.
+fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
+    let bytes = value.as_bytes();
+
+    // Index of the first byte not yet copied to `writer`.
+    let mut flushed = 0;
+
+    for (idx, &current) in bytes.iter().enumerate() {
+        let code = ESCAPE[usize::from(current)];
+        if code != 0 {
+            // Flush the clean run before this byte, then emit its escape.
+            writer.extend_from_slice(&bytes[flushed..idx]);
+            write_char_escape(writer, char_escape_from_escape_table(code, current));
+            flushed = idx + 1;
+        }
+    }
+
+    &bytes[flushed..]
+}
+
+const BB: u8 = b'b'; // \x08 -> "\b" (backspace)
+const TT: u8 = b't'; // \x09 -> "\t" (tab)
+const NN: u8 = b'n'; // \x0A -> "\n" (line feed)
+const FF: u8 = b'f'; // \x0C -> "\f" (form feed)
+const RR: u8 = b'r'; // \x0D -> "\r" (carriage return)
+const QU: u8 = b'"'; // \x22 -> "\"" (double quote)
+const BS: u8 = b'\\'; // \x5C -> "\\" (backslash)
+const UU: u8 = b'u'; // \x00...\x1F except the ones above -> "\u00XX"
+const __: u8 = 0; // byte is copied through without escaping
+
+// Lookup table of escape sequences, indexed by byte value. A value of b'x' at
+// index i means that byte i is escaped as "\x" in JSON (UU means "\u00XX").
+// A value of 0 means that byte i is not escaped.
+static ESCAPE: [u8; 256] = [
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   (low nibble; row label = high nibble)
+    UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
+    UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
+    __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
+    __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
+];
+
+/// Appends the JSON escape sequence for `char_escape` to `writer`.
+///
+/// Named escapes are two bytes (`\` plus a letter or symbol); control bytes
+/// are written as `\u00XX` with lowercase hex digits.
+fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
+    let short: &[u8; 2] = match char_escape {
+        CharEscape::AsciiControl(byte) => {
+            static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
+            let high = HEX_DIGITS[usize::from(byte >> 4)];
+            let low = HEX_DIGITS[usize::from(byte & 0xF)];
+            writer.extend_from_slice(&[b'\\', b'u', b'0', b'0', high, low]);
+            return;
+        }
+        CharEscape::Quote => b"\\\"",
+        CharEscape::ReverseSolidus => b"\\\\",
+        CharEscape::Solidus => b"\\/",
+        CharEscape::Backspace => b"\\b",
+        CharEscape::FormFeed => b"\\f",
+        CharEscape::LineFeed => b"\\n",
+        CharEscape::CarriageReturn => b"\\r",
+        CharEscape::Tab => b"\\t",
+    };
+
+    writer.extend_from_slice(short);
+}
--- a/proxy/src/logging/mod.rs
+++ b/proxy/src/logging/mod.rs
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -67,7 +67,6 @@ where
    }
 }

-#[tracing::instrument(skip_all)]
 pub async fn copy_bidirectional_client_compute<Client, Compute>(
    client: &mut Client,
    compute: &mut Compute,
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -383,7 +383,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session.write_cancel_key(node.cancel_closure.clone())?;
+    session
+        .write_cancel_key(node.cancel_closure.clone())
+        .await?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -13,7 +13,6 @@ use crate::stream::Stream;
 use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS};

 /// Forward bytes in both directions (client <-> compute).
-#[tracing::instrument(skip_all)]
 pub(crate) async fn proxy_pass(
    client: impl AsyncRead + AsyncWrite + Unpin,
    compute: impl AsyncRead + AsyncWrite + Unpin,
@@ -94,7 +93,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
        }

-        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
+        drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error

        res
    }
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
        use postgres_client::error::SqlState;
        // Here are errors that happens after the user successfully authenticated to the database.
        // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
-        let non_retriable_pg_errors = matches!(
+        !matches!(
            self.code(),
            &SqlState::TOO_MANY_CONNECTIONS
                | &SqlState::OUT_OF_MEMORY
@@ -56,20 +56,8 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
                | &SqlState::T_R_SERIALIZATION_FAILURE
                | &SqlState::INVALID_CATALOG_NAME
                | &SqlState::INVALID_SCHEMA_NAME
-                | &SqlState::INVALID_PARAMETER_VALUE,
-        );
-        if non_retriable_pg_errors {
-            return false;
-        }
-        // PGBouncer errors that should not trigger a wake_compute retry.
-        if self.code() == &SqlState::PROTOCOL_VIOLATION {
-            // Source for the error message:
-            // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
-            return !self
-                .message()
-                .contains("no more connections allowed (max_client_conn)");
-        }
-        true
+                | &SqlState::INVALID_PARAMETER_VALUE
+        )
    }
 }

@@ -122,55 +110,3 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati
        .base_delay
        .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
 }
-
-#[cfg(test)]
-mod tests {
-    use super::ShouldRetryWakeCompute;
-    use postgres_client::error::{DbError, SqlState};
-
-    #[test]
-    fn should_retry_wake_compute_for_db_error() {
-        // These SQLStates should NOT trigger a wake_compute retry.
-        let non_retry_states = [
-            SqlState::TOO_MANY_CONNECTIONS,
-            SqlState::OUT_OF_MEMORY,
-            SqlState::SYNTAX_ERROR,
-            SqlState::T_R_SERIALIZATION_FAILURE,
-            SqlState::INVALID_CATALOG_NAME,
-            SqlState::INVALID_SCHEMA_NAME,
-            SqlState::INVALID_PARAMETER_VALUE,
-        ];
-        for state in non_retry_states {
-            let err = DbError::new_test_error(state.clone(), "oops".to_string());
-            assert!(
-                !err.should_retry_wake_compute(),
-                "State {state:?} unexpectedly retried"
-            );
-        }
-
-        // Errors coming from pgbouncer should not trigger a wake_compute retry
-        let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
-        for error in non_retry_pgbouncer_errors {
-            let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
-            assert!(
-                !err.should_retry_wake_compute(),
-                "PGBouncer error {error:?} unexpectedly retried"
-            );
-        }
-
-        // These SQLStates should trigger a wake_compute retry.
-        let retry_states = [
-            SqlState::CONNECTION_FAILURE,
-            SqlState::CONNECTION_EXCEPTION,
-            SqlState::CONNECTION_DOES_NOT_EXIST,
-            SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-        ];
-        for state in retry_states {
-            let err = DbError::new_test_error(state.clone(), "oops".to_string());
-            assert!(
-                err.should_retry_wake_compute(),
-                "State {state:?} unexpectedly skipped retry"
-            );
-        }
-    }
-}
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -15,7 +15,6 @@ use rstest::rstest;
 use rustls::crypto::ring;
 use rustls::pki_types;
 use tokio::io::DuplexStream;
-use tracing_test::traced_test;

 use super::connect_compute::ConnectMechanism;
 use super::retry::CouldRetry;
@@ -382,14 +381,8 @@ enum ConnectAction {
    WakeFail,
    WakeRetry,
    Connect,
-    // connect_once -> Err, could_retry = true, should_retry_wake_compute = true
    Retry,
-    // connect_once -> Err, could_retry = true, should_retry_wake_compute = false
-    RetryNoWake,
-    // connect_once -> Err, could_retry = false, should_retry_wake_compute = true
    Fail,
-    // connect_once -> Err, could_retry = false, should_retry_wake_compute = false
-    FailNoWake,
 }

 #[derive(Clone)]
@@ -431,7 +424,6 @@ struct TestConnection;
 #[derive(Debug)]
 struct TestConnectError {
    retryable: bool,
-    wakeable: bool,
    kind: crate::error::ErrorKind,
 }

@@ -456,7 +448,7 @@ impl CouldRetry for TestConnectError {
 }
 impl ShouldRetryWakeCompute for TestConnectError {
    fn should_retry_wake_compute(&self) -> bool {
-        self.wakeable
+        true
    }
 }

@@ -479,22 +471,10 @@ impl ConnectMechanism for TestConnectMechanism {
            ConnectAction::Connect => Ok(TestConnection),
            ConnectAction::Retry => Err(TestConnectError {
                retryable: true,
-                wakeable: true,
-                kind: ErrorKind::Compute,
-            }),
-            ConnectAction::RetryNoWake => Err(TestConnectError {
-                retryable: true,
-                wakeable: false,
                kind: ErrorKind::Compute,
            }),
            ConnectAction::Fail => Err(TestConnectError {
                retryable: false,
-                wakeable: true,
-                kind: ErrorKind::Compute,
-            }),
-            ConnectAction::FailNoWake => Err(TestConnectError {
-                retryable: false,
-                wakeable: false,
                kind: ErrorKind::Compute,
            }),
            x => panic!("expecting action {x:?}, connect is called instead"),
@@ -729,92 +709,3 @@ async fn wake_non_retry() {
        .unwrap_err();
    mechanism.verify();
 }
-
-#[tokio::test]
-#[traced_test]
-async fn fail_but_wake_invalidates_cache() {
-    let ctx = RequestContext::test();
-    let mech = TestConnectMechanism::new(vec![
-        ConnectAction::Wake,
-        ConnectAction::Fail,
-        ConnectAction::Wake,
-        ConnectAction::Connect,
-    ]);
-    let user = helper_create_connect_info(&mech);
-    let cfg = config();
-
-    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
-        .await
-        .unwrap();
-
-    assert!(logs_contain(
-        "invalidating stalled compute node info cache entry"
-    ));
-}
-
-#[tokio::test]
-#[traced_test]
-async fn fail_no_wake_skips_cache_invalidation() {
-    let ctx = RequestContext::test();
-    let mech = TestConnectMechanism::new(vec![
-        ConnectAction::Wake,
-        ConnectAction::FailNoWake,
-        ConnectAction::Connect,
-    ]);
-    let user = helper_create_connect_info(&mech);
-    let cfg = config();
-
-    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
-        .await
-        .unwrap();
-
-    assert!(!logs_contain(
-        "invalidating stalled compute node info cache entry"
-    ));
-}
-
-#[tokio::test]
-#[traced_test]
-async fn retry_but_wake_invalidates_cache() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-
-    let ctx = RequestContext::test();
-    // Wake → Retry (retryable + wakeable) → Wake → Connect
-    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
-    let user_info = helper_create_connect_info(&mechanism);
-    let cfg = config();
-
-    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
-        .await
-        .unwrap();
-    mechanism.verify();
-
-    // Because Retry has wakeable=true, we should see invalidate_cache
-    assert!(logs_contain(
-        "invalidating stalled compute node info cache entry"
-    ));
-}
-
-#[tokio::test]
-#[traced_test]
-async fn retry_no_wake_skips_invalidation() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-
-    let ctx = RequestContext::test();
-    // Wake → RetryNoWake (retryable + NOT wakeable)
-    let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
-    let user_info = helper_create_connect_info(&mechanism);
-    let cfg = config();
-
-    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-
-    // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
-    assert!(!logs_contain(
-        "invalidating stalled compute node info cache entry"
-    ));
-}
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -13,19 +13,22 @@ pub(crate) struct Pbkdf2 {
 // inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
 impl Pbkdf2 {
    pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
-        // key the HMAC and derive the first block in-place
-        let mut hmac =
+        let hmac =
            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-        hmac.update(salt);
-        hmac.update(&1u32.to_be_bytes());
-        let init_block = hmac.finalize_reset().into_bytes();
+
+        let prev = hmac
+            .clone()
+            .chain_update(salt)
+            .chain_update(1u32.to_be_bytes())
+            .finalize()
+            .into_bytes();

        Self {
            hmac,
-            // one iteration spent above
+            // one consumed for the hash above
            iterations: iterations - 1,
-            hi: init_block,
-            prev: init_block,
+            hi: prev,
+            prev,
        }
    }

@@ -41,17 +44,14 @@ impl Pbkdf2 {
            iterations,
        } = self;

-        // only do up to 4096 iterations per turn for fairness
+        // only do 4096 iterations per turn before sharing the thread for fairness
        let n = (*iterations).clamp(0, 4096);
        for _ in 0..n {
-            hmac.update(prev);
-            let block = hmac.finalize_reset().into_bytes();
+            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();

-            for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
-                *hi_byte ^= b;
+            for (hi, prev) in hi.iter_mut().zip(*prev) {
+                *hi ^= prev;
            }
-
-            *prev = block;
        }

        *iterations -= n;
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -14,9 +14,7 @@ use hyper::http::{HeaderName, HeaderValue};
 use hyper::{HeaderMap, Request, Response, StatusCode, header};
 use indexmap::IndexMap;
 use postgres_client::error::{DbError, ErrorPosition, SqlState};
-use postgres_client::{
-    GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
-};
+use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
@@ -1094,10 +1092,12 @@ async fn query_to_json<T: GenericClient>(
    let query_start = Instant::now();

    let query_params = data.params;
-    let mut row_stream = client
-        .query_raw_txt(&data.query, query_params)
-        .await
-        .map_err(SqlOverHttpError::Postgres)?;
+    let mut row_stream = std::pin::pin!(
+        client
+            .query_raw_txt(&data.query, query_params)
+            .await
+            .map_err(SqlOverHttpError::Postgres)?
+    );
    let query_acknowledged = Instant::now();

    // Manually drain the stream into a vector to leave row_stream hanging
@@ -1118,15 +1118,10 @@ async fn query_to_json<T: GenericClient>(
    }

    let query_resp_end = Instant::now();
-    let RowStream {
-        statement,
-        command_tag,
-        status: ready,
-        ..
-    } = row_stream;
+    let ready = row_stream.ready_status();

    // grab the command tag and number of rows affected
-    let command_tag = command_tag.unwrap_or_default();
+    let command_tag = row_stream.command_tag().unwrap_or_default();
    let mut command_tag_split = command_tag.split(' ');
    let command_tag_name = command_tag_split.next().unwrap_or_default();
    let command_tag_count = if command_tag_name == "INSERT" {
@@ -1147,11 +1142,11 @@ async fn query_to_json<T: GenericClient>(
        "finished executing query"
    );

-    let columns_len = statement.columns().len();
+    let columns_len = row_stream.columns().len();
    let mut fields = Vec::with_capacity(columns_len);
    let mut columns = Vec::with_capacity(columns_len);

-    for c in statement.columns() {
+    for c in row_stream.columns() {
        fields.push(json!({
            "name": c.name().to_owned(),
            "dataTypeID": c.type_().oid(),
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -43,12 +43,6 @@ impl std::ops::Deref for ApiUrl {
    }
 }

-impl std::ops::DerefMut for ApiUrl {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
-
 impl std::fmt::Display for ApiUrl {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -184,7 +184,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
-    "pageserver_tenant_offloaded_timelines",
    counter("pageserver_tenant_throttling_count_accounted_start"),
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -103,7 +103,7 @@ class AbstractNeonCli:
            else:
                stdout = ""

-            log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
+            log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
            raise

        indent = "  "
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,7 +187,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
        },
        "rel_size_v2_enabled": True,
-        "relsize_snapshot_cache_capacity": 10000,
        "gc_compaction_enabled": True,
        "gc_compaction_verification": False,
        "gc_compaction_initial_threshold_kb": 1024000,
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -19,16 +19,6 @@ TEST_ROLE_NAMES = [
    {"name": "role$"},
    {"name": "role$$"},
    {"name": "role$x$"},
-    {"name": "x"},
-    {"name": "xx"},
-    {"name": "$x"},
-    {"name": "x$"},
-    {"name": "$x$"},
-    {"name": "xx$"},
-    {"name": "$xx"},
-    {"name": "$xx$"},
-    # 63 bytes is the limit for role/DB names in Postgres
-    {"name": "x" * 63},
 ]

 TEST_DB_NAMES = [
@@ -84,43 +74,6 @@ TEST_DB_NAMES = [
        "name": "db name$x$",
        "owner": "role$x$",
    },
-    {
-        "name": "x",
-        "owner": "x",
-    },
-    {
-        "name": "xx",
-        "owner": "xx",
-    },
-    {
-        "name": "$x",
-        "owner": "$x",
-    },
-    {
-        "name": "x$",
-        "owner": "x$",
-    },
-    {
-        "name": "$x$",
-        "owner": "$x$",
-    },
-    {
-        "name": "xx$",
-        "owner": "xx$",
-    },
-    {
-        "name": "$xx",
-        "owner": "$xx",
-    },
-    {
-        "name": "$xx$",
-        "owner": "$xx$",
-    },
-    # 63 bytes is the limit for role/DB names in Postgres
-    {
-        "name": "x" * 63,
-        "owner": "x" * 63,
-    },
 ]


@@ -193,10 +146,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
    """
    Test that compute_ctl can create and work with databases and roles
    with special characters (whitespaces, %, tabs, etc.) in the name.
-    Also use `drop_subscriptions_before_start: true`. We do not actually
-    have any subscriptions in this test, so it should be no-op, but it
-    i) simulates the case when we create a second dev branch together with
-    a new project creation, and ii) just generally stresses more code paths.
    """
    env = neon_simple_env

@@ -210,7 +159,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
-                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": TEST_ROLE_NAMES,
                    "databases": TEST_DB_NAMES,
@@ -254,7 +202,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
-                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": [],
                    "databases": [],
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -510,7 +510,7 @@ def list_elegible_layers(
        except KeyError:
            # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
            # matches what's on disk.
-            log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+            log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
            raise

    return list(c for c in candidates if is_visible(c))
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    except:
        # On assertion failures, log some details to help with debugging
        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
-        log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
+        log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
        raise

    # Scrub the remote storage
--- a/test_runner/regress/test_replica_start.py
+++ b/test_runner/regress/test_replica_start.py
@@ -27,9 +27,8 @@ from contextlib import closing

 import psycopg2
 import pytest
-from fixtures.common_types import Lsn
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup
+from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
 from fixtures.pg_version import PgVersion
 from fixtures.utils import query_scalar, skip_on_postgres, wait_until

@@ -696,110 +695,3 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
    with secondary.cursor() as secondary_cur:
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (n_restarts,)
-
-
-def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin):
-    env = neon_simple_env
-    endpoint = env.endpoints.create_start("main")
-
-    sql = """
-CREATE TABLE CHAR_TBL(f1 char(4));
-CREATE TABLE FLOAT8_TBL(f1 float8);
-CREATE TABLE INT2_TBL(f1 int2);
-CREATE TABLE INT4_TBL(f1 int4);
-CREATE TABLE INT8_TBL(q1 int8, q2 int8);
-CREATE TABLE POINT_TBL(f1 point);
-CREATE TABLE TEXT_TBL (f1 text);
-CREATE TABLE VARCHAR_TBL(f1 varchar(4));
-CREATE TABLE onek (unique1		int4);
-CREATE TABLE onek2 AS SELECT * FROM onek;
-CREATE TABLE tenk1 (unique1		int4);
-CREATE TABLE tenk2 AS SELECT * FROM tenk1;
-CREATE TABLE person (name text, age int4,location point);
-CREATE TABLE emp (salary int4, manager name) INHERITS (person);
-CREATE TABLE student (gpa float8) INHERITS (person);
-CREATE TABLE stud_emp (	percent 	int4) INHERITS (emp, student);
-CREATE TABLE road (name		text,thepath 	path);
-CREATE TABLE ihighway () INHERITS (road);
-CREATE TABLE shighway(surface		text) INHERITS (road);
-CREATE TABLE BOOLTBL3 (d text, b bool, o int);
-CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool);
-DROP TABLE BOOLTBL3;
-DROP TABLE BOOLTBL4;
-CREATE TABLE ceil_floor_round (a numeric);
-DROP TABLE ceil_floor_round;
-CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
-DROP TABLE width_bucket_test;
-CREATE TABLE num_input_test (n1 numeric);
-CREATE TABLE num_variance (a numeric);
-INSERT INTO num_variance VALUES (0);
-CREATE TABLE snapshot_test (nr	integer, snap	txid_snapshot);
-CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now()));
-CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now()));
-CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
-CREATE INDEX guid1_hash  ON guid1 USING HASH  (guid_field);
-TRUNCATE guid1;
-DROP TABLE guid1;
-DROP TABLE guid2 CASCADE;
-CREATE TABLE numrange_test (nr NUMRANGE);
-CREATE INDEX numrange_test_btree on numrange_test(nr);
-CREATE TABLE numrange_test2(nr numrange);
-CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr);
-INSERT INTO numrange_test2 VALUES('[, 5)');
-CREATE TABLE textrange_test (tr text);
-CREATE INDEX textrange_test_btree on textrange_test(tr);
-CREATE TABLE test_range_gist(ir int4range);
-CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
-DROP INDEX test_range_gist_idx;
-CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
-CREATE TABLE test_range_spgist(ir int4range);
-CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
-DROP INDEX test_range_spgist_idx;
-CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
-CREATE TABLE test_range_elem(i int4);
-CREATE INDEX test_range_elem_idx on test_range_elem (i);
-CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10));
-DROP TABLE test_range_elem;
-CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&));
-CREATE TABLE f_test(f text, i int);
-CREATE TABLE i8r_array (f1 int, f2 text);
-CREATE TYPE arrayrange as range (subtype=int4[]);
-CREATE TYPE two_ints as (a int, b int);
-DROP TYPE two_ints cascade;
-CREATE TABLE text_support_test (t text);
-CREATE TABLE TEMP_FLOAT (f1 FLOAT8);
-CREATE TABLE TEMP_INT4 (f1 INT4);
-CREATE TABLE TEMP_INT2 (f1 INT2);
-CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8);
-CREATE TABLE POLYGON_TBL(f1 polygon);
-CREATE TABLE quad_poly_tbl (id int, p polygon);
-INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y;
-CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl;
-CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl;
-"""
-
-    with endpoint.cursor() as cur:
-        lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-        env.endpoints.create_start(branch_name="main", lsn=lsn)
-        log.info(f"lsn: {lsn}")
-
-        for line in sql.split("\n"):
-            if len(line.strip()) == 0 or line.startswith("--"):
-                continue
-            cur.execute(line)
-
-        lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-        env.endpoints.create_start(branch_name="main", lsn=lsn)
-        log.info(f"lsn: {lsn}")
-
-        cur.execute("VACUUM FULL pg_class;")
-
-    for ep in env.endpoints.endpoints:
-        log.info(f"{ep.endpoint_id} / {ep.pg_port}")
-        pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"]
-        env_vars = {
-            "PGPORT": str(ep.pg_port),
-            "PGUSER": endpoint.default_options["user"],
-            "PGHOST": endpoint.default_options["host"],
-        }
-        pg_bin.run_capture(pg_dump_command, env=env_vars)
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -193,11 +193,6 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
        "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
    )

-    offloaded_count = ps_http.get_metric_value(
-        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
-    )
-    assert offloaded_count == 0
-
    ps_http.timeline_archival_config(
        tenant_id,
        leaf_timeline_id,
@@ -249,11 +244,6 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
    wait_until(leaf_offloaded)
    wait_until(parent_offloaded)

-    offloaded_count = ps_http.get_metric_value(
-        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
-    )
-    assert offloaded_count == 2
-
    # Offloaded child timelines should still prevent deletion
    with pytest.raises(
        PageserverApiException,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.5",
-    "8be779fd3ab9e87206da96a7e4842ef1abf04f44"
+    "e5374b72997b0afc8374137674e873f7a558120a"
  ],
  "v16": [
    "16.9",
-    "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
+    "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
  ],
  "v15": [
    "15.13",
-    "de7640f55da07512834d5cc40c4b3fb376b5f04f"
+    "daa81cffcf063c54b29a9aabdb6604625f675ad0"
  ],
  "v14": [
    "14.18",
-    "55c0d45abe6467c02084c2192bca117eda6ce1e7"
+    "4cca6f8083483dda9e12eae292cf788d45bd561f"
  ]
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -107,7 +107,6 @@ tower = { version = "0.4", default-features = false, features = ["balance", "buf
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
 tracing-log = { version = "0.2" }
-tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }
Author	SHA1	Message	Date
Conrad Ludgate	9041907019	rewrite with custom json serializer	2025-05-18 13:41:43 +02:00
Conrad Ludgate	53fdcd252f	remove locking from extract, use refcell instead	2025-05-17 22:14:26 +02:00
Conrad Ludgate	f5c5b99b58	remove lasso from json logger, use field index for lookup	2025-05-17 22:14:26 +02:00
Conrad Ludgate	ac331090bf	refactor json logging state	2025-05-17 22:14:26 +02:00
Conrad Ludgate	176b5a8978	replace indexset with lasso and linear search	2025-05-17 22:14:26 +02:00
Conrad Ludgate	e0da7dd8e9	use faster hasher than siphash	2025-05-17 22:14:26 +02:00
Conrad Ludgate	547fe38abf	replace papaya with hashmap+lock. this assumes that spans are rarely accessed in parallel	2025-05-17 22:14:26 +02:00
Conrad Ludgate	c06f9635f5	remove tracing instrument on passthrough	2025-05-17 22:14:26 +02:00