fix accidental recursion

refactor statements and the type cache to avoid arcs
delete some more
2026-03-16 23:00:36 +00:00 · 2024-12-06 12:19:40 +00:00 · 2024-12-06 12:01:19 +00:00 · 2024-12-06 11:33:34 +00:00 · 2024-12-06 11:22:03 +00:00 · 2024-12-06 10:24:13 +00:00
37 changed files with 764 additions and 968 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -133,9 +133,9 @@ dependencies = [

 [[package]]
 name = "anyhow"
-version = "1.0.71"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
+checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
 dependencies = [
 "backtrace",
 ]
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -6,6 +6,7 @@
    import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+    import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
    import 'sql_exporter/compute_max_connections.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
    import 'sql_exporter/compute_subscriptions_count.libsonnet',
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
@@ -0,0 +1,7 @@
+SELECT
+  (SELECT current_setting('neon.timeline_id')) AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files, so only names ending in .snap are counted.
+  (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
@@ -0,0 +1,17 @@
+local neon = import 'neon.libsonnet';
+
+local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
+local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
+
+{
+  metric_name: 'compute_logical_snapshots_bytes',
+  type: 'gauge',
+  help: 'Size of the pg_logical/snapshots directory, not including temporary files',
+  key_labels: [
+    'timeline_id',
+  ],
+  values: [
+    'logical_snapshots_bytes',
+  ],
+  query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
+}
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
@@ -0,0 +1,9 @@
+SELECT
+  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files, so only names ending in .snap are counted.
+  (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
+    FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
+  ) AS logical_snapshots_bytes;
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1243,12 +1243,7 @@ impl ComputeNode {
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
        config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;

-        // TODO(ololobus): We need a concurrency during reconfiguration as well,
-        // but DB is already running and used by user. We can easily get out of
-        // `max_connections` limit, and the current code won't handle that.
-        // let compute_state = self.state.lock().unwrap().clone();
-        // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
-        let max_concurrent_connections = 1;
+        let max_concurrent_connections = spec.reconfigure_concurrency;

        // Temporarily reset max_cluster_size in config
        // to avoid the possibility of hitting the limit, while we are reconfiguring:
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -53,6 +53,7 @@ use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use pageserver_api::shard::ShardStripeSize;
+use reqwest::header::CONTENT_TYPE;
 use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -618,6 +619,7 @@ impl Endpoint {
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
            local_proxy_config: None,
+            reconfigure_concurrency: 1,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -817,6 +819,7 @@ impl Endpoint {
                self.http_address.ip(),
                self.http_address.port()
            ))
+            .header(CONTENT_TYPE.as_str(), "application/json")
            .body(format!(
                "{{\"spec\":{}}}",
                serde_json::to_string_pretty(&spec)?
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -19,6 +19,10 @@ pub type PgIdent = String;
 /// String type alias representing Postgres extension version
 pub type ExtVersion = String;

+fn default_reconfigure_concurrency() -> usize {
+    1
+}
+
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

-    /// An optinal hint that can be passed to speed up startup time if we know
+    /// An optional hint that can be passed to speed up startup time if we know
    /// that no pg catalog mutations (like role creation, database creation,
    /// extension creation) need to be done on the actual database to start.
    #[serde(default)] // Default false
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
-
    pub timeline_id: Option<TimelineId>,
-
    pub pageserver_connstring: Option<String>,

    #[serde(default)]
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
    /// Local Proxy configuration used for JWT authentication
    #[serde(default)]
    pub local_proxy_config: Option<LocalProxySpec>,
+
+    /// Number of concurrent connections during the parallel RunInEachDatabase
+    /// phase of the apply config process.
+    ///
+    /// We need a higher concurrency during reconfiguration in case of many DBs,
+    /// but the instance is already running and used by clients. We can easily exceed
+    /// the `max_connections` limit, and the current code won't handle that.
+    ///
+    /// The default is 1, but the control plane may override this value for specific
+    /// projects. It is also recommended to bump `superuser_reserved_connections` by
+    /// `reconfigure_concurrency` for such projects to ensure that we always have
+    /// enough spare connections for the reconfiguration process to succeed.
+    #[serde(default = "default_reconfigure_concurrency")]
+    pub reconfigure_concurrency: usize,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -315,6 +331,9 @@ mod tests {

        // Features list defaults to empty vector.
        assert!(spec.features.is_empty());
+
+        // Reconfigure concurrency defaults to 1.
+        assert_eq!(spec.reconfigure_concurrency, 1);
    }

    #[test]
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -158,7 +158,8 @@ impl ShardIdentity {
        key_to_shard_number(self.count, self.stripe_size, key)
    }

-    /// Return true if the key should be ingested by this shard
+    /// Return true if the key is stored only on this shard. This does not include
+    /// global keys, see is_key_global().
    ///
    /// Shards must ingest _at least_ keys which return true from this check.
    pub fn is_key_local(&self, key: &Key) -> bool {
@@ -171,7 +172,7 @@ impl ShardIdentity {
    }

    /// Return true if the key should be stored on all shards, not just one.
-    fn is_key_global(&self, key: &Key) -> bool {
+    pub fn is_key_global(&self, key: &Key) -> bool {
        if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
            // Special keys that are only stored on shard 0
            false
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -4,23 +4,18 @@ use crate::config::Host;
 use crate::config::SslMode;
 use crate::connection::{Request, RequestMessages};

-use crate::query::RowStream;
-use crate::simple_query::SimpleQueryStream;
-
-use crate::types::{Oid, ToSql, Type};
+use crate::types::{Oid, Type};

 use crate::{
-    prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
-    SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
+    simple_query, CancelToken, Error, ReadyForQueryStatus, Statement, Transaction,
+    TransactionBuilder,
 };
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
-use futures_util::{future, ready, TryStreamExt};
-use parking_lot::Mutex;
+use futures_util::{future, ready};
 use postgres_protocol2::message::{backend::Message, frontend};
 use std::collections::HashMap;
 use std::fmt;
-use std::sync::Arc;
 use std::task::{Context, Poll};
 use tokio::sync::mpsc;

@@ -55,7 +50,7 @@ impl Responses {
 /// A cache of type info and prepared statements for fetching type info
 /// (corresponding to the queries in the [prepare] module).
 #[derive(Default)]
-struct CachedTypeInfo {
+pub(crate) struct CachedTypeInfo {
    /// A statement for basic information for a type from its
    /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
    /// fallback).
@@ -71,13 +66,45 @@ struct CachedTypeInfo {
    /// Cache of types already looked up.
    types: HashMap<Oid, Type>,
 }
+impl CachedTypeInfo {
+    pub(crate) fn typeinfo(&mut self) -> Option<&Statement> {
+        self.typeinfo.as_ref()
+    }
+
+    pub(crate) fn set_typeinfo(&mut self, statement: Statement) -> &Statement {
+        self.typeinfo.insert(statement)
+    }
+
+    pub(crate) fn typeinfo_composite(&mut self) -> Option<&Statement> {
+        self.typeinfo_composite.as_ref()
+    }
+
+    pub(crate) fn set_typeinfo_composite(&mut self, statement: Statement) -> &Statement {
+        self.typeinfo_composite.insert(statement)
+    }
+
+    pub(crate) fn typeinfo_enum(&mut self) -> Option<&Statement> {
+        self.typeinfo_enum.as_ref()
+    }
+
+    pub(crate) fn set_typeinfo_enum(&mut self, statement: Statement) -> &Statement {
+        self.typeinfo_enum.insert(statement)
+    }
+
+    pub(crate) fn type_(&mut self, oid: Oid) -> Option<Type> {
+        self.types.get(&oid).cloned()
+    }
+
+    pub(crate) fn set_type(&mut self, oid: Oid, type_: &Type) {
+        self.types.insert(oid, type_.clone());
+    }
+}

 pub struct InnerClient {
    sender: mpsc::UnboundedSender<Request>,
-    cached_typeinfo: Mutex<CachedTypeInfo>,

    /// A buffer to use when writing out postgres commands.
-    buffer: Mutex<BytesMut>,
+    buffer: BytesMut,
 }

 impl InnerClient {
@@ -92,47 +119,14 @@ impl InnerClient {
        })
    }

-    pub fn typeinfo(&self) -> Option<Statement> {
-        self.cached_typeinfo.lock().typeinfo.clone()
-    }
-
-    pub fn set_typeinfo(&self, statement: &Statement) {
-        self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
-    }
-
-    pub fn typeinfo_composite(&self) -> Option<Statement> {
-        self.cached_typeinfo.lock().typeinfo_composite.clone()
-    }
-
-    pub fn set_typeinfo_composite(&self, statement: &Statement) {
-        self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
-    }
-
-    pub fn typeinfo_enum(&self) -> Option<Statement> {
-        self.cached_typeinfo.lock().typeinfo_enum.clone()
-    }
-
-    pub fn set_typeinfo_enum(&self, statement: &Statement) {
-        self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
-    }
-
-    pub fn type_(&self, oid: Oid) -> Option<Type> {
-        self.cached_typeinfo.lock().types.get(&oid).cloned()
-    }
-
-    pub fn set_type(&self, oid: Oid, type_: &Type) {
-        self.cached_typeinfo.lock().types.insert(oid, type_.clone());
-    }
-
    /// Call the given function with a buffer to be used when writing out
    /// postgres commands.
-    pub fn with_buf<F, R>(&self, f: F) -> R
+    pub fn with_buf<F, R>(&mut self, f: F) -> R
    where
        F: FnOnce(&mut BytesMut) -> R,
    {
-        let mut buffer = self.buffer.lock();
-        let r = f(&mut buffer);
-        buffer.clear();
+        let r = f(&mut self.buffer);
+        self.buffer.clear();
        r
    }
 }
@@ -150,7 +144,8 @@ pub struct SocketConfig {
 /// The client is one half of what is returned when a connection is established. Users interact with the database
 /// through this client object.
 pub struct Client {
-    inner: Arc<InnerClient>,
+    pub(crate) inner: InnerClient,
+    pub(crate) cached_typeinfo: CachedTypeInfo,

    socket_config: SocketConfig,
    ssl_mode: SslMode,
@@ -167,11 +162,11 @@ impl Client {
        secret_key: i32,
    ) -> Client {
        Client {
-            inner: Arc::new(InnerClient {
+            inner: InnerClient {
                sender,
-                cached_typeinfo: Default::default(),
                buffer: Default::default(),
-            }),
+            },
+            cached_typeinfo: Default::default(),

            socket_config,
            ssl_mode,
@@ -185,161 +180,6 @@ impl Client {
        self.process_id
    }

-    pub(crate) fn inner(&self) -> &Arc<InnerClient> {
-        &self.inner
-    }
-
-    /// Creates a new prepared statement.
-    ///
-    /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
-    /// which are set when executed. Prepared statements can only be used with the connection that created them.
-    pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
-        self.prepare_typed(query, &[]).await
-    }
-
-    /// Like `prepare`, but allows the types of query parameters to be explicitly specified.
-    ///
-    /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
-    /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
-    pub async fn prepare_typed(
-        &self,
-        query: &str,
-        parameter_types: &[Type],
-    ) -> Result<Statement, Error> {
-        prepare::prepare(&self.inner, query, parameter_types).await
-    }
-
-    /// Executes a statement, returning a vector of the resulting rows.
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn query<T>(
-        &self,
-        statement: &T,
-        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<Vec<Row>, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
-        self.query_raw(statement, slice_iter(params))
-            .await?
-            .try_collect()
-            .await
-    }
-
-    /// The maximally flexible version of [`query`].
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    ///
-    /// [`query`]: #method.query
-    pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
-    where
-        T: ?Sized + ToStatement,
-        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        let statement = statement.__convert().into_statement(self).await?;
-        query::query(&self.inner, statement, params).await
-    }
-
-    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
-    /// to save a roundtrip
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
-    where
-        S: AsRef<str>,
-        I: IntoIterator<Item = Option<S>>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        query::query_txt(&self.inner, statement, params).await
-    }
-
-    /// Executes a statement, returning the number of rows modified.
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn execute<T>(
-        &self,
-        statement: &T,
-        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
-        self.execute_raw(statement, slice_iter(params)).await
-    }
-
-    /// The maximally flexible version of [`execute`].
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    ///
-    /// [`execute`]: #method.execute
-    pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        let statement = statement.__convert().into_statement(self).await?;
-        query::execute(self.inner(), statement, params).await
-    }
-
-    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
-    ///
-    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
-    /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings,
-    /// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the
-    /// rows, this method returns a list of an enum which indicates either the completion of one of the commands,
-    /// or a row of data. This preserves the framing between the separate statements in the request.
-    ///
-    /// # Warning
-    ///
-    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
-    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
-    /// them to this method!
-    pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
-        self.simple_query_raw(query).await?.try_collect().await
-    }
-
-    pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
-        simple_query::simple_query(self.inner(), query).await
-    }
-
    /// Executes a sequence of SQL statements using the simple query protocol.
    ///
    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
@@ -350,8 +190,8 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
-        simple_query::batch_execute(self.inner(), query).await
+    pub async fn batch_execute(&mut self, query: &str) -> Result<ReadyForQueryStatus, Error> {
+        simple_query::batch_execute(&mut self.inner, query).await
    }

    /// Begins a new database transaction.
@@ -359,7 +199,7 @@ impl Client {
    /// The transaction will roll back by default - use the `commit` method to commit it.
    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
        struct RollbackIfNotDone<'me> {
-            client: &'me Client,
+            client: &'me mut Client,
            done: bool,
        }

@@ -369,13 +209,13 @@ impl Client {
                    return;
                }

-                let buf = self.client.inner().with_buf(|buf| {
+                let buf = self.client.inner.with_buf(|buf| {
                    frontend::query("ROLLBACK", buf).unwrap();
                    buf.split().freeze()
                });
                let _ = self
                    .client
-                    .inner()
+                    .inner
                    .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
            }
        }
@@ -390,7 +230,7 @@ impl Client {
                client: self,
                done: false,
            };
-            self.batch_execute("BEGIN").await?;
+            cleaner.client.batch_execute("BEGIN").await?;
            cleaner.done = true;
        }

@@ -416,11 +256,6 @@ impl Client {
        }
    }

-    /// Query for type information
-    pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
-        crate::prepare::get_type(&self.inner, oid).await
-    }
-
    /// Determines if the connection to the server has already closed.
    ///
    /// In that case, all future queries will fail.
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,4 +1,4 @@
-use crate::query::RowStream;
+use crate::query::{self, RowStream};
 use crate::types::Type;
 use crate::{Client, Error, Transaction};
 use async_trait::async_trait;
@@ -13,33 +13,32 @@ mod private {
 /// This trait is "sealed", and cannot be implemented outside of this crate.
 #[async_trait]
 pub trait GenericClient: private::Sealed {
-    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
        I::IntoIter: ExactSizeIterator + Sync + Send;

    /// Query for type information
-    async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
+    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
 }

 impl private::Sealed for Client {}

 #[async_trait]
 impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
        I::IntoIter: ExactSizeIterator + Sync + Send,
    {
-        self.query_raw_txt(statement, params).await
+        query::query_txt(&mut self.inner, statement, params).await
    }

    /// Query for type information
-    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
-        self.get_type(oid).await
+    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
+        crate::prepare::get_type(&mut self.inner, &mut self.cached_typeinfo, oid).await
    }
 }

@@ -48,17 +47,18 @@ impl private::Sealed for Transaction<'_> {}
 #[async_trait]
 #[allow(clippy::needless_lifetimes)]
 impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
        I::IntoIter: ExactSizeIterator + Sync + Send,
    {
-        self.query_raw_txt(statement, params).await
+        query::query_txt(&mut self.client().inner, statement, params).await
    }

    /// Query for type information
-    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
-        self.client().get_type(oid).await
+    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
+        let client = self.client();
+        crate::prepare::get_type(&mut client.inner, &mut client.cached_typeinfo, oid).await
    }
 }
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -10,11 +10,10 @@ use crate::error::DbError;
 pub use crate::error::Error;
 pub use crate::generic_client::GenericClient;
 pub use crate::query::RowStream;
-pub use crate::row::{Row, SimpleQueryRow};
-pub use crate::simple_query::SimpleQueryStream;
+pub use crate::row::Row;
 pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
-pub use crate::to_statement::ToStatement;
+// pub use crate::to_statement::ToStatement;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
 use crate::types::ToSql;
@@ -65,7 +64,7 @@ pub mod row;
 mod simple_query;
 mod statement;
 pub mod tls;
-mod to_statement;
+// mod to_statement;
 mod transaction;
 mod transaction_builder;
 pub mod types;
@@ -98,7 +97,6 @@ impl Notification {
 /// An asynchronous message from the server.
 #[allow(clippy::large_enum_variant)]
 #[derive(Debug, Clone)]
-#[non_exhaustive]
 pub enum AsyncMessage {
    /// A notice.
    ///
@@ -110,18 +108,6 @@ pub enum AsyncMessage {
    Notification(Notification),
 }

-/// Message returned by the `SimpleQuery` stream.
-#[derive(Debug)]
-#[non_exhaustive]
-pub enum SimpleQueryMessage {
-    /// A row of data.
-    Row(SimpleQueryRow),
-    /// A statement in the query has completed.
-    ///
-    /// The number of rows modified or selected is returned.
-    CommandComplete(u64),
-}
-
 fn slice_iter<'a>(
    s: &'a [&'a (dyn ToSql + Sync)],
 ) -> impl ExactSizeIterator<Item = &'a (dyn ToSql + Sync)> + 'a {
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,4 +1,4 @@
-use crate::client::InnerClient;
+use crate::client::{CachedTypeInfo, InnerClient};
 use crate::codec::FrontendMessage;
 use crate::connection::RequestMessages;
 use crate::error::SqlState;
@@ -7,14 +7,13 @@ use crate::{query, slice_iter};
 use crate::{Column, Error, Statement};
 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
-use futures_util::{pin_mut, TryStreamExt};
+use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use log::debug;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use std::future::Future;
-use std::pin::Pin;
+use std::pin::{pin, Pin};
 use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;

 pub(crate) const TYPEINFO_QUERY: &str = "\
 SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
@@ -59,7 +58,8 @@ ORDER BY attnum
 static NEXT_ID: AtomicUsize = AtomicUsize::new(0);

 pub async fn prepare(
-    client: &Arc<InnerClient>,
+    client: &mut InnerClient,
+    cache: &mut CachedTypeInfo,
    query: &str,
    types: &[Type],
 ) -> Result<Statement, Error> {
@@ -86,7 +86,7 @@ pub async fn prepare(
    let mut parameters = vec![];
    let mut it = parameter_description.parameters();
    while let Some(oid) = it.next().map_err(Error::parse)? {
-        let type_ = get_type(client, oid).await?;
+        let type_ = get_type(client, cache, oid).await?;
        parameters.push(type_);
    }

@@ -94,24 +94,30 @@ pub async fn prepare(
    if let Some(row_description) = row_description {
        let mut it = row_description.fields();
        while let Some(field) = it.next().map_err(Error::parse)? {
-            let type_ = get_type(client, field.type_oid()).await?;
+            let type_ = get_type(client, cache, field.type_oid()).await?;
            let column = Column::new(field.name().to_string(), type_, field);
            columns.push(column);
        }
    }

-    Ok(Statement::new(client, name, parameters, columns))
+    Ok(Statement::new(name, parameters, columns))
 }

 fn prepare_rec<'a>(
-    client: &'a Arc<InnerClient>,
+    client: &'a mut InnerClient,
+    cache: &'a mut CachedTypeInfo,
    query: &'a str,
    types: &'a [Type],
 ) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
-    Box::pin(prepare(client, query, types))
+    Box::pin(prepare(client, cache, query, types))
 }

-fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
+fn encode(
+    client: &mut InnerClient,
+    name: &str,
+    query: &str,
+    types: &[Type],
+) -> Result<Bytes, Error> {
    if types.is_empty() {
        debug!("preparing query {}: {}", name, query);
    } else {
@@ -126,16 +132,20 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu
    })
 }

-pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
+pub async fn get_type(
+    client: &mut InnerClient,
+    cache: &mut CachedTypeInfo,
+    oid: Oid,
+) -> Result<Type, Error> {
    if let Some(type_) = Type::from_oid(oid) {
        return Ok(type_);
    }

-    if let Some(type_) = client.type_(oid) {
+    if let Some(type_) = cache.type_(oid) {
        return Ok(type_);
    }

-    let stmt = typeinfo_statement(client).await?;
+    let stmt = typeinfo_statement(client, cache).await?;

    let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
    pin_mut!(rows);
@@ -145,118 +155,141 @@ pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error
        None => return Err(Error::unexpected_message()),
    };

-    let name: String = row.try_get(0)?;
-    let type_: i8 = row.try_get(1)?;
-    let elem_oid: Oid = row.try_get(2)?;
-    let rngsubtype: Option<Oid> = row.try_get(3)?;
-    let basetype: Oid = row.try_get(4)?;
-    let schema: String = row.try_get(5)?;
-    let relid: Oid = row.try_get(6)?;
+    let name: String = row.try_get(stmt.columns(), 0)?;
+    let type_: i8 = row.try_get(stmt.columns(), 1)?;
+    let elem_oid: Oid = row.try_get(stmt.columns(), 2)?;
+    let rngsubtype: Option<Oid> = row.try_get(stmt.columns(), 3)?;
+    let basetype: Oid = row.try_get(stmt.columns(), 4)?;
+    let schema: String = row.try_get(stmt.columns(), 5)?;
+    let relid: Oid = row.try_get(stmt.columns(), 6)?;

    let kind = if type_ == b'e' as i8 {
-        let variants = get_enum_variants(client, oid).await?;
+        let variants = get_enum_variants(client, cache, oid).await?;
        Kind::Enum(variants)
    } else if type_ == b'p' as i8 {
        Kind::Pseudo
    } else if basetype != 0 {
-        let type_ = get_type_rec(client, basetype).await?;
+        let type_ = get_type_rec(client, cache, basetype).await?;
        Kind::Domain(type_)
    } else if elem_oid != 0 {
-        let type_ = get_type_rec(client, elem_oid).await?;
+        let type_ = get_type_rec(client, cache, elem_oid).await?;
        Kind::Array(type_)
    } else if relid != 0 {
-        let fields = get_composite_fields(client, relid).await?;
+        let fields = get_composite_fields(client, cache, relid).await?;
        Kind::Composite(fields)
    } else if let Some(rngsubtype) = rngsubtype {
-        let type_ = get_type_rec(client, rngsubtype).await?;
+        let type_ = get_type_rec(client, cache, rngsubtype).await?;
        Kind::Range(type_)
    } else {
        Kind::Simple
    };

    let type_ = Type::new(name, oid, kind, schema);
-    client.set_type(oid, &type_);
+    cache.set_type(oid, &type_);

    Ok(type_)
 }

 fn get_type_rec<'a>(
-    client: &'a Arc<InnerClient>,
+    client: &'a mut InnerClient,
+    cache: &'a mut CachedTypeInfo,
    oid: Oid,
 ) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
-    Box::pin(get_type(client, oid))
+    Box::pin(get_type(client, cache, oid))
 }

-async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
-    if let Some(stmt) = client.typeinfo() {
-        return Ok(stmt);
+async fn typeinfo_statement<'c>(
+    client: &mut InnerClient,
+    cache: &'c mut CachedTypeInfo,
+) -> Result<&'c Statement, Error> {
+    if cache.typeinfo().is_some() {
+        // needed to get around a borrow checker limitation
+        return Ok(cache.typeinfo().unwrap());
    }

-    let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
+    let stmt = match prepare_rec(client, cache, TYPEINFO_QUERY, &[]).await {
        Ok(stmt) => stmt,
        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
-            prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
+            prepare_rec(client, cache, TYPEINFO_FALLBACK_QUERY, &[]).await?
        }
        Err(e) => return Err(e),
    };

-    client.set_typeinfo(&stmt);
-    Ok(stmt)
+    Ok(cache.set_typeinfo(stmt))
 }

-async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
-    let stmt = typeinfo_enum_statement(client).await?;
+async fn get_enum_variants(
+    client: &mut InnerClient,
+    cache: &mut CachedTypeInfo,
+    oid: Oid,
+) -> Result<Vec<String>, Error> {
+    let stmt = typeinfo_enum_statement(client, cache).await?;

-    query::query(client, stmt, slice_iter(&[&oid]))
-        .await?
-        .and_then(|row| async move { row.try_get(0) })
-        .try_collect()
-        .await
+    let mut out = vec![];
+
+    let mut rows = pin!(query::query(client, stmt, slice_iter(&[&oid])).await?);
+    while let Some(row) = rows.next().await {
+        out.push(row?.try_get(stmt.columns(), 0)?)
+    }
+    Ok(out)
 }

-async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
-    if let Some(stmt) = client.typeinfo_enum() {
-        return Ok(stmt);
+async fn typeinfo_enum_statement<'c>(
+    client: &mut InnerClient,
+    cache: &'c mut CachedTypeInfo,
+) -> Result<&'c Statement, Error> {
+    if cache.typeinfo_enum().is_some() {
+        // needed to get around a borrow checker limitation
+        return Ok(cache.typeinfo_enum().unwrap());
    }

-    let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
+    let stmt = match prepare_rec(client, cache, TYPEINFO_ENUM_QUERY, &[]).await {
        Ok(stmt) => stmt,
        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
-            prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
+            prepare_rec(client, cache, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
        }
        Err(e) => return Err(e),
    };

-    client.set_typeinfo_enum(&stmt);
-    Ok(stmt)
+    Ok(cache.set_typeinfo_enum(stmt))
 }

-async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
-    let stmt = typeinfo_composite_statement(client).await?;
+async fn get_composite_fields(
+    client: &mut InnerClient,
+    cache: &mut CachedTypeInfo,
+    oid: Oid,
+) -> Result<Vec<Field>, Error> {
+    let stmt = typeinfo_composite_statement(client, cache).await?;

-    let rows = query::query(client, stmt, slice_iter(&[&oid]))
-        .await?
-        .try_collect::<Vec<_>>()
-        .await?;
+    let mut rows = pin!(query::query(client, stmt, slice_iter(&[&oid])).await?);
+
+    let mut oids = vec![];
+    while let Some(row) = rows.next().await {
+        let row = row?;
+        let name = row.try_get(stmt.columns(), 0)?;
+        let oid = row.try_get(stmt.columns(), 1)?;
+        oids.push((name, oid));
+    }

    let mut fields = vec![];
-    for row in rows {
-        let name = row.try_get(0)?;
-        let oid = row.try_get(1)?;
-        let type_ = get_type_rec(client, oid).await?;
+    for (name, oid) in oids {
+        let type_ = get_type_rec(client, cache, oid).await?;
        fields.push(Field::new(name, type_));
    }

    Ok(fields)
 }

-async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
-    if let Some(stmt) = client.typeinfo_composite() {
-        return Ok(stmt);
+async fn typeinfo_composite_statement<'c>(
+    client: &mut InnerClient,
+    cache: &'c mut CachedTypeInfo,
+) -> Result<&'c Statement, Error> {
+    if cache.typeinfo_composite().is_some() {
+        // needed to get around a borrow checker limitation
+        return Ok(cache.typeinfo_composite().unwrap());
    }

-    let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
+    let stmt = prepare_rec(client, cache, TYPEINFO_COMPOSITE_QUERY, &[]).await?;

-    client.set_typeinfo_composite(&stmt);
-    Ok(stmt)
+    Ok(cache.set_typeinfo_composite(stmt))
 }
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -14,7 +14,6 @@ use postgres_types2::{Format, ToSql, Type};
 use std::fmt;
 use std::marker::PhantomPinned;
 use std::pin::Pin;
-use std::sync::Arc;
 use std::task::{Context, Poll};

 struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]);
@@ -26,10 +25,10 @@ impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
 }

 pub async fn query<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
+    client: &mut InnerClient,
+    statement: &Statement,
    params: I,
-) -> Result<RowStream, Error>
+) -> Result<RawRowStream, Error>
 where
    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
    I::IntoIter: ExactSizeIterator,
@@ -41,13 +40,12 @@ where
            statement.name(),
            BorrowToSqlParamsDebug(params.as_slice()),
        );
-        encode(client, &statement, params)?
+        encode(client, statement, params)?
    } else {
-        encode(client, &statement, params)?
+        encode(client, statement, params)?
    };
    let responses = start(client, buf).await?;
-    Ok(RowStream {
-        statement,
+    Ok(RawRowStream {
        responses,
        command_tag: None,
        status: ReadyForQueryStatus::Unknown,
@@ -57,7 +55,7 @@ where
 }

 pub async fn query_txt<S, I>(
-    client: &Arc<InnerClient>,
+    client: &mut InnerClient,
    query: &str,
    params: I,
 ) -> Result<RowStream, Error>
@@ -157,49 +155,6 @@ where
    })
 }

-pub async fn execute<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
-    params: I,
-) -> Result<u64, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let buf = if log_enabled!(Level::Debug) {
-        let params = params.into_iter().collect::<Vec<_>>();
-        debug!(
-            "executing statement {} with parameters: {:?}",
-            statement.name(),
-            BorrowToSqlParamsDebug(params.as_slice()),
-        );
-        encode(client, &statement, params)?
-    } else {
-        encode(client, &statement, params)?
-    };
-    let mut responses = start(client, buf).await?;
-
-    let mut rows = 0;
-    loop {
-        match responses.next().await? {
-            Message::DataRow(_) => {}
-            Message::CommandComplete(body) => {
-                rows = body
-                    .tag()
-                    .map_err(Error::parse)?
-                    .rsplit(' ')
-                    .next()
-                    .unwrap()
-                    .parse()
-                    .unwrap_or(0);
-            }
-            Message::EmptyQueryResponse => rows = 0,
-            Message::ReadyForQuery(_) => return Ok(rows),
-            _ => return Err(Error::unexpected_message()),
-        }
-    }
-}
-
 async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

@@ -211,7 +166,11 @@ async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
    Ok(responses)
 }

-pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
+pub fn encode<'a, I>(
+    client: &mut InnerClient,
+    statement: &Statement,
+    params: I,
+) -> Result<Bytes, Error>
 where
    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
    I::IntoIter: ExactSizeIterator,
@@ -296,11 +255,7 @@ impl Stream for RowStream {
        loop {
            match ready!(this.responses.poll_next(cx)?) {
                Message::DataRow(body) => {
-                    return Poll::Ready(Some(Ok(Row::new(
-                        this.statement.clone(),
-                        body,
-                        *this.output_format,
-                    )?)))
+                    return Poll::Ready(Some(Ok(Row::new(body, *this.output_format)?)))
                }
                Message::EmptyQueryResponse | Message::PortalSuspended => {}
                Message::CommandComplete(body) => {
@@ -338,3 +293,41 @@ impl RowStream {
        self.status
    }
 }
+
+pin_project! {
+    /// A stream of table rows.
+    pub struct RawRowStream {
+        responses: Responses,
+        command_tag: Option<String>,
+        output_format: Format,
+        status: ReadyForQueryStatus,
+        #[pin]
+        _p: PhantomPinned,
+    }
+}
+
+impl Stream for RawRowStream {
+    type Item = Result<Row, Error>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+        loop {
+            match ready!(this.responses.poll_next(cx)?) {
+                Message::DataRow(body) => {
+                    return Poll::Ready(Some(Ok(Row::new(body, *this.output_format)?)))
+                }
+                Message::EmptyQueryResponse | Message::PortalSuspended => {}
+                Message::CommandComplete(body) => {
+                    if let Ok(tag) = body.tag() {
+                        *this.command_tag = Some(tag.to_string());
+                    }
+                }
+                Message::ReadyForQuery(status) => {
+                    *this.status = status.into();
+                    return Poll::Ready(None);
+                }
+                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
+            }
+        }
+    }
+}
--- a/libs/proxy/tokio-postgres2/src/row.rs
+++ b/libs/proxy/tokio-postgres2/src/row.rs
@@ -1,103 +1,16 @@
 //! Rows.
-
-use crate::row::sealed::{AsName, Sealed};
-use crate::simple_query::SimpleColumn;
 use crate::statement::Column;
 use crate::types::{FromSql, Type, WrongType};
-use crate::{Error, Statement};
+use crate::Error;
 use fallible_iterator::FallibleIterator;
 use postgres_protocol2::message::backend::DataRowBody;
 use postgres_types2::{Format, WrongFormat};
 use std::fmt;
 use std::ops::Range;
 use std::str;
-use std::sync::Arc;
-
-mod sealed {
-    pub trait Sealed {}
-
-    pub trait AsName {
-        fn as_name(&self) -> &str;
-    }
-}
-
-impl AsName for Column {
-    fn as_name(&self) -> &str {
-        self.name()
-    }
-}
-
-impl AsName for String {
-    fn as_name(&self) -> &str {
-        self
-    }
-}
-
-/// A trait implemented by types that can index into columns of a row.
-///
-/// This cannot be implemented outside of this crate.
-pub trait RowIndex: Sealed {
-    #[doc(hidden)]
-    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
-    where
-        T: AsName;
-}
-
-impl Sealed for usize {}
-
-impl RowIndex for usize {
-    #[inline]
-    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
-    where
-        T: AsName,
-    {
-        if *self >= columns.len() {
-            None
-        } else {
-            Some(*self)
-        }
-    }
-}
-
-impl Sealed for str {}
-
-impl RowIndex for str {
-    #[inline]
-    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
-    where
-        T: AsName,
-    {
-        if let Some(idx) = columns.iter().position(|d| d.as_name() == self) {
-            return Some(idx);
-        };
-
-        // FIXME ASCII-only case insensitivity isn't really the right thing to
-        // do. Postgres itself uses a dubious wrapper around tolower and JDBC
-        // uses the US locale.
-        columns
-            .iter()
-            .position(|d| d.as_name().eq_ignore_ascii_case(self))
-    }
-}
-
-impl<T> Sealed for &T where T: ?Sized + Sealed {}
-
-impl<T> RowIndex for &T
-where
-    T: ?Sized + RowIndex,
-{
-    #[inline]
-    fn __idx<U>(&self, columns: &[U]) -> Option<usize>
-    where
-        U: AsName,
-    {
-        T::__idx(*self, columns)
-    }
-}

 /// A row of data returned from the database by a query.
 pub struct Row {
-    statement: Statement,
    output_format: Format,
    body: DataRowBody,
    ranges: Vec<Option<Range<usize>>>,
@@ -105,80 +18,33 @@ pub struct Row {

 impl fmt::Debug for Row {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("Row")
-            .field("columns", &self.columns())
-            .finish()
+        f.debug_struct("Row").finish()
    }
 }

 impl Row {
    pub(crate) fn new(
-        statement: Statement,
+        // statement: Statement,
        body: DataRowBody,
        output_format: Format,
    ) -> Result<Row, Error> {
        let ranges = body.ranges().collect().map_err(Error::parse)?;
        Ok(Row {
-            statement,
            body,
            ranges,
            output_format,
        })
    }

-    /// Returns information about the columns of data in the row.
-    pub fn columns(&self) -> &[Column] {
-        self.statement.columns()
-    }
-
-    /// Determines if the row contains no values.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns the number of values in the row.
-    pub fn len(&self) -> usize {
-        self.columns().len()
-    }
-
-    /// Deserializes a value from the row.
-    ///
-    /// The value can be specified either by its numeric index in the row, or by its column name.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
-    pub fn get<'a, I, T>(&'a self, idx: I) -> T
+    pub(crate) fn try_get<'a, T>(&'a self, columns: &[Column], idx: usize) -> Result<T, Error>
    where
-        I: RowIndex + fmt::Display,
        T: FromSql<'a>,
    {
-        match self.get_inner(&idx) {
-            Ok(ok) => ok,
-            Err(err) => panic!("error retrieving column {}: {}", idx, err),
-        }
-    }
-
-    /// Like `Row::get`, but returns a `Result` rather than panicking.
-    pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result<T, Error>
-    where
-        I: RowIndex + fmt::Display,
-        T: FromSql<'a>,
-    {
-        self.get_inner(&idx)
-    }
-
-    fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result<T, Error>
-    where
-        I: RowIndex + fmt::Display,
-        T: FromSql<'a>,
-    {
-        let idx = match idx.__idx(self.columns()) {
-            Some(idx) => idx,
-            None => return Err(Error::column(idx.to_string())),
+        let Some(column) = columns.get(idx) else {
+            return Err(Error::column(idx.to_string()));
        };

-        let ty = self.columns()[idx].type_();
+        let ty = column.type_();
        if !T::accepts(ty) {
            return Err(Error::from_sql(
                Box::new(WrongType::new::<T>(ty.clone())),
@@ -216,85 +82,3 @@ impl Row {
        self.body.buffer().len()
    }
 }
-
-impl AsName for SimpleColumn {
-    fn as_name(&self) -> &str {
-        self.name()
-    }
-}
-
-/// A row of data returned from the database by a simple query.
-#[derive(Debug)]
-pub struct SimpleQueryRow {
-    columns: Arc<[SimpleColumn]>,
-    body: DataRowBody,
-    ranges: Vec<Option<Range<usize>>>,
-}
-
-impl SimpleQueryRow {
-    #[allow(clippy::new_ret_no_self)]
-    pub(crate) fn new(
-        columns: Arc<[SimpleColumn]>,
-        body: DataRowBody,
-    ) -> Result<SimpleQueryRow, Error> {
-        let ranges = body.ranges().collect().map_err(Error::parse)?;
-        Ok(SimpleQueryRow {
-            columns,
-            body,
-            ranges,
-        })
-    }
-
-    /// Returns information about the columns of data in the row.
-    pub fn columns(&self) -> &[SimpleColumn] {
-        &self.columns
-    }
-
-    /// Determines if the row contains no values.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns the number of values in the row.
-    pub fn len(&self) -> usize {
-        self.columns.len()
-    }
-
-    /// Returns a value from the row.
-    ///
-    /// The value can be specified either by its numeric index in the row, or by its column name.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
-    pub fn get<I>(&self, idx: I) -> Option<&str>
-    where
-        I: RowIndex + fmt::Display,
-    {
-        match self.get_inner(&idx) {
-            Ok(ok) => ok,
-            Err(err) => panic!("error retrieving column {}: {}", idx, err),
-        }
-    }
-
-    /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking.
-    pub fn try_get<I>(&self, idx: I) -> Result<Option<&str>, Error>
-    where
-        I: RowIndex + fmt::Display,
-    {
-        self.get_inner(&idx)
-    }
-
-    fn get_inner<I>(&self, idx: &I) -> Result<Option<&str>, Error>
-    where
-        I: RowIndex + fmt::Display,
-    {
-        let idx = match idx.__idx(&self.columns) {
-            Some(idx) => idx,
-            None => return Err(Error::column(idx.to_string())),
-        };
-
-        let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]);
-        FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx))
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -1,52 +1,14 @@
-use crate::client::{InnerClient, Responses};
+use crate::client::InnerClient;
 use crate::codec::FrontendMessage;
 use crate::connection::RequestMessages;
-use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};
+use crate::{Error, ReadyForQueryStatus};
 use bytes::Bytes;
-use fallible_iterator::FallibleIterator;
-use futures_util::{ready, Stream};
 use log::debug;
-use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
-use std::marker::PhantomPinned;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-/// Information about a column of a single query row.
-#[derive(Debug)]
-pub struct SimpleColumn {
-    name: String,
-}
-
-impl SimpleColumn {
-    pub(crate) fn new(name: String) -> SimpleColumn {
-        SimpleColumn { name }
-    }
-
-    /// Returns the name of the column.
-    pub fn name(&self) -> &str {
-        &self.name
-    }
-}
-
-pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
-    debug!("executing simple query: {}", query);
-
-    let buf = encode(client, query)?;
-    let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
-
-    Ok(SimpleQueryStream {
-        responses,
-        columns: None,
-        status: ReadyForQueryStatus::Unknown,
-        _p: PhantomPinned,
-    })
-}

 pub async fn batch_execute(
-    client: &InnerClient,
+    client: &mut InnerClient,
    query: &str,
 ) -> Result<ReadyForQueryStatus, Error> {
    debug!("executing statement batch: {}", query);
@@ -66,77 +28,9 @@ pub async fn batch_execute(
    }
 }

-pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
+pub(crate) fn encode(client: &mut InnerClient, query: &str) -> Result<Bytes, Error> {
    client.with_buf(|buf| {
        frontend::query(query, buf).map_err(Error::encode)?;
        Ok(buf.split().freeze())
    })
 }
-
-pin_project! {
-    /// A stream of simple query results.
-    pub struct SimpleQueryStream {
-        responses: Responses,
-        columns: Option<Arc<[SimpleColumn]>>,
-        status: ReadyForQueryStatus,
-        #[pin]
-        _p: PhantomPinned,
-    }
-}
-
-impl SimpleQueryStream {
-    /// Returns if the connection is ready for querying, with the status of the connection.
-    ///
-    /// This might be available only after the stream has been exhausted.
-    pub fn ready_status(&self) -> ReadyForQueryStatus {
-        self.status
-    }
-}
-
-impl Stream for SimpleQueryStream {
-    type Item = Result<SimpleQueryMessage, Error>;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        let this = self.project();
-        loop {
-            match ready!(this.responses.poll_next(cx)?) {
-                Message::CommandComplete(body) => {
-                    let rows = body
-                        .tag()
-                        .map_err(Error::parse)?
-                        .rsplit(' ')
-                        .next()
-                        .unwrap()
-                        .parse()
-                        .unwrap_or(0);
-                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows))));
-                }
-                Message::EmptyQueryResponse => {
-                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0))));
-                }
-                Message::RowDescription(body) => {
-                    let columns = body
-                        .fields()
-                        .map(|f| Ok(SimpleColumn::new(f.name().to_string())))
-                        .collect::<Vec<_>>()
-                        .map_err(Error::parse)?
-                        .into();
-
-                    *this.columns = Some(columns);
-                }
-                Message::DataRow(body) => {
-                    let row = match &this.columns {
-                        Some(columns) => SimpleQueryRow::new(columns.clone(), body)?,
-                        None => return Poll::Ready(Some(Err(Error::unexpected_message()))),
-                    };
-                    return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row))));
-                }
-                Message::ReadyForQuery(s) => {
-                    *this.status = s.into();
-                    return Poll::Ready(None);
-                }
-                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
-            }
-        }
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -1,64 +1,33 @@
-use crate::client::InnerClient;
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
 use crate::types::Type;
-use postgres_protocol2::{
-    message::{backend::Field, frontend},
-    Oid,
-};
-use std::{
-    fmt,
-    sync::{Arc, Weak},
-};
+use postgres_protocol2::{message::backend::Field, Oid};
+use std::fmt;

 struct StatementInner {
-    client: Weak<InnerClient>,
    name: String,
    params: Vec<Type>,
    columns: Vec<Column>,
 }

-impl Drop for StatementInner {
-    fn drop(&mut self) {
-        if let Some(client) = self.client.upgrade() {
-            let buf = client.with_buf(|buf| {
-                frontend::close(b'S', &self.name, buf).unwrap();
-                frontend::sync(buf);
-                buf.split().freeze()
-            });
-            let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
-        }
-    }
-}
-
 /// A prepared statement.
 ///
 /// Prepared statements can only be used with the connection that created them.
-#[derive(Clone)]
-pub struct Statement(Arc<StatementInner>);
+pub struct Statement(StatementInner);

 impl Statement {
-    pub(crate) fn new(
-        inner: &Arc<InnerClient>,
-        name: String,
-        params: Vec<Type>,
-        columns: Vec<Column>,
-    ) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Arc::downgrade(inner),
+    pub(crate) fn new(name: String, params: Vec<Type>, columns: Vec<Column>) -> Statement {
+        Statement(StatementInner {
            name,
            params,
            columns,
-        }))
+        })
    }

    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Weak::new(),
+        Statement(StatementInner {
            name: String::new(),
            params,
            columns,
-        }))
+        })
    }

    pub(crate) fn name(&self) -> &str {
--- a/libs/proxy/tokio-postgres2/src/to_statement.rs
+++ b/libs/proxy/tokio-postgres2/src/to_statement.rs
@@ -1,57 +0,0 @@
-use crate::to_statement::private::{Sealed, ToStatementType};
-use crate::Statement;
-
-mod private {
-    use crate::{Client, Error, Statement};
-
-    pub trait Sealed {}
-
-    pub enum ToStatementType<'a> {
-        Statement(&'a Statement),
-        Query(&'a str),
-    }
-
-    impl<'a> ToStatementType<'a> {
-        pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
-            match self {
-                ToStatementType::Statement(s) => Ok(s.clone()),
-                ToStatementType::Query(s) => client.prepare(s).await,
-            }
-        }
-    }
-}
-
-/// A trait abstracting over prepared and unprepared statements.
-///
-/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
-/// was prepared previously.
-///
-/// This trait is "sealed" and cannot be implemented by anything outside this crate.
-pub trait ToStatement: Sealed {
-    #[doc(hidden)]
-    fn __convert(&self) -> ToStatementType<'_>;
-}
-
-impl ToStatement for Statement {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Statement(self)
-    }
-}
-
-impl Sealed for Statement {}
-
-impl ToStatement for str {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for str {}
-
-impl ToStatement for String {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for String {}
--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -1,6 +1,5 @@
 use crate::codec::FrontendMessage;
 use crate::connection::RequestMessages;
-use crate::query::RowStream;
 use crate::{CancelToken, Client, Error, ReadyForQueryStatus};
 use postgres_protocol2::message::frontend;

@@ -19,13 +18,13 @@ impl Drop for Transaction<'_> {
            return;
        }

-        let buf = self.client.inner().with_buf(|buf| {
+        let buf = self.client.inner.with_buf(|buf| {
            frontend::query("ROLLBACK", buf).unwrap();
            buf.split().freeze()
        });
        let _ = self
            .client
-            .inner()
+            .inner
            .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
    }
 }
@@ -52,23 +51,13 @@ impl<'a> Transaction<'a> {
        self.client.batch_execute("ROLLBACK").await
    }

-    /// Like `Client::query_raw_txt`.
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
-    where
-        S: AsRef<str>,
-        I: IntoIterator<Item = Option<S>>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        self.client.query_raw_txt(statement, params).await
-    }
-
    /// Like `Client::cancel_token`.
    pub fn cancel_token(&self) -> CancelToken {
        self.client.cancel_token()
    }

    /// Returns a reference to the underlying `Client`.
-    pub fn client(&self) -> &Client {
+    pub fn client(&mut self) -> &mut Client {
        self.client
    }
 }
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -164,6 +164,12 @@ impl TenantShardId {
    }
 }

+impl std::fmt::Display for ShardNumber {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl std::fmt::Display for ShardSlug<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactOptions;
-use crate::tenant::timeline::CompactRange;
+use crate::tenant::timeline::CompactRequest;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
@@ -1978,6 +1978,26 @@ async fn timeline_gc_handler(
    json_response(StatusCode::OK, gc_result)
 }

+// Cancel scheduled compaction tasks
+async fn timeline_cancel_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+        tenant.cancel_scheduled_compaction(timeline_id);
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await
+}
+
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    mut request: Request<Body>,
@@ -1987,7 +2007,7 @@ async fn timeline_compact_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
+    let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;

    let state = get_state(&request);

@@ -2012,22 +2032,42 @@ async fn timeline_compact_handler(
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

+    let wait_until_scheduled_compaction_done =
+        parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
+            .unwrap_or(false);
+
    let options = CompactOptions {
-        compact_range,
+        compact_range: compact_request
+            .as_ref()
+            .and_then(|r| r.compact_range.clone()),
+        compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
        flags,
    };

+    let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false);
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .compact_with_options(&cancel, options, &ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+        if scheduled {
+            let tenant = state
+                .tenant_manager
+                .get_attached_tenant_shard(tenant_shard_id)?;
+            let rx = tenant.schedule_compaction(timeline_id, options).await;
+            if wait_until_scheduled_compaction_done {
+                // It is possible that this will take a long time; dropping the HTTP request will not cancel the compaction.
+                rx.await.ok();
+            }
+        } else {
+            timeline
+                .compact_with_options(&cancel, options, &ctx)
+                .await
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            if wait_until_uploaded {
+                timeline.remote_client.wait_completion().await
+                // XXX map to correct ApiError for the cases where it's due to shutdown
+                .context("wait completion").map_err(ApiError::InternalServerError)?;
+            }
        }
        json_response(StatusCode::OK, ())
    }
@@ -2108,16 +2148,20 @@ async fn timeline_checkpoint_handler(
    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);

+    let wait_until_flushed: bool =
+        parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
+
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .freeze_and_flush()
-            .await
-            .map_err(|e| {
+        if wait_until_flushed {
+            timeline.freeze_and_flush().await
+        } else {
+            timeline.freeze().await.and(Ok(()))
+        }.map_err(|e| {
                match e {
                    tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
                    other => ApiError::InternalServerError(other.into()),
@@ -3301,6 +3345,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
            |r| api_handler(r, timeline_compact_handler),
        )
+        .delete(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
+            |r| api_handler(r, timeline_cancel_compact_handler),
+        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
            |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_disk_consistent_lsn",
+        "Disk consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_projected_remote_consistent_lsn",
+        "Projected remote consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_pitr_history_size",
@@ -2394,7 +2412,8 @@ pub(crate) struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
-    pub last_record_gauge: IntGauge,
+    pub last_record_lsn_gauge: IntGauge,
+    pub disk_consistent_lsn_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
    pub(crate) layer_size_image: UIntGauge,
@@ -2475,7 +2494,11 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let last_record_gauge = LAST_RECORD_LSN
+        let last_record_lsn_gauge = LAST_RECORD_LSN
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
+        let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

@@ -2578,7 +2601,8 @@ impl TimelineMetrics {
            garbage_collect_histo,
            find_gc_cutoffs_histo,
            load_layer_map_histo,
-            last_record_gauge,
+            last_record_lsn_gauge,
+            disk_consistent_lsn_gauge,
            pitr_history_size,
            archival_size,
            layer_size_image,
@@ -2642,6 +2666,7 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
@@ -2805,6 +2830,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
    calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
 }

 impl RemoteTimelineClientMetrics {
@@ -2819,6 +2845,10 @@ impl RemoteTimelineClientMetrics {
                .unwrap(),
        );

+        let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
+            .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
+            .unwrap();
+
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id_str,
            shard_id: shard_id_str,
@@ -2827,6 +2857,7 @@ impl RemoteTimelineClientMetrics {
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge,
+            projected_remote_consistent_lsn_gauge,
        }
    }

@@ -3040,6 +3071,7 @@ impl Drop for RemoteTimelineClientMetrics {
            calls,
            bytes_started_counter,
            bytes_finished_counter,
+            projected_remote_consistent_lsn_gauge,
        } = self;
        for ((a, b), _) in calls.get_mut().unwrap().drain() {
            let mut res = [Ok(()), Ok(())];
@@ -3069,6 +3101,14 @@ impl Drop for RemoteTimelineClientMetrics {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
+        {
+            let _ = projected_remote_consistent_lsn_gauge;
+            let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+        }
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -37,14 +37,18 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
+use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::CompactFlags;
+use timeline::CompactOptions;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -339,6 +343,11 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

+    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
+    /// a manual gc-compaction from the manual compaction API.
+    scheduled_compaction_tasks:
+        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+
    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
@@ -2953,27 +2962,68 @@ impl Tenant {

        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
        {
+            // pending_task_left == None: cannot compact; there may still be pending tasks
+            // pending_task_left == Some(true): compaction task left
+            // pending_task_left == Some(false): no compaction task left
            let pending_task_left = if *can_compact {
-                Some(
-                    timeline
-                        .compact(cancel, EnumSet::empty(), ctx)
-                        .instrument(info_span!("compact_timeline", %timeline_id))
-                        .await
-                        .inspect_err(|e| match e {
-                            timeline::CompactionError::ShuttingDown => (),
-                            timeline::CompactionError::Offload(_) => {
-                                // Failures to offload timelines do not trip the circuit breaker, because
-                                // they do not do lots of writes the way compaction itself does: it is cheap
-                                // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                let has_pending_l0_compaction_task = timeline
+                    .compact(cancel, EnumSet::empty(), ctx)
+                    .instrument(info_span!("compact_timeline", %timeline_id))
+                    .await
+                    .inspect_err(|e| match e {
+                        timeline::CompactionError::ShuttingDown => (),
+                        timeline::CompactionError::Offload(_) => {
+                            // Failures to offload timelines do not trip the circuit breaker, because
+                            // they do not do lots of writes the way compaction itself does: it is cheap
+                            // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                        }
+                        timeline::CompactionError::Other(e) => {
+                            self.compaction_circuit_breaker
+                                .lock()
+                                .unwrap()
+                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
+                        }
+                    })?;
+                if has_pending_l0_compaction_task {
+                    Some(true)
+                } else {
+                    let has_pending_scheduled_compaction_task;
+                    let next_scheduled_compaction_task = {
+                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
+                            let next_task = tline_pending_tasks.pop_front();
+                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
+                            next_task
+                        } else {
+                            has_pending_scheduled_compaction_task = false;
+                            None
+                        }
+                    };
+                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
+                    {
+                        if !next_scheduled_compaction_task
+                            .options
+                            .flags
+                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                        {
+                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
+                        } else {
+                            let _ = timeline
+                                .compact_with_options(
+                                    cancel,
+                                    next_scheduled_compaction_task.options,
+                                    ctx,
+                                )
+                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
+                                .await?;
+                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
+                                // TODO: we can send compaction statistics in the future
+                                tx.send(()).ok();
                            }
-                            timeline::CompactionError::Other(e) => {
-                                self.compaction_circuit_breaker
-                                    .lock()
-                                    .unwrap()
-                                    .fail(&CIRCUIT_BREAKERS_BROKEN, e);
-                            }
-                        })?,
-                )
+                        }
+                    }
+                    Some(has_pending_scheduled_compaction_task)
+                }
            } else {
                None
            };
@@ -2993,6 +3043,36 @@ impl Tenant {
        Ok(has_pending_task)
    }

+    /// Cancel scheduled compaction tasks
+    pub(crate) fn cancel_scheduled_compaction(
+        &self,
+        timeline_id: TimelineId,
+    ) -> Vec<ScheduledCompactionTask> {
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
+            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
+            current_tline_pending_tasks.into_iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    /// Schedule a compaction task for a timeline.
+    pub(crate) async fn schedule_compaction(
+        &self,
+        timeline_id: TimelineId,
+        options: CompactOptions,
+    ) -> tokio::sync::oneshot::Receiver<()> {
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+        let tline_pending_tasks = guard.entry(timeline_id).or_default();
+        tline_pending_tasks.push_back(ScheduledCompactionTask {
+            options,
+            result_tx: Some(tx),
+        });
+        rx
+    }
+
    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
    // this happens during ingest: this background housekeeping is for freezing layers
    // that are open but haven't been written to for some time.
@@ -4005,6 +4085,7 @@ impl Tenant {
                // use an extremely long backoff.
                Some(Duration::from_secs(3600 * 24)),
            )),
+            scheduled_compaction_tasks: Mutex::new(Default::default()),
            activate_now_sem: tokio::sync::Semaphore::new(0),
            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
            cancel: CancellationToken::default(),
@@ -9163,6 +9244,7 @@ mod tests {
                CompactOptions {
                    flags: dryrun_flags,
                    compact_range: None,
+                    compact_below_lsn: None,
                },
                &ctx,
            )
@@ -9399,6 +9481,7 @@ mod tests {
                CompactOptions {
                    flags: dryrun_flags,
                    compact_range: None,
+                    compact_below_lsn: None,
                },
                &ctx,
            )
@@ -9885,7 +9968,15 @@ mod tests {

        // Do a partial compaction on key range 0..2
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9924,7 +10015,15 @@ mod tests {

        // Do a partial compaction on key range 2..4
        tline
-            .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(2)..get_key(4)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9968,7 +10067,15 @@ mod tests {

        // Do a partial compaction on key range 4..9
        tline
-            .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(4)..get_key(9)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10011,7 +10118,15 @@ mod tests {

        // Do a partial compaction on key range 9..10
        tline
-            .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(9)..get_key(10)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10059,7 +10174,15 @@ mod tests {

        // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(10)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2192,6 +2192,9 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);

                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -53,7 +53,7 @@ use utils::{
    postgres_client::PostgresClientProtocol,
    sync::gate::{Gate, GateGuard},
 };
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
    Background,
 }

-#[derive(enumset::EnumSetType)]
+#[derive(Debug, enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
@@ -777,6 +777,16 @@ pub(crate) enum CompactFlags {
    DryRun,
 }

+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactRequest {
+    pub compact_range: Option<CompactRange>,
+    pub compact_below_lsn: Option<Lsn>,
+    /// Whether the compaction job should be scheduled.
+    #[serde(default)]
+    pub scheduled: bool,
+}
+
 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
 pub(crate) struct CompactRange {
@@ -786,10 +796,24 @@ pub(crate) struct CompactRange {
    pub end: Key,
 }

-#[derive(Clone, Default)]
+impl From<Range<Key>> for CompactRange {
+    fn from(range: Range<Key>) -> Self {
+        CompactRange {
+            start: range.start,
+            end: range.end,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
+    /// If set, the compaction will only compact the key range specified by this option.
+    /// This option is only used by GC compaction.
    pub compact_range: Option<CompactRange>,
+    /// If set, the compaction will only compact layers below or intersecting with this LSN.
+    /// This option is only used by GC compaction.
+    pub compact_below_lsn: Option<Lsn>,
 }

 impl std::fmt::Debug for Timeline {
@@ -1433,23 +1457,31 @@ impl Timeline {
        Ok(lease)
    }

-    /// Flush to disk all data that was written with the put_* functions
+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
+    pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
+        self.freeze0().await
+    }
+
+    /// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
    pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
        self.freeze_and_flush0().await
    }

+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
+        let mut g = self.write_lock.lock().await;
+        let to_lsn = self.get_last_record_lsn();
+        self.freeze_inmem_layer_at(to_lsn, &mut g).await
+    }
+
    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
    // polluting the span hierarchy.
    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let token = {
-            // Freeze the current open in-memory layer. It will be written to disk on next
-            // iteration.
-            let mut g = self.write_lock.lock().await;
-
-            let to_lsn = self.get_last_record_lsn();
-            self.freeze_inmem_layer_at(to_lsn, &mut g).await?
-        };
+        let token = self.freeze0().await?;
        self.wait_flush_completion(token).await
    }

@@ -1604,6 +1636,7 @@ impl Timeline {
            CompactOptions {
                flags,
                compact_range: None,
+                compact_below_lsn: None,
            },
            ctx,
        )
@@ -2359,7 +2392,7 @@ impl Timeline {

            result
                .metrics
-                .last_record_gauge
+                .last_record_lsn_gauge
                .set(disk_consistent_lsn.0 as i64);
            result
        })
@@ -3481,7 +3514,7 @@ impl Timeline {
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());

-        self.metrics.last_record_gauge.set(new_lsn.0 as i64);
+        self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
        self.last_record_lsn.advance(new_lsn);
    }

@@ -3849,6 +3882,10 @@ impl Timeline {
    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
+
+        self.metrics
+            .disk_consistent_lsn_gauge
+            .set(new_value.0 as i64);
        new_value != old_value
    }

@@ -5887,6 +5924,23 @@ impl<'a> TimelineWriter<'a> {
            return Ok(());
        }

+        // In debug builds, assert that we don't write any keys that don't belong to this shard.
+        // We don't assert this in release builds, since key ownership policies may change over
+        // time. Stray keys will be removed during compaction.
+        if cfg!(debug_assertions) {
+            for metadata in &batch.metadata {
+                if let ValueMeta::Serialized(metadata) = metadata {
+                    let key = Key::from_compact(metadata.key);
+                    assert!(
+                        self.shard_identity.is_key_local(&key)
+                            || self.shard_identity.is_key_global(&key),
+                        "key {key} does not belong on shard {}",
+                        self.shard_identity.shard_index()
+                    );
+                }
+            }
+        }
+
        let batch_max_lsn = batch.max_lsn;
        let buf_size: u64 = batch.buffer_size() as u64;

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -16,7 +16,6 @@ use super::{

 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
-use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
@@ -64,6 +63,12 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

+/// A scheduled compaction task.
+pub struct ScheduledCompactionTask {
+    pub options: CompactOptions,
+    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+}
+
 pub struct GcCompactionJobDescription {
    /// All layers to read in the compaction job
    selected_layers: Vec<Layer>,
@@ -1174,11 +1179,12 @@ impl Timeline {
                    .await
                    .map_err(CompactionError::Other)?;
            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
+                let shard = self.shard_identity.shard_index();
+                let owner = self.shard_identity.get_shard_number(&key);
+                if cfg!(debug_assertions) {
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
+                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
            }

            if !new_layers.is_empty() {
@@ -1746,24 +1752,6 @@ impl Timeline {
        Ok(())
    }

-    pub(crate) async fn compact_with_gc(
-        self: &Arc<Self>,
-        cancel: &CancellationToken,
-        options: CompactOptions,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(
-            options
-                .compact_range
-                .map(|range| range.start..range.end)
-                .unwrap_or_else(|| Key::MIN..Key::MAX),
-            cancel,
-            options.flags,
-            ctx,
-        )
-        .await
-    }
-
    /// An experimental compaction building block that combines compaction with garbage collection.
    ///
    /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1771,17 +1759,19 @@ impl Timeline {
    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
    /// and create delta layers with all deltas >= gc horizon.
    ///
-    /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
+    /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
    /// Partial compaction will read and process all layers overlapping with the key range, even if it might
    /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
    /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
    /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
    /// part of the range.
-    pub(crate) async fn partial_compact_with_gc(
+    ///
+    /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersecting with
+    /// the LSN. Otherwise, it will use the gc cutoff by default.
+    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
-        compaction_key_range: Range<Key>,
        cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        options: CompactOptions,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
@@ -1803,6 +1793,12 @@ impl Timeline {
        )
        .await?;

+        let flags = options.flags;
+        let compaction_key_range = options
+            .compact_range
+            .map(|range| range.start..range.end)
+            .unwrap_or_else(|| Key::MIN..Key::MAX);
+
        let dry_run = flags.contains(CompactFlags::DryRun);

        if compaction_key_range == (Key::MIN..Key::MAX) {
@@ -1826,7 +1822,18 @@ impl Timeline {
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
            let mut retain_lsns_below_horizon = Vec::new();
-            let gc_cutoff = gc_info.cutoffs.select_min();
+            let gc_cutoff = {
+                let real_gc_cutoff = gc_info.cutoffs.select_min();
+                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
+                // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
+                // the real cutoff.
+                let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
+                if gc_cutoff > real_gc_cutoff {
+                    warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
+                    gc_cutoff = real_gc_cutoff;
+                }
+                gc_cutoff
+            };
            for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
                if lsn < &gc_cutoff {
                    retain_lsns_below_horizon.push(*lsn);
@@ -1846,7 +1853,7 @@ impl Timeline {
                .map(|desc| desc.get_lsn_range().end)
                .max()
            else {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
                return Ok(());
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -1869,7 +1876,7 @@ impl Timeline {
                }
            }
            if selected_layers.is_empty() {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
@@ -2048,6 +2055,11 @@ impl Timeline {
                // This is not handled in the filter iterator because shard is determined by hash.
                // Therefore, it does not give us any performance benefit to do things like skip
                // a whole layer file as handling key spaces (ranges).
+                if cfg!(debug_assertions) {
+                    let shard = self.shard_identity.shard_index();
+                    let owner = self.shard_identity.get_shard_number(&key);
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
                continue;
            }
            if !job_desc.compaction_key_range.contains(&key) {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -582,18 +582,21 @@ impl WalIngest {
                forknum: FSM_FORKNUM,
            };

+            // Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
+            // and instead of digging in the FSM bitmap format we just clear the whole page.
            let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
            let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
-            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
-                // Tail of last remaining FSM page has to be zeroed.
-                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
+            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
+                && self
+                    .shard
+                    .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
+            {
                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                fsm_physical_page_no += 1;
            }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the FSM relation size, if it even has one.
            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > fsm_physical_page_no {
-                // check if something to do: FSM is larger than truncate position
                self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
                    .await?;
            }
@@ -617,7 +620,7 @@ impl WalIngest {
            // tail bits in the last remaining map page, representing truncated heap
            // blocks, need to be cleared. This is not only tidy, but also necessary
            // because we don't get a chance to clear the bits if the heap is extended
-            // again.
+            // again. Only do this on the shard that owns the page.
            if (trunc_byte != 0 || trunc_offs != 0)
                && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
            {
@@ -631,10 +634,9 @@ impl WalIngest {
                )?;
                vm_page_no += 1;
            }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the VM relation size, if it even has one.
            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > vm_page_no {
-                // check if something to do: VM is larger than truncate position
                self.put_rel_truncation(modification, rel, vm_page_no, ctx)
                    .await?;
            }
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -340,7 +340,7 @@ impl PoolingBackend {
            debug!("setting up backend session state");

            // initiates the auth session
-            if let Err(e) = client.execute("select auth.init()", &[]).await {
+            if let Err(e) = client.batch_execute("select auth.init();").await {
                discard.discard();
                return Err(e.into());
            }
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -11,7 +11,7 @@ use smallvec::SmallVec;
 use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{debug, error, info, info_span, Instrument};
 #[cfg(test)]
 use {
    super::conn_pool_lib::GlobalConnPoolOptions,
@@ -125,13 +125,10 @@ pub(crate) fn poll_client<C: ClientInnerExt>(

                match message {
                    Some(Ok(AsyncMessage::Notice(notice))) => {
-                        info!(%session_id, "notice: {}", notice);
+                        debug!(%session_id, "notice: {}", notice);
                    }
                    Some(Ok(AsyncMessage::Notification(notif))) => {
-                        warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    }
-                    Some(Ok(_)) => {
-                        warn!(%session_id, "unknown message");
+                        debug!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
                    }
                    Some(Err(e)) => {
                        error!(%session_id, "connection error: {}", e);
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -1,5 +1,5 @@
 use postgres_client::types::{Kind, Type};
-use postgres_client::Row;
+use postgres_client::{Column, Row};
 use serde_json::{Map, Value};

 //
@@ -77,14 +77,14 @@ pub(crate) enum JsonConversionError {
 //
 pub(crate) fn pg_text_row_to_json(
    row: &Row,
-    columns: &[Type],
+    columns: &[Column],
+    c_types: &[Type],
    raw_output: bool,
    array_mode: bool,
 ) -> Result<Value, JsonConversionError> {
-    let iter = row
-        .columns()
+    let iter = columns
        .iter()
-        .zip(columns)
+        .zip(c_types)
        .enumerate()
        .map(|(i, (column, typ))| {
            let name = column.name();
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -23,14 +23,13 @@ use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
 use p256::ecdsa::{Signature, SigningKey};
 use parking_lot::RwLock;
 use postgres_client::tls::NoTlsStream;
-use postgres_client::types::ToSql;
 use postgres_client::AsyncMessage;
 use serde_json::value::RawValue;
 use signature::Signer;
 use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, info_span, warn, Instrument};
+use tracing::{debug, error, info, info_span, Instrument};

 use super::backend::HttpConnError;
 use super::conn_pool_lib::{
@@ -229,13 +228,10 @@ pub(crate) fn poll_client<C: ClientInnerExt>(

                match message {
                    Some(Ok(AsyncMessage::Notice(notice))) => {
-                        info!(%session_id, "notice: {}", notice);
+                        debug!(%session_id, "notice: {}", notice);
                    }
                    Some(Ok(AsyncMessage::Notification(notif))) => {
-                        warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    }
-                    Some(Ok(_)) => {
-                        warn!(%session_id, "unknown message");
+                        debug!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
                    }
                    Some(Err(e)) => {
                        error!(%session_id, "connection error: {}", e);
@@ -287,12 +283,11 @@ impl ClientInnerCommon<postgres_client::Client> {
            let token = resign_jwt(&local_data.key, payload, local_data.jti)?;

            // initiates the auth session
-            self.inner.batch_execute("discard all").await?;
+            // the token contains only `[a-zA-Z0-9_.-]+` so it cannot escape the string literal formatting.
            self.inner
-                .execute(
-                    "select auth.jwt_session_init($1)",
-                    &[&&*token as &(dyn ToSql + Sync)],
-                )
+                .batch_execute(&format!(
+                    "discard all; select auth.jwt_session_init('{token}');"
+                ))
                .await?;

            let pid = self.inner.get_process_id();
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -797,7 +797,13 @@ impl QueryData {
        let cancel_token = inner.cancel_token();

        let res = match select(
-            pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
+            pin!(query_to_json(
+                config,
+                &mut *inner,
+                self,
+                &mut 0,
+                parsed_headers
+            )),
            pin!(cancel.cancelled()),
        )
        .await
@@ -881,7 +887,7 @@ impl BatchQueryData {
            builder = builder.deferrable(true);
        }

-        let transaction = builder.start().await.inspect_err(|_| {
+        let mut transaction = builder.start().await.inspect_err(|_| {
            // if we cannot start a transaction, we should return immediately
            // and not return to the pool. connection is clearly broken
            discard.discard();
@@ -890,7 +896,7 @@ impl BatchQueryData {
        let json_output = match query_batch(
            config,
            cancel.child_token(),
-            &transaction,
+            &mut transaction,
            self,
            parsed_headers,
        )
@@ -934,7 +940,7 @@ impl BatchQueryData {
 async fn query_batch(
    config: &'static HttpConfig,
    cancel: CancellationToken,
-    transaction: &Transaction<'_>,
+    transaction: &mut Transaction<'_>,
    queries: BatchQueryData,
    parsed_headers: HttpHeaders,
 ) -> Result<String, SqlOverHttpError> {
@@ -972,7 +978,7 @@ async fn query_batch(

 async fn query_to_json<T: GenericClient>(
    config: &'static HttpConfig,
-    client: &T,
+    client: &mut T,
    data: QueryData,
    current_size: &mut usize,
    parsed_headers: HttpHeaders,
@@ -1027,7 +1033,7 @@ async fn query_to_json<T: GenericClient>(

    let columns_len = row_stream.columns().len();
    let mut fields = Vec::with_capacity(columns_len);
-    let mut columns = Vec::with_capacity(columns_len);
+    let mut c_types = Vec::with_capacity(columns_len);

    for c in row_stream.columns() {
        fields.push(json!({
@@ -1039,7 +1045,7 @@ async fn query_to_json<T: GenericClient>(
            "dataTypeModifier": c.type_modifier(),
            "format": "text",
        }));
-        columns.push(client.get_type(c.type_oid()).await?);
+        c_types.push(client.get_type(c.type_oid()).await?);
    }

    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
@@ -1047,7 +1053,15 @@ async fn query_to_json<T: GenericClient>(
    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
+        .map(|row| {
+            pg_text_row_to_json(
+                row,
+                row_stream.columns(),
+                &c_types,
+                parsed_headers.raw_output,
+                array_mode,
+            )
+        })
        .collect::<Result<Vec<_>, _>>()?;

    // Resulting JSON format is based on the format of node-postgres result.
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -789,7 +789,7 @@ impl Service {
            node_list_futs.push({
                async move {
                    tracing::info!("Scanning shards on node {node}...");
-                    let timeout = Duration::from_secs(1);
+                    let timeout = Duration::from_secs(5);
                    let response = node
                        .with_client_retries(
                            |client| async move { client.list_location_config().await },
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -268,7 +268,7 @@ impl BucketConfig {
                config.bucket_name, config.bucket_region
            ),
            RemoteStorageKind::AzureContainer(config) => format!(
-                "bucket {}, storage account {:?}, region {}",
+                "container {}, storage account {:?}, region {}",
                config.container_name, config.storage_account, config.container_region
            ),
        }
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -152,6 +152,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_resident_physical_size",
    "pageserver_io_operations_bytes_total",
    "pageserver_last_record_lsn",
+    "pageserver_disk_consistent_lsn",
+    "pageserver_projected_remote_consistent_lsn",
    "pageserver_standby_horizon",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -850,6 +850,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        force_repartition=False,
        force_image_layer_creation=False,
        force_l0_compaction=False,
+        wait_until_flushed=True,
        wait_until_uploaded=False,
        compact: bool | None = None,
        **kwargs,
@@ -862,6 +863,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
            query["force_image_layer_creation"] = "true"
        if force_l0_compaction:
            query["force_l0_compaction"] = "true"
+        if not wait_until_flushed:
+            query["wait_until_flushed"] = "false"
        if wait_until_uploaded:
            query["wait_until_uploaded"] = "true"

@@ -869,7 +872,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
            query["compact"] = "true" if compact else "false"

        log.info(
-            f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
+            f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}"
        )
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.utils import skip_in_debug_build, wait_until
 from fixtures.workload import Workload

-AGGRESIVE_COMPACTION_TENANT_CONF = {
+AGGRESSIVE_COMPACTION_TENANT_CONF = {
    # Disable gc and compaction. The test runs compaction manually.
    "gc_period": "0s",
    "compaction_period": "0s",
@@ -24,6 +24,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
    # Compact small layers
    "compaction_target_size": 1024**2,
    "image_creation_threshold": 2,
+    # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
 }


@@ -51,7 +52,7 @@ def test_pageserver_compaction_smoke(
 page_cache_size=10
 """

-    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
@@ -120,14 +121,25 @@ page_cache_size=10
    assert vectored_average < 8


+@skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": f"{1024 ** 2}",
+        "lsn_lease_length": "0s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    row_count = 1000
-    churn_rounds = 10
+    row_count = 10000
+    churn_rounds = 50

    ps_http = env.pageserver.http_client()

@@ -141,20 +153,27 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
        if i % 10 == 0:
            log.info(f"Running churn round {i}/{churn_rounds} ...")

-        workload.churn_rows(row_count, env.pageserver.id)
-        # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run.
-        ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
-        assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
        ps_http.timeline_compact(
            tenant_id,
            timeline_id,
            enhanced_gc_bottom_most_compaction=True,
            body={
-                "start": "000000000000000000000000000000000000",
-                "end": "030000000000000000000000000000000000",
+                "scheduled": True,
+                "compact_range": {
+                    "start": "000000000000000000000000000000000000",
+                    # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
+                    "end": "010000000000000000000000000000000000",
+                },
            },
        )

+        workload.churn_rows(row_count, env.pageserver.id)
+
+    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
+    env.pageserver.assert_log_contains(
+        "scheduled_compact_timeline.*picked .* layers for compaction"
+    )
+
    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)