wip

Fix test build
pageserver: initial gRPC page service implementation
2026-02-08 13:10:37 +00:00 · 2025-06-01 13:33:58 +02:00 · 2025-05-29 12:00:40 +02:00 · 2025-05-28 18:10:29 +02:00
98 changed files with 2639 additions and 3604 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4236,7 +4236,6 @@ name = "pagebench"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-trait",
 "camino",
 "clap",
 "futures",
@@ -4245,15 +4244,12 @@ dependencies = [
 "humantime-serde",
 "pageserver_api",
 "pageserver_client",
- "pageserver_page_api",
 "rand 0.8.5",
 "reqwest",
 "serde",
 "serde_json",
 "tokio",
- "tokio-stream",
 "tokio-util",
- "tonic 0.13.1",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4469,6 +4465,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "prost 0.13.5",
+ "smallvec",
 "thiserror 1.0.69",
 "tonic 0.13.1",
 "tonic-build",
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -310,13 +310,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt            --version ${RUSTFILT_VERSION} --locked && \
-    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} --locked && \
-    cargo install cargo-deny          --version ${CARGO_DENY_VERSION} --locked && \
-    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} --locked && \
-    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} --locked && \
-    cargo install cargo-chef          --version ${CARGO_CHEF_VERSION} --locked && \
-    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} --locked \
+    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
+    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
+    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
+    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
+    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
+    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1180,14 +1180,14 @@ RUN cd exts/rag && \
 RUN cd exts/rag_bge_small_en_v15 && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
-        REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/bge_small_en_v15.onnx \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control

 RUN cd exts/rag_jina_reranker_v1_tiny_en && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
-        REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/jina_reranker_v1_tiny_en.onnx \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -57,6 +57,21 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

+// Compatibility hack: if the control plane passes a value that does not start
+// with "http" (e.g. via a legacy remote-ext-config), use the default extension
+// storage proxy gateway URL. Remove once the control plane passes the gateway URL.
+fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
+    const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
+        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
+
+    Ok(if arg.starts_with("http") {
+        arg
+    } else {
+        FALLBACK_PG_EXT_GATEWAY_BASE_URL
+    }
+    .to_owned())
+}
+
 #[derive(Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
@@ -64,8 +79,9 @@ struct Cli {
    pub pgbin: String,

    /// The base URL for the remote extension storage proxy gateway.
-    #[arg(short = 'r', long)]
-    pub remote_ext_base_url: Option<Url>,
+    /// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
+    pub remote_ext_base_url: Option<String>,

    /// The port to bind the external listening HTTP server to. Clients running
    /// outside the compute will talk to the compute through this port. Keep
@@ -260,4 +276,18 @@ mod test {
    fn verify_cli() {
        Cli::command().debug_assert()
    }
+
+    #[test]
+    fn parse_pg_ext_gateway_base_url() {
+        let arg = "http://pg-ext-s3-gateway2";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(result, arg);
+
+        let arg = "pg-ext-s3-gateway";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(
+            result,
+            "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
+        );
+    }
 }
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -339,8 +339,6 @@ async fn run_dump_restore(
    destination_connstring: String,
 ) -> Result<(), anyhow::Error> {
    let dumpdir = workdir.join("dumpdir");
-    let num_jobs = num_cpus::get().to_string();
-    info!("using {num_jobs} jobs for dump/restore");

    let common_args = [
        // schema mapping (prob suffices to specify them on one side)
@@ -356,7 +354,7 @@ async fn run_dump_restore(
        "directory".to_string(),
        // concurrency
        "--jobs".to_string(),
-        num_jobs,
+        num_cpus::get().to_string(),
        // progress updates
        "--verbose".to_string(),
    ];
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -31,7 +31,6 @@ use std::time::{Duration, Instant};
 use std::{env, fs};
 use tokio::spawn;
 use tracing::{Instrument, debug, error, info, instrument, warn};
-use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
@@ -97,7 +96,7 @@ pub struct ComputeNodeParams {
    pub internal_http_port: u16,

    /// the address of extension storage proxy gateway
-    pub remote_ext_base_url: Option<Url>,
+    pub remote_ext_base_url: Option<String>,

    /// Interval for installed extensions collection
    pub installed_extensions_collection_interval: u64,
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -83,7 +83,6 @@ use reqwest::StatusCode;
 use tar::Archive;
 use tracing::info;
 use tracing::log::warn;
-use url::Url;
 use zstd::stream::read::Decoder;

 use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
@@ -159,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_ext_base_url: &Url,
+    remote_ext_base_url: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);

    // TODO add retry logic
    let download_buffer =
-        match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await {
+        match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
            Ok(buffer) => buffer,
            Err(error_message) => {
                return Err(anyhow::anyhow!(
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -107,7 +107,7 @@ impl<const N: usize> MetricType for HyperLogLogState<N> {
 }

 impl<const N: usize> HyperLogLogState<N> {
-    pub fn measure(&self, item: &(impl Hash + ?Sized)) {
+    pub fn measure(&self, item: &impl Hash) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -27,7 +27,6 @@ pub use prometheus::{

 pub mod launch_timestamp;
 mod wrappers;
-pub use prometheus;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
 pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -181,7 +181,6 @@ pub struct ConfigToml {
    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
-    pub max_get_vectored_keys: MaxGetVectoredKeys,
    pub image_compression: ImageCompressionAlgorithm,
    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
@@ -230,7 +229,7 @@ pub enum PageServicePipeliningConfig {
 }
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PageServicePipeliningConfigPipelined {
-    /// Failed config parsing and validation if larger than `max_get_vectored_keys`.
+    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
    // The default below is such that new versions of the software can start
@@ -404,16 +403,6 @@ impl Default for EvictionOrder {
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);

-#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(transparent)]
-pub struct MaxGetVectoredKeys(NonZeroUsize);
-
-impl MaxGetVectoredKeys {
-    pub fn get(&self) -> usize {
-        self.0.get()
-    }
-}
-
 /// Tenant-level configuration values, used for various purposes.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(default)]
@@ -598,8 +587,6 @@ pub mod defaults {
    /// That is, slightly above 128 kB.
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB

-    pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32;
-
    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

@@ -608,10 +595,7 @@ pub mod defaults {
    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
-        utils::postgres_client::PostgresClientProtocol::Interpreted {
-            format: utils::postgres_client::InterpretedFormat::Protobuf,
-            compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
-        };
+        utils::postgres_client::PostgresClientProtocol::Vanilla;

    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
@@ -701,9 +685,6 @@ impl Default for ConfigToml {
            max_vectored_read_bytes: (MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
-            max_get_vectored_keys: (MaxGetVectoredKeys(
-                NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(),
-            )),
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            timeline_offloading: true,
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
@@ -732,9 +713,9 @@ impl Default for ConfigToml {
            enable_tls_page_service_api: false,
            dev_mode: false,
            timeline_import_config: TimelineImportConfig {
-                import_job_concurrency: NonZeroUsize::new(32).unwrap(),
-                import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
-                import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
+                import_job_concurrency: NonZeroUsize::new(128).unwrap(),
+                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
+                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
            },
            basebackup_cache_config: None,
            posthog_config: None,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -354,9 +354,6 @@ pub struct ShardImportProgressV1 {
    pub completed: usize,
    /// Hash of the plan
    pub import_plan_hash: u64,
-    /// Soft limit for the job size
-    /// This needs to remain constant throughout the import
-    pub job_soft_size_limit: usize,
 }

 impl ShardImportStatus {
@@ -2045,7 +2042,7 @@ pub enum PagestreamProtocolVersion {

 pub type RequestId = u64;

-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub struct PagestreamRequest {
    pub reqid: RequestId,
    pub request_lsn: Lsn,
@@ -2064,7 +2061,7 @@ pub struct PagestreamNblocksRequest {
    pub rel: RelTag,
 }

-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub struct PagestreamGetPageRequest {
    pub hdr: PagestreamRequest,
    pub rel: RelTag,
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
 // Then we could replace the custom Ord and PartialOrd implementations below with
 // deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, Default, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -184,12 +184,12 @@ pub enum SlruKind {
    MultiXactOffsets,
 }

-impl fmt::Display for SlruKind {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl SlruKind {
+    pub fn to_str(&self) -> &'static str {
        match self {
-            Self::Clog => write!(f, "pg_xact"),
-            Self::MultiXactMembers => write!(f, "pg_multixact/members"),
-            Self::MultiXactOffsets => write!(f, "pg_multixact/offsets"),
+            Self::Clog => "pg_xact",
+            Self::MultiXactMembers => "pg_multixact/members",
+            Self::MultiXactOffsets => "pg_multixact/offsets",
        }
    }
 }
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -4,7 +4,6 @@ use std::{sync::Arc, time::Duration};

 use arc_swap::ArcSwap;
 use tokio_util::sync::CancellationToken;
-use tracing::{Instrument, info_span};

 use crate::{FeatureStore, PostHogClient, PostHogClientConfig};

@@ -27,35 +26,31 @@ impl FeatureResolverBackgroundLoop {
    pub fn spawn(self: Arc<Self>, handle: &tokio::runtime::Handle, refresh_period: Duration) {
        let this = self.clone();
        let cancel = self.cancel.clone();
-        handle.spawn(
-            async move {
-                tracing::info!("Starting PostHog feature resolver");
-                let mut ticker = tokio::time::interval(refresh_period);
-                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
-                loop {
-                    tokio::select! {
-                        _ = ticker.tick() => {}
-                        _ = cancel.cancelled() => break
-                    }
-                    let resp = match this
-                        .posthog_client
-                        .get_feature_flags_local_evaluation()
-                        .await
-                    {
-                        Ok(resp) => resp,
-                        Err(e) => {
-                            tracing::warn!("Cannot get feature flags: {}", e);
-                            continue;
-                        }
-                    };
-                    let feature_store = FeatureStore::new_with_flags(resp.flags);
-                    this.feature_store.store(Arc::new(feature_store));
-                    tracing::info!("Feature flag updated");
+        handle.spawn(async move {
+            tracing::info!("Starting PostHog feature resolver");
+            let mut ticker = tokio::time::interval(refresh_period);
+            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            loop {
+                tokio::select! {
+                    _ = ticker.tick() => {}
+                    _ = cancel.cancelled() => break
                }
-                tracing::info!("PostHog feature resolver stopped");
+                let resp = match this
+                    .posthog_client
+                    .get_feature_flags_local_evaluation()
+                    .await
+                {
+                    Ok(resp) => resp,
+                    Err(e) => {
+                        tracing::warn!("Cannot get feature flags: {}", e);
+                        continue;
+                    }
+                };
+                let feature_store = FeatureStore::new_with_flags(resp.flags);
+                this.feature_store.store(Arc::new(feature_store));
            }
-            .instrument(info_span!("posthog_feature_resolver")),
-        );
+            tracing::info!("PostHog feature resolver stopped");
+        });
    }

    pub fn feature_store(&self) -> Arc<FeatureStore> {
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -448,18 +448,6 @@ impl FeatureStore {
            )))
        }
    }
-
-    /// Infer whether a feature flag is a boolean flag by checking if it has a multivariate filter.
-    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
-        if let Some(flag_config) = self.flags.get(flag_key) {
-            Ok(flag_config.filters.multivariate.is_none())
-        } else {
-            Err(PostHogEvaluationError::NotAvailable(format!(
-                "Not found in the local evaluation spec: {}",
-                flag_key
-            )))
-        }
-    }
 }

 pub struct PostHogClientConfig {
@@ -540,15 +528,7 @@ impl PostHogClient {
            .bearer_auth(&self.config.server_api_key)
            .send()
            .await?;
-        let status = response.status();
        let body = response.text().await?;
-        if !status.is_success() {
-            return Err(anyhow::anyhow!(
-                "Failed to get feature flags: {}, {}",
-                status,
-                body
-            ));
-        }
        Ok(serde_json::from_str(&body)?)
    }

--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -28,7 +28,6 @@ use std::time::Duration;
 use tokio::sync::Notify;
 use tokio::time::Instant;

-#[derive(Clone, Copy)]
 pub struct LeakyBucketConfig {
    /// This is the "time cost" of a single request unit.
    /// Should loosely represent how long it takes to handle a request unit in active resource time.
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,7 +73,6 @@ pub mod error;
 /// async timeout helper
 pub mod timeout;

-pub mod span;
 pub mod sync;

 pub mod failpoint_support;
--- a/libs/utils/src/span.rs
+++ b/libs/utils/src/span.rs
@@ -1,19 +0,0 @@
-//! Tracing span helpers.
-
-/// Records the given fields in the current span, as a single call. The fields must already have
-/// been declared for the span (typically with empty values).
-#[macro_export]
-macro_rules! span_record {
-    ($($tokens:tt)*) => {$crate::span_record_in!(::tracing::Span::current(), $($tokens)*)};
-}
-
-/// Records the given fields in the given span, as a single call. The fields must already have been
-/// declared for the span (typically with empty values).
-#[macro_export]
-macro_rules! span_record_in {
-    ($span:expr, $($tokens:tt)*) => {
-        if let Some(meta) = $span.metadata() {
-            $span.record_all(&tracing::valueset!(meta.fields(), $($tokens)*));
-        }
-    };
-}
--- a/pageserver/benches/bench_metrics.rs
+++ b/pageserver/benches/bench_metrics.rs
@@ -264,56 +264,10 @@ mod propagation_of_cached_label_value {
    }
 }

-criterion_group!(histograms, histograms::bench_bucket_scalability);
-mod histograms {
-    use std::time::Instant;
-
-    use criterion::{BenchmarkId, Criterion};
-    use metrics::core::Collector;
-
-    pub fn bench_bucket_scalability(c: &mut Criterion) {
-        let mut g = c.benchmark_group("bucket_scalability");
-
-        for n in [1, 4, 8, 16, 32, 64, 128, 256] {
-            g.bench_with_input(BenchmarkId::new("nbuckets", n), &n, |b, n| {
-                b.iter_custom(|iters| {
-                    let buckets: Vec<f64> = (0..*n).map(|i| i as f64 * 100.0).collect();
-                    let histo = metrics::Histogram::with_opts(
-                        metrics::prometheus::HistogramOpts::new("name", "help")
-                            .buckets(buckets.clone()),
-                    )
-                    .unwrap();
-                    let start = Instant::now();
-                    for i in 0..usize::try_from(iters).unwrap() {
-                        histo.observe(buckets[i % buckets.len()]);
-                    }
-                    let elapsed = start.elapsed();
-                    // self-test
-                    let mfs = histo.collect();
-                    assert_eq!(mfs.len(), 1);
-                    let metrics = mfs[0].get_metric();
-                    assert_eq!(metrics.len(), 1);
-                    let histo = metrics[0].get_histogram();
-                    let buckets = histo.get_bucket();
-                    assert!(
-                        buckets
-                            .iter()
-                            .enumerate()
-                            .all(|(i, b)| b.get_cumulative_count()
-                                >= i as u64 * (iters / buckets.len() as u64))
-                    );
-                    elapsed
-                })
-            });
-        }
-    }
-}
-
 criterion_main!(
    label_values,
    single_metric_multicore_scalability,
-    propagation_of_cached_label_value,
-    histograms,
+    propagation_of_cached_label_value
 );

 /*
@@ -336,14 +290,6 @@ propagation_of_cached_label_value__naive/nthreads/8 time:   [211.50 ns 214.44 ns
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time:   [14.135 ns 14.147 ns 14.160 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time:   [14.243 ns 14.255 ns 14.268 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time:   [14.470 ns 14.682 ns 14.895 ns]
-bucket_scalability/nbuckets/1     time:   [30.352 ns 30.353 ns 30.354 ns]
-bucket_scalability/nbuckets/4     time:   [30.464 ns 30.465 ns 30.467 ns]
-bucket_scalability/nbuckets/8     time:   [30.569 ns 30.575 ns 30.584 ns]
-bucket_scalability/nbuckets/16      time:   [30.961 ns 30.965 ns 30.969 ns]
-bucket_scalability/nbuckets/32      time:   [35.691 ns 35.707 ns 35.722 ns]
-bucket_scalability/nbuckets/64      time:   [47.829 ns 47.898 ns 47.974 ns]
-bucket_scalability/nbuckets/128     time:   [73.479 ns 73.512 ns 73.545 ns]
-bucket_scalability/nbuckets/256     time:   [127.92 ns 127.94 ns 127.96 ns]

 Results on an i3en.3xlarge instance

@@ -398,14 +344,6 @@ propagation_of_cached_label_value__naive/nthreads/8     time:   [434.87 ns 456.4
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [3.3767 ns 3.3974 ns 3.4220 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [3.6105 ns 4.2355 ns 5.1463 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [4.0889 ns 4.9714 ns 6.0779 ns]
-bucket_scalability/nbuckets/1     time:   [4.8455 ns 4.8542 ns 4.8646 ns]
-bucket_scalability/nbuckets/4     time:   [4.5663 ns 4.5722 ns 4.5787 ns]
-bucket_scalability/nbuckets/8     time:   [4.5531 ns 4.5670 ns 4.5842 ns]
-bucket_scalability/nbuckets/16      time:   [4.6392 ns 4.6524 ns 4.6685 ns]
-bucket_scalability/nbuckets/32      time:   [6.0302 ns 6.0439 ns 6.0589 ns]
-bucket_scalability/nbuckets/64      time:   [10.608 ns 10.644 ns 10.691 ns]
-bucket_scalability/nbuckets/128     time:   [22.178 ns 22.316 ns 22.483 ns]
-bucket_scalability/nbuckets/256     time:   [42.190 ns 42.328 ns 42.492 ns]

 Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor

@@ -424,13 +362,5 @@ propagation_of_cached_label_value__naive/nthreads/8     time:   [164.24 ns 170.1
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [2.2915 ns 2.2960 ns 2.3012 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [2.5726 ns 2.6158 ns 2.6624 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [2.7068 ns 2.8243 ns 2.9824 ns]
-bucket_scalability/nbuckets/1     time:   [6.3998 ns 6.4288 ns 6.4684 ns]
-bucket_scalability/nbuckets/4     time:   [6.3603 ns 6.3620 ns 6.3637 ns]
-bucket_scalability/nbuckets/8     time:   [6.1646 ns 6.1654 ns 6.1667 ns]
-bucket_scalability/nbuckets/16      time:   [6.1341 ns 6.1391 ns 6.1454 ns]
-bucket_scalability/nbuckets/32      time:   [8.2206 ns 8.2254 ns 8.2301 ns]
-bucket_scalability/nbuckets/64      time:   [13.988 ns 13.994 ns 14.000 ns]
-bucket_scalability/nbuckets/128     time:   [28.180 ns 28.216 ns 28.251 ns]
-bucket_scalability/nbuckets/256     time:   [54.914 ns 54.931 ns 54.951 ns]

 */
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -9,6 +9,7 @@ bytes.workspace = true
 pageserver_api.workspace = true
 postgres_ffi.workspace = true
 prost.workspace = true
+smallvec.workspace = true
 thiserror.workspace = true
 tonic.workspace = true
 utils.workspace = true
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -9,16 +9,10 @@
 //! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
 //!
 //! - Validate protocol invariants, via try_from() and try_into().
-//!
-//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain
-//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and
-//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g.
-//! stream combinators without dealing with errors, and avoids validating the same message twice.
-
-use std::fmt::Display;

 use bytes::Bytes;
 use postgres_ffi::Oid;
+use smallvec::SmallVec;
 // TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
 // pulling in all of their other crate dependencies when building the client.
 use utils::lsn::Lsn;
@@ -54,8 +48,7 @@ pub struct ReadLsn {
    pub request_lsn: Lsn,
    /// If given, the caller guarantees that the page has not been modified since this LSN. Must be
    /// smaller than or equal to request_lsn. This allows the Pageserver to serve an old page
-    /// without waiting for the request LSN to arrive. If not given, the request will read at the
-    /// request_lsn and wait for it to arrive if necessary. Valid for all request types.
+    /// without waiting for the request LSN to arrive. Valid for all request types.
    ///
    /// It is undefined behaviour to make a request such that the page was, in fact, modified
    /// between request_lsn and not_modified_since_lsn. The Pageserver might detect it and return an
@@ -65,14 +58,19 @@ pub struct ReadLsn {
    pub not_modified_since_lsn: Option<Lsn>,
 }

-impl Display for ReadLsn {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let req_lsn = self.request_lsn;
-        if let Some(mod_lsn) = self.not_modified_since_lsn {
-            write!(f, "{req_lsn}>={mod_lsn}")
-        } else {
-            req_lsn.fmt(f)
+impl ReadLsn {
+    /// Checks that request_lsn is valid and not_modified_since_lsn does not exceed it.
+    pub fn validate(&self) -> Result<(), ProtocolError> {
+        if self.request_lsn == Lsn::INVALID {
+            return Err(ProtocolError::invalid("request_lsn", self.request_lsn));
        }
+        if self.not_modified_since_lsn > Some(self.request_lsn) {
+            return Err(ProtocolError::invalid(
+                "not_modified_since_lsn",
+                self.not_modified_since_lsn,
+            ));
+        }
+        Ok(())
    }
 }

@@ -80,31 +78,27 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
    type Error = ProtocolError;

    fn try_from(pb: proto::ReadLsn) -> Result<Self, Self::Error> {
-        if pb.request_lsn == 0 {
-            return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
-        }
-        if pb.not_modified_since_lsn > pb.request_lsn {
-            return Err(ProtocolError::invalid(
-                "not_modified_since_lsn",
-                pb.not_modified_since_lsn,
-            ));
-        }
-        Ok(Self {
+        let read_lsn = Self {
            request_lsn: Lsn(pb.request_lsn),
            not_modified_since_lsn: match pb.not_modified_since_lsn {
                0 => None,
                lsn => Some(Lsn(lsn)),
            },
-        })
+        };
+        read_lsn.validate()?;
+        Ok(read_lsn)
    }
 }

-impl From<ReadLsn> for proto::ReadLsn {
-    fn from(read_lsn: ReadLsn) -> Self {
-        Self {
+impl TryFrom<ReadLsn> for proto::ReadLsn {
+    type Error = ProtocolError;
+
+    fn try_from(read_lsn: ReadLsn) -> Result<Self, Self::Error> {
+        read_lsn.validate()?;
+        Ok(Self {
            request_lsn: read_lsn.request_lsn.0,
            not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0,
-        }
+        })
    }
 }

@@ -159,15 +153,6 @@ impl TryFrom<proto::CheckRelExistsRequest> for CheckRelExistsRequest {
    }
 }

-impl From<CheckRelExistsRequest> for proto::CheckRelExistsRequest {
-    fn from(request: CheckRelExistsRequest) -> Self {
-        Self {
-            read_lsn: Some(request.read_lsn.into()),
-            rel: Some(request.rel.into()),
-        }
-    }
-}
-
 pub type CheckRelExistsResponse = bool;

 impl From<proto::CheckRelExistsResponse> for CheckRelExistsResponse {
@@ -205,12 +190,14 @@ impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
    }
 }

-impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
-    fn from(request: GetBaseBackupRequest) -> Self {
-        Self {
-            read_lsn: Some(request.read_lsn.into()),
+impl TryFrom<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
+    type Error = ProtocolError;
+
+    fn try_from(request: GetBaseBackupRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: Some(request.read_lsn.try_into()?),
            replica: request.replica,
-        }
+        })
    }
 }

@@ -227,9 +214,14 @@ impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
    }
 }

-impl From<GetBaseBackupResponseChunk> for proto::GetBaseBackupResponseChunk {
-    fn from(chunk: GetBaseBackupResponseChunk) -> Self {
-        Self { chunk }
+impl TryFrom<GetBaseBackupResponseChunk> for proto::GetBaseBackupResponseChunk {
+    type Error = ProtocolError;
+
+    fn try_from(chunk: GetBaseBackupResponseChunk) -> Result<Self, Self::Error> {
+        if chunk.is_empty() {
+            return Err(ProtocolError::Missing("chunk"));
+        }
+        Ok(Self { chunk })
    }
 }

@@ -254,12 +246,14 @@ impl TryFrom<proto::GetDbSizeRequest> for GetDbSizeRequest {
    }
 }

-impl From<GetDbSizeRequest> for proto::GetDbSizeRequest {
-    fn from(request: GetDbSizeRequest) -> Self {
-        Self {
-            read_lsn: Some(request.read_lsn.into()),
+impl TryFrom<GetDbSizeRequest> for proto::GetDbSizeRequest {
+    type Error = ProtocolError;
+
+    fn try_from(request: GetDbSizeRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: Some(request.read_lsn.try_into()?),
            db_oid: request.db_oid,
-        }
+        })
    }
 }

@@ -294,7 +288,7 @@ pub struct GetPageRequest {
    /// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access
    /// costs and parallelizing them. This may increase the latency of any individual request, but
    /// improves the overall latency and throughput of the batch as a whole.
-    pub block_numbers: Vec<u32>,
+    pub block_numbers: SmallVec<[u32; 1]>,
 }

 impl TryFrom<proto::GetPageRequest> for GetPageRequest {
@@ -312,20 +306,25 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
                .ok_or(ProtocolError::Missing("read_lsn"))?
                .try_into()?,
            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
-            block_numbers: pb.block_number,
+            block_numbers: pb.block_number.into(),
        })
    }
 }

-impl From<GetPageRequest> for proto::GetPageRequest {
-    fn from(request: GetPageRequest) -> Self {
-        Self {
+impl TryFrom<GetPageRequest> for proto::GetPageRequest {
+    type Error = ProtocolError;
+
+    fn try_from(request: GetPageRequest) -> Result<Self, Self::Error> {
+        if request.block_numbers.is_empty() {
+            return Err(ProtocolError::Missing("block_number"));
+        }
+        Ok(Self {
            request_id: request.request_id,
            request_class: request.request_class.into(),
-            read_lsn: Some(request.read_lsn.into()),
+            read_lsn: Some(request.read_lsn.try_into()?),
            rel: Some(request.rel.into()),
-            block_number: request.block_numbers,
-        }
+            block_number: request.block_numbers.into_vec(),
+        })
    }
 }

@@ -397,7 +396,7 @@ pub struct GetPageResponse {
    /// A string describing the status, if any.
    pub reason: Option<String>,
    /// The 8KB page images, in the same order as the request. Empty if status != OK.
-    pub page_images: Vec<Bytes>,
+    pub page_images: SmallVec<[Bytes; 1]>,
 }

 impl From<proto::GetPageResponse> for GetPageResponse {
@@ -406,7 +405,7 @@ impl From<proto::GetPageResponse> for GetPageResponse {
            request_id: pb.request_id,
            status_code: pb.status_code.into(),
            reason: Some(pb.reason).filter(|r| !r.is_empty()),
-            page_images: pb.page_image,
+            page_images: pb.page_image.into(),
        }
    }
 }
@@ -417,7 +416,7 @@ impl From<GetPageResponse> for proto::GetPageResponse {
            request_id: response.request_id,
            status_code: response.status_code.into(),
            reason: response.reason.unwrap_or_default(),
-            page_image: response.page_images,
+            page_image: response.page_images.into_vec(),
        }
    }
 }
@@ -506,12 +505,14 @@ impl TryFrom<proto::GetRelSizeRequest> for GetRelSizeRequest {
    }
 }

-impl From<GetRelSizeRequest> for proto::GetRelSizeRequest {
-    fn from(request: GetRelSizeRequest) -> Self {
-        Self {
-            read_lsn: Some(request.read_lsn.into()),
+impl TryFrom<GetRelSizeRequest> for proto::GetRelSizeRequest {
+    type Error = ProtocolError;
+
+    fn try_from(request: GetRelSizeRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: Some(request.read_lsn.try_into()?),
            rel: Some(request.rel.into()),
-        }
+        })
    }
 }

@@ -554,13 +555,15 @@ impl TryFrom<proto::GetSlruSegmentRequest> for GetSlruSegmentRequest {
    }
 }

-impl From<GetSlruSegmentRequest> for proto::GetSlruSegmentRequest {
-    fn from(request: GetSlruSegmentRequest) -> Self {
-        Self {
-            read_lsn: Some(request.read_lsn.into()),
+impl TryFrom<GetSlruSegmentRequest> for proto::GetSlruSegmentRequest {
+    type Error = ProtocolError;
+
+    fn try_from(request: GetSlruSegmentRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: Some(request.read_lsn.try_into()?),
            kind: request.kind as u32,
            segno: request.segno,
-        }
+        })
    }
 }

@@ -577,9 +580,15 @@ impl TryFrom<proto::GetSlruSegmentResponse> for GetSlruSegmentResponse {
    }
 }

-impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
-    fn from(segment: GetSlruSegmentResponse) -> Self {
-        Self { segment }
+impl TryFrom<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
+    type Error = ProtocolError;
+
+    fn try_from(segment: GetSlruSegmentResponse) -> Result<Self, Self::Error> {
+        // TODO: can a segment legitimately be empty?
+        if segment.is_empty() {
+            return Err(ProtocolError::Missing("segment"));
+        }
+        Ok(Self { segment })
    }
 }

--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 futures.workspace = true
@@ -16,17 +15,14 @@ hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
 rand.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
 tokio.workspace = true
-tokio-stream.workspace = true
 tokio-util.workspace = true
-tonic.workspace = true

 pageserver_client.workspace = true
 pageserver_api.workspace = true
-pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -7,15 +7,11 @@ use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

 use anyhow::Context;
-use async_trait::async_trait;
 use camino::Utf8PathBuf;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::{
-    PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest,
-};
+use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::shard::TenantShardId;
-use pageserver_page_api::proto;
 use rand::prelude::*;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -26,12 +22,6 @@ use utils::lsn::Lsn;
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

-#[derive(clap::ValueEnum, Clone, Debug)]
-enum Protocol {
-    Libpq,
-    Grpc,
-}
-
 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
@@ -45,8 +35,6 @@ pub(crate) struct Args {
    num_clients: NonZeroUsize,
    #[clap(long)]
    runtime: Option<humantime::Duration>,
-    #[clap(long, value_enum, default_value = "libpq")]
-    protocol: Protocol,
    /// Each client sends requests at the given rate.
    ///
    /// If a request takes too long and we should be issuing a new request already,
@@ -315,20 +303,7 @@ async fn main_impl(
                .unwrap();

        Box::pin(async move {
-            let client: Box<dyn Client> = match args.protocol {
-                Protocol::Libpq => Box::new(
-                    LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
-                        .await
-                        .unwrap(),
-                ),
-
-                Protocol::Grpc => Box::new(
-                    GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
-                        .await
-                        .unwrap(),
-                ),
-            };
-            run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
+            client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
        })
    };

@@ -380,15 +355,23 @@ async fn main_impl(
    anyhow::Ok(())
 }

-async fn run_worker(
+async fn client_libpq(
    args: &Args,
-    mut client: Box<dyn Client>,
+    worker_id: WorkerId,
    shared_state: Arc<SharedState>,
    cancel: CancellationToken,
    rps_period: Option<Duration>,
    ranges: Vec<KeyRange>,
    weights: rand::distributions::weighted::WeightedIndex<i128>,
 ) {
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
+    let mut client = client
+        .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+        .await
+        .unwrap();
+
    shared_state.start_work_barrier.wait().await;
    let client_start = Instant::now();
    let mut ticks_processed = 0;
@@ -432,12 +415,12 @@ async fn run_worker(
                    blkno: block_no,
                }
            };
-            client.send_get_page(req).await.unwrap();
+            client.getpage_send(req).await.unwrap();
            inflight.push_back(start);
        }

        let start = inflight.pop_front().unwrap();
-        client.recv_get_page().await.unwrap();
+        client.getpage_recv().await.unwrap();
        let end = Instant::now();
        shared_state.live_stats.request_done();
        ticks_processed += 1;
@@ -459,104 +442,3 @@ async fn run_worker(
        }
    }
 }
-
-/// A benchmark client, to allow switching out the transport protocol.
-///
-/// For simplicity, this just uses separate asynchronous send/recv methods. The send method could
-/// return a future that resolves when the response is received, but we don't really need it.
-#[async_trait]
-trait Client: Send {
-    /// Sends an asynchronous GetPage request to the pageserver.
-    async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>;
-
-    /// Receives the next GetPage response from the pageserver.
-    async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse>;
-}
-
-/// A libpq-based Pageserver client.
-struct LibpqClient {
-    inner: pageserver_client::page_service::PagestreamClient,
-}
-
-impl LibpqClient {
-    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
-        let inner = pageserver_client::page_service::Client::new(connstring)
-            .await?
-            .pagestream(ttid.tenant_id, ttid.timeline_id)
-            .await?;
-        Ok(Self { inner })
-    }
-}
-
-#[async_trait]
-impl Client for LibpqClient {
-    async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
-        self.inner.getpage_send(req).await
-    }
-
-    async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
-        self.inner.getpage_recv().await
-    }
-}
-
-/// A gRPC client using the raw, no-frills gRPC client.
-struct GrpcClient {
-    req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
-    resp_rx: tonic::Streaming<proto::GetPageResponse>,
-}
-
-impl GrpcClient {
-    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
-        let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;
-
-        // The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
-        // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
-        // buffered by Tonic and the OS too.
-        let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
-        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
-        let mut req = tonic::Request::new(req_stream);
-        let metadata = req.metadata_mut();
-        metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
-        metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
-        metadata.insert("neon-shard-id", "0000".try_into()?);
-
-        let resp = client.get_pages(req).await?;
-        let resp_stream = resp.into_inner();
-
-        Ok(Self {
-            req_tx,
-            resp_rx: resp_stream,
-        })
-    }
-}
-
-#[async_trait]
-impl Client for GrpcClient {
-    async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
-        let req = proto::GetPageRequest {
-            request_id: 0,
-            request_class: proto::GetPageClass::Normal as i32,
-            read_lsn: Some(proto::ReadLsn {
-                request_lsn: req.hdr.request_lsn.0,
-                not_modified_since_lsn: req.hdr.not_modified_since.0,
-            }),
-            rel: Some(req.rel.into()),
-            block_number: vec![req.blkno],
-        };
-        self.req_tx.send(req).await?;
-        Ok(())
-    }
-
-    async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
-        let resp = self.resp_rx.message().await?.unwrap();
-        anyhow::ensure!(
-            resp.status_code == proto::GetPageStatusCode::Ok as i32,
-            "unexpected status code: {}",
-            resp.status_code
-        );
-        Ok(PagestreamGetPageResponse {
-            page: resp.page_image[0].clone(),
-            req: PagestreamGetPageRequest::default(), // dummy
-        })
-    }
-}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -65,30 +65,6 @@ impl From<GetVectoredError> for BasebackupError {
    }
 }

-impl From<BasebackupError> for postgres_backend::QueryError {
-    fn from(err: BasebackupError) -> Self {
-        use postgres_backend::QueryError;
-        use pq_proto::framed::ConnectionError;
-        match err {
-            BasebackupError::Client(err, _) => QueryError::Disconnected(ConnectionError::Io(err)),
-            BasebackupError::Server(err) => QueryError::Other(err),
-            BasebackupError::Shutdown => QueryError::Shutdown,
-        }
-    }
-}
-
-impl From<BasebackupError> for tonic::Status {
-    fn from(err: BasebackupError) -> Self {
-        use tonic::Code;
-        let code = match &err {
-            BasebackupError::Client(_, _) => Code::Cancelled,
-            BasebackupError::Server(_) => Code::Internal,
-            BasebackupError::Shutdown => Code::Unavailable,
-        };
-        tonic::Status::new(code, err.to_string())
-    }
-}
-
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -272,7 +248,7 @@ where
    async fn flush(&mut self) -> Result<(), BasebackupError> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
-        let segname = format!("{kind}/{segno:>04X}");
+        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar
            .append(&header, self.buf.as_slice())
@@ -371,7 +347,7 @@ where
                .await?
                .partition(
                    self.timeline.get_shard_identity(),
-                    self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
+                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
                );

            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -14,10 +14,7 @@ use std::time::Duration;
 use anyhow::{Context, bail, ensure};
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
-use pageserver_api::config::{
-    DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes,
-    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig,
-};
+use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes, PostHogConfig};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use pem::Pem;
@@ -188,9 +185,6 @@ pub struct PageServerConf {

    pub max_vectored_read_bytes: MaxVectoredReadBytes,

-    /// Maximum number of keys to be read in a single get_vectored call.
-    pub max_get_vectored_keys: MaxGetVectoredKeys,
-
    pub image_compression: ImageCompressionAlgorithm,

    /// Whether to offload archived timelines automatically
@@ -410,7 +404,6 @@ impl PageServerConf {
            secondary_download_concurrency,
            ingest_batch_size,
            max_vectored_read_bytes,
-            max_get_vectored_keys,
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
@@ -477,7 +470,6 @@ impl PageServerConf {
            secondary_download_concurrency,
            ingest_batch_size,
            max_vectored_read_bytes,
-            max_get_vectored_keys,
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
@@ -606,19 +598,6 @@ impl PageServerConf {
                )
            })?;

-        if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
-            max_batch_size,
-            ..
-        }) = conf.page_service_pipelining
-        {
-            if max_batch_size.get() > conf.max_get_vectored_keys.get() {
-                return Err(anyhow::anyhow!(
-                    "`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})",
-                    conf.max_get_vectored_keys.get()
-                ));
-            }
-        };
-
        Ok(conf)
    }

@@ -706,7 +685,6 @@ impl ConfigurableSemaphore {
 mod tests {

    use camino::Utf8PathBuf;
-    use rstest::rstest;
    use utils::id::NodeId;

    use super::PageServerConf;
@@ -746,28 +724,4 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect_err("parse_and_validate should fail for endpoint without scheme");
    }
-
-    #[rstest]
-    #[case(32, 32, true)]
-    #[case(64, 32, false)]
-    #[case(64, 64, true)]
-    #[case(128, 128, true)]
-    fn test_config_max_batch_size_is_valid(
-        #[case] max_batch_size: usize,
-        #[case] max_get_vectored_keys: usize,
-        #[case] is_valid: bool,
-    ) {
-        let input = format!(
-            r#"
-            control_plane_api = "http://localhost:6666"
-            max_get_vectored_keys = {max_get_vectored_keys}
-            page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }}
-        "#,
-        );
-        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
-            .expect("config has valid fields");
-        let workdir = Utf8PathBuf::from("/nonexistent");
-        let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
-        assert_eq!(result.is_ok(), is_valid);
-    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -837,30 +837,7 @@ async fn collect_eviction_candidates(
                continue;
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
-            debug!(
-                tenant_id=%tl.tenant_shard_id.tenant_id,
-                shard_id=%tl.tenant_shard_id.shard_slug(),
-                timeline_id=%tl.timeline_id,
-                "timeline resident layers count: {}", info.resident_layers.len()
-            );
-
-            tenant_candidates.extend(info.resident_layers.into_iter());
-            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
-
-            if cancel.is_cancelled() {
-                return Ok(EvictionCandidates::Cancelled);
-            }
-        }
-
-        // Also consider layers of timelines being imported for eviction
-        for tl in tenant.list_importing_timelines() {
-            let info = tl.timeline.get_local_layers_for_disk_usage_eviction().await;
-            debug!(
-                tenant_id=%tl.timeline.tenant_shard_id.tenant_id,
-                shard_id=%tl.timeline.tenant_shard_id.shard_slug(),
-                timeline_id=%tl.timeline.timeline_id,
-                "timeline resident layers count: {}", info.resident_layers.len()
-            );
+            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());

            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -91,14 +91,4 @@ impl FeatureResolver {
            ))
        }
    }
-
-    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
-        if let Some(inner) = &self.inner {
-            inner.feature_store().is_feature_flag_boolean(flag_key)
-        } else {
-            Err(PostHogEvaluationError::NotAvailable(
-                "PostHog integration is not enabled".to_string(),
-            ))
-        }
-    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3663,46 +3663,6 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
    Ok(())
 }

-async fn tenant_evaluate_feature_flag(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let flag: String = must_parse_query_param(&request, "flag")?;
-    let as_type: String = must_parse_query_param(&request, "as")?;
-
-    let state = get_state(&request);
-
-    async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-        if as_type == "boolean" {
-            let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
-            let result = result.map(|_| true).map_err(|e| e.to_string());
-            json_response(StatusCode::OK, result)
-        } else if as_type == "multivariate" {
-            let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
-            json_response(StatusCode::OK, result)
-        } else {
-            // Auto infer the type of the feature flag.
-            let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
-            if is_boolean {
-                let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
-                let result = result.map(|_| true).map_err(|e| e.to_string());
-                json_response(StatusCode::OK, result)
-            } else {
-                let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
-                json_response(StatusCode::OK, result)
-            }
-        }
-    }
-    .instrument(info_span!("tenant_evaluate_feature_flag", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
-    .await
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -4079,8 +4039,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
            |r| api_handler(r, activate_post_import_handler),
        )
-        .get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
-            api_handler(r, tenant_evaluate_feature_flag)
-        })
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -15,7 +15,6 @@ use metrics::{
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;
-use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS;
 use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
@@ -33,6 +32,7 @@ use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
 use crate::pgdatadir_mapping::DatadirModificationStats;
 use crate::task_mgr::TaskKind;
+use crate::tenant::Timeline;
 use crate::tenant::layer_map::LayerMap;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
@@ -1312,44 +1312,11 @@ impl EvictionsWithLowResidenceDuration {
 //
 // Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.00005,  // 50us
-    0.00006,  // 60us
-    0.00007,  // 70us
-    0.00008,  // 80us
-    0.00009,  // 90us
-    0.0001,   // 100us
-    0.000110, // 110us
-    0.000120, // 120us
-    0.000130, // 130us
-    0.000140, // 140us
-    0.000150, // 150us
-    0.000160, // 160us
-    0.000170, // 170us
-    0.000180, // 180us
-    0.000190, // 190us
-    0.000200, // 200us
-    0.000210, // 210us
-    0.000220, // 220us
-    0.000230, // 230us
-    0.000240, // 240us
-    0.000250, // 250us
-    0.000300, // 300us
-    0.000350, // 350us
-    0.000400, // 400us
-    0.000450, // 450us
-    0.000500, // 500us
-    0.000600, // 600us
-    0.000700, // 700us
-    0.000800, // 800us
-    0.000900, // 900us
-    0.001000, // 1ms
-    0.002000, // 2ms
-    0.003000, // 3ms
-    0.004000, // 4ms
-    0.005000, // 5ms
-    0.01000,  // 10ms
-    0.02000,  // 20ms
-    0.05000,  // 50ms
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
+    30.000,   // 30000 ms
 ];

 /// VirtualFile fs operation variants.
@@ -1939,7 +1906,7 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 });

 static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
-    (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap())
+    (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
        .map(|v| v.into())
        .collect()
 });
@@ -1957,7 +1924,7 @@ static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(
    let mut buckets = Vec::new();
    for i in 0.. {
        let bucket = 1 << i;
-        if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() {
+        if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
            break;
        }
        buckets.push(bucket.into());
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -12,9 +12,9 @@ use std::task::{Context, Poll};
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

-use anyhow::{Context as _, anyhow, bail};
+use anyhow::{Context as _, bail};
 use async_compression::tokio::write::GzipEncoder;
-use bytes::{Buf, BytesMut};
+use bytes::Buf;
 use futures::future::BoxFuture;
 use futures::{FutureExt, Stream};
 use itertools::Itertools;
@@ -45,12 +45,13 @@ use pq_proto::framed::ConnectionError;
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
 use smallvec::{SmallVec, smallvec};
 use strum_macros::IntoStaticStr;
-use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, BufWriter};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tonic::service::Interceptor as _;
 use tracing::*;
 use utils::auth::{Claims, Scope, SwappableJwtAuth};
+use utils::failpoint_support;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::logging::log_slow;
 use utils::lsn::Lsn;
@@ -58,7 +59,6 @@ use utils::shard::ShardIndex;
 use utils::simple_rcu::RcuReadGuard;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;
-use utils::{failpoint_support, span_record};

 use crate::auth::check_permission;
 use crate::basebackup::{self, BasebackupError};
@@ -81,8 +81,8 @@ use crate::tenant::mgr::{
    GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager,
 };
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::handle::{Handle, HandleUpgradeError, WeakHandle};
-use crate::tenant::timeline::{self, WaitLsnError, WaitLsnTimeout, WaitLsnWaiter};
+use crate::tenant::timeline::handle::{HandleUpgradeError, WeakHandle};
+use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
 use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation};

@@ -117,6 +117,22 @@ const GRPC_MAX_CONCURRENT_STREAMS: u32 = 256;

 ///////////////////////////////////////////////////////////////////////////////

+/// Records all of the given fields in the current span, as a single call. The fields must already
+/// have been declared for the span with empty values.
+macro_rules! span_record {
+    ($($tokens:tt)*) => {span_record_in!(::tracing::Span::current(); $($tokens)*)};
+}
+
+/// Records all of the given fields in the given span, as a single call. The fields must already
+/// have been declared for the span with empty values.
+macro_rules! span_record_in {
+    ($span:expr; $($tokens:tt)*) => {
+        if let Some(meta) = $span.metadata() {
+            $span.record_all(&tracing::valueset!(meta.fields(), $($tokens)*));
+        }
+    };
+}
+
 pub struct Listener {
    cancel: CancellationToken,
    /// Cancel the listener task through `listen_cancel` to shut down the listener
@@ -200,6 +216,7 @@ pub fn spawn_grpc(
    // Set up the gRPC server.
    //
    // TODO: consider tuning window sizes.
+    // TODO: wire up tracing.
    let mut server = tonic::transport::Server::builder()
        .http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
        .http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
@@ -210,7 +227,7 @@ pub fn spawn_grpc(
    // * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
    //
    // * Layers: allow async code, can run code after the service response. However, only has access
-    //   to the raw HTTP request/response, not the gRPC types.
+    //   to the raw HTTP request/response.
    let page_service_handler = GrpcPageServiceHandler {
        tenant_manager,
        ctx,
@@ -221,17 +238,21 @@ pub fn spawn_grpc(
    let mut auth_interceptor = TenantAuthInterceptor::new(auth);

    let page_service = tower::ServiceBuilder::new()
-        // Create tracing span and record request start time.
+        // Create tracing span and start timing.
        .layer(observability_layer)
        // Intercept gRPC requests.
-        .layer(tonic::service::InterceptorLayer::new(move |mut req| {
-            // Extract tenant metadata.
-            req = tenant_interceptor.call(req)?;
-            // Authenticate tenant JWT token.
-            req = auth_interceptor.call(req)?;
-            Ok(req)
-        }))
+        .layer(tonic::service::InterceptorLayer::new(
+            move |mut req: tonic::Request<()>| {
+                // Extract tenant metadata.
+                req = tenant_interceptor.call(req)?;
+                // Authenticate tenant JWT token.
+                req = auth_interceptor.call(req)?;
+                Ok(req)
+            },
+        ))
+        // TODO(review): no layer here obtains the timeline handle yet; add one or drop this note.
        .service(proto::PageServiceServer::new(page_service_handler));
+
    let server = server.add_service(page_service);

    // Reflection service for use with e.g. grpcurl.
@@ -550,7 +571,7 @@ impl TimelineHandles {
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
-    ) -> Result<Handle<TenantManagerTypes>, GetActiveTimelineError> {
+    ) -> Result<timeline::handle::Handle<TenantManagerTypes>, GetActiveTimelineError> {
        if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id {
            return Err(GetActiveTimelineError::Tenant(
                GetActiveTenantError::SwitchedTenant,
@@ -765,7 +786,7 @@ impl PageStreamError {
            request_id,
            status_code,
            reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
+            page_images: SmallVec::new(),
        }
        .into())
    }
@@ -774,22 +795,29 @@ impl PageStreamError {
 impl From<PageStreamError> for tonic::Status {
+    /// Converts a page stream error into a gRPC status. The error variant selects the status
+    /// code, and the error's Display output becomes the status message.
    fn from(err: PageStreamError) -> Self {
        use tonic::Code;
-        let message = err.to_string();
-        let code = match err {
+        // Match by reference so that `err` is still available for the message below.
+        let code = match &err {
            PageStreamError::Reconnect(_) => Code::Unavailable,
            PageStreamError::Shutdown => Code::Unavailable,
            PageStreamError::Read(err) => match err {
                PageReconstructError::Cancelled => Code::Unavailable,
                PageReconstructError::MissingKey(_) => Code::NotFound,
-                PageReconstructError::AncestorLsnTimeout(err) => tonic::Status::from(err).code(),
+                PageReconstructError::AncestorLsnTimeout(err) => match err {
+                    WaitLsnError::Timeout(_) => Code::Internal,
+                    WaitLsnError::BadState(_) => Code::Internal,
+                    WaitLsnError::Shutdown => Code::Unavailable,
+                },
                PageReconstructError::Other(_) => Code::Internal,
                PageReconstructError::WalRedo(_) => Code::Internal,
            },
-            PageStreamError::LsnTimeout(err) => tonic::Status::from(err).code(),
+            PageStreamError::LsnTimeout(err) => match err {
+                WaitLsnError::Timeout(_) => Code::Internal,
+                WaitLsnError::BadState(_) => Code::Internal,
+                WaitLsnError::Shutdown => Code::Unavailable,
+            },
            PageStreamError::NotFound(_) => Code::NotFound,
            PageStreamError::BadRequest(_) => Code::InvalidArgument,
        };
-        tonic::Status::new(code, message)
+        tonic::Status::new(code, err.to_string())
    }
 }

@@ -816,6 +844,22 @@ impl From<GetActiveTimelineError> for PageStreamError {
    }
 }

+/// Maps a failure to obtain an active tenant onto the QueryError variants.
+impl From<GetActiveTenantError> for QueryError {
+    fn from(e: GetActiveTenantError) -> Self {
+        match e {
+            // A timed-out wait is reported as a timed-out I/O error on the connection.
+            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
+                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
+            ),
+            GetActiveTenantError::Cancelled
+            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                QueryError::Shutdown
+            }
+            e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(e.to_string().into()),
+            // Every remaining variant is wrapped as an opaque error.
+            e => QueryError::Other(anyhow::anyhow!(e)),
+        }
+    }
+}
+
 impl From<WaitLsnError> for PageStreamError {
    fn from(value: WaitLsnError) -> Self {
        match value {
@@ -1425,7 +1469,7 @@ impl PageServerHandler {

    /// Starts a SmgrOpTimer at received_at and throttles the request.
    async fn record_op_start_and_throttle(
-        shard: &Handle<TenantManagerTypes>,
+        shard: &timeline::handle::Handle<TenantManagerTypes>,
        op: metrics::SmgrQueryType,
        received_at: Instant,
    ) -> Result<SmgrOpTimer, QueryError> {
@@ -2709,6 +2753,15 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        /// Converts a BasebackupError into the corresponding QueryError.
+        fn map_basebackup_error(err: BasebackupError) -> QueryError {
+            match err {
+                BasebackupError::Shutdown => QueryError::Shutdown,
+                BasebackupError::Server(cause) => QueryError::Other(cause),
+                // The second field (the error site) is dropped here.
+                // TODO: passthrough the error site to the final error message?
+                BasebackupError::Client(io_err, _site) => {
+                    let conn_err = ConnectionError::Io(io_err);
+                    QueryError::Disconnected(conn_err)
+                }
+            }
+        }
+
        let started = std::time::Instant::now();

        let timeline = self
@@ -2766,7 +2819,8 @@ impl PageServerHandler {
                replica,
                &ctx,
            )
-            .await?;
+            .await
+            .map_err(map_basebackup_error)?;
        } else {
            let mut writer = BufWriter::new(pgb.copyout_writer());

@@ -2789,8 +2843,11 @@ impl PageServerHandler {
                from_cache = true;
                tokio::io::copy(&mut cached, &mut writer)
                    .await
-                    .map_err(|err| {
-                        BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
+                    .map_err(|e| {
+                        map_basebackup_error(BasebackupError::Client(
+                            e,
+                            "handle_basebackup_request,cached,copy",
+                        ))
                    })?;
            } else if gzip {
                let mut encoder = GzipEncoder::with_quality(
@@ -2811,7 +2868,8 @@ impl PageServerHandler {
                    replica,
                    &ctx,
                )
-                .await?;
+                .await
+                .map_err(map_basebackup_error)?;
                // shutdown the encoder to ensure the gzip footer is written
                encoder
                    .shutdown()
@@ -2827,12 +2885,15 @@ impl PageServerHandler {
                    replica,
                    &ctx,
                )
-                .await?;
-            }
-            writer
-                .flush()
                .await
-                .map_err(|err| BasebackupError::Client(err, "handle_basebackup_request,flush"))?;
+                .map_err(map_basebackup_error)?;
+            }
+            writer.flush().await.map_err(|e| {
+                map_basebackup_error(BasebackupError::Client(
+                    e,
+                    "handle_basebackup_request,flush",
+                ))
+            })?;
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)
@@ -3358,6 +3419,7 @@ where

 /// Serves the page service over gRPC. Dispatches to PageServerHandler for request processing.
 ///
+/// TODO: add trace spans, interceptors, and sampling.
 /// TODO: rename to PageServiceHandler when libpq impl is removed.
 pub struct GrpcPageServiceHandler {
    tenant_manager: Arc<TenantManager>,
@@ -3366,10 +3428,12 @@ pub struct GrpcPageServiceHandler {

 impl GrpcPageServiceHandler {
    /// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
-    /// relations and their sizes, as well as SLRU segments and similar data.
+    /// relations and their sizes, as well as SLRU segments and other data.
+    ///
+    /// TODO: take the timeline handle instead.
    #[allow(clippy::result_large_err)]
-    fn ensure_shard_zero(timeline: &Handle<TenantManagerTypes>) -> Result<(), tonic::Status> {
-        match timeline.get_shard_index().shard_number.0 {
+    fn ensure_shard_zero(req: &tonic::Request<impl Any>) -> Result<(), tonic::Status> {
+        match extract::<ShardIndex>(req).shard_number.0 {
            0 => Ok(()),
            shard => Err(tonic::Status::invalid_argument(format!(
                "request must execute on shard zero (is shard {shard})",
@@ -3382,31 +3446,22 @@ impl GrpcPageServiceHandler {
        PagestreamRequest {
            reqid: req_id,
            request_lsn: read_lsn.request_lsn,
-            not_modified_since: read_lsn
-                .not_modified_since_lsn
-                .unwrap_or(read_lsn.request_lsn),
+            // An absent not_modified_since LSN falls back to the request LSN itself, not to the
+            // default LSN value that unwrap_or_default() would substitute.
+            not_modified_since: read_lsn
+                .not_modified_since_lsn
+                .unwrap_or(read_lsn.request_lsn),
        }
    }

-    /// Acquires a timeline handle for the given request.
-    ///
-    /// TODO: during shard splits, the compute may still be sending requests to the parent shard
-    /// until the entire split is committed and the compute is notified. Consider installing a
-    /// temporary shard router from the parent to the children while the split is in progress.
-    ///
-    /// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage
-    /// the TimelineHandles lifecycle.
-    ///
-    /// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid
-    /// the unnecessary overhead.
+    /// Acquires a timeline handle for the given request. The request must have been decorated by
+    /// TenantMetadataInterceptor first.
    async fn get_request_timeline(
        &self,
        req: &tonic::Request<impl Any>,
-    ) -> Result<Handle<TenantManagerTypes>, GetActiveTimelineError> {
+    ) -> Result<timeline::handle::Handle<TenantManagerTypes>, GetActiveTimelineError> {
        let ttid = *extract::<TenantTimelineId>(req);
        let shard_index = *extract::<ShardIndex>(req);
        let shard_selector = ShardSelector::Known(shard_index);

+        // TODO: untangle this from TenantManagerWrapper::resolve() and Cache::get(), to avoid the
+        // unnecessary overhead.
        TimelineHandles::new(self.tenant_manager.clone())
            .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
            .await
@@ -3415,10 +3470,10 @@ impl GrpcPageServiceHandler {
    /// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
    /// Only errors if the timeline is shutting down.
    ///
-    /// TODO: move timer construction to ObservabilityLayer (see TODO there).
-    /// TODO: decouple rate limiting (middleware?), and return SlowDown errors instead.
+    /// TODO: revamp request timers -- in particular, move timer construction to ObservabilityLayer.
+    /// TODO: consider moving throttling out and returning SlowDown errors.
    async fn record_op_start_and_throttle(
-        timeline: &Handle<TenantManagerTypes>,
+        timeline: &timeline::handle::Handle<TenantManagerTypes>,
        op: metrics::SmgrQueryType,
        received_at: Instant,
    ) -> Result<SmgrOpTimer, tonic::Status> {
@@ -3437,11 +3492,7 @@ impl GrpcPageServiceHandler {
    ///
    /// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse
    /// with an appropriate status code instead.
-    ///
-    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
-    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
-    /// split them up in the client or server.
-    #[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
+    #[instrument(skip_all, fields(req_id, rel, blkno))]
    async fn get_page(
        ctx: &RequestContext,
        timeline: &WeakHandle<TenantManagerTypes>,
@@ -3449,29 +3500,24 @@ impl GrpcPageServiceHandler {
        io_concurrency: IoConcurrency,
    ) -> Result<proto::GetPageResponse, tonic::Status> {
        let received_at = Instant::now();
-        let timeline = timeline.upgrade()?;
-        let ctx = ctx.with_scope_page_service_pagestream(&timeline);
+        let timeline = timeline.upgrade().map_err(|err| match err {
+            HandleUpgradeError::ShutDown => tonic::Status::unavailable("timeline is shutting down"),
+        })?;

-        // Validate the request, decorate the span, and convert it to a Pagestream request.
+        // Validate the request and convert it to a Pagestream request.
        let req: page_api::GetPageRequest = req.try_into()?;

-        span_record!(
-            req_id = %req.request_id,
-            rel = %req.rel,
-            blkno = %req.block_numbers[0],
-            blks = %req.block_numbers.len(),
-            lsn = %req.read_lsn,
-        );
+        span_record!(req_id = %req.request_id, rel = %req.rel, blkno = %req.block_numbers[0]);

-        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
+        let ctx = ctx.with_scope_page_service_pagestream(&timeline);
        let effective_lsn = match PageServerHandler::effective_request_lsn(
            &timeline,
            timeline.get_last_record_lsn(),
            req.read_lsn.request_lsn,
-            req.read_lsn
-                .not_modified_since_lsn
-                .unwrap_or(req.read_lsn.request_lsn),
-            &latest_gc_cutoff_lsn,
+            // An absent not_modified_since LSN falls back to the request LSN itself, not to the
+            // default LSN value that unwrap_or_default() would substitute.
+            req.read_lsn
+                .not_modified_since_lsn
+                .unwrap_or(req.read_lsn.request_lsn),
+            &timeline.get_applied_gc_cutoff_lsn(),
        ) {
            Ok(lsn) => lsn,
            Err(err) => return err.into_get_page_response(req.request_id),
@@ -3505,9 +3551,6 @@ impl GrpcPageServiceHandler {
            });
        }

-        // TODO: this does a relation size query for every page in the batch. Since this batch is
-        // all for one relation, we could do this only once. However, this is not the case for the
-        // libpq implementation.
        let results = PageServerHandler::handle_get_page_at_lsn_request_batched(
            &timeline,
            batch,
@@ -3521,7 +3564,7 @@ impl GrpcPageServiceHandler {
            request_id: req.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            reason: None,
-            page_images: Vec::with_capacity(results.len()),
+            page_images: SmallVec::with_capacity(results.len()),
        };

        for result in results {
@@ -3542,8 +3585,8 @@ impl GrpcPageServiceHandler {

 /// Implements the gRPC page service.
 ///
-/// TODO: cancellation.
-/// TODO: when the libpq impl is removed, remove the Pagestream types and inline the handler code.
+/// TODO: when libpq impl is removed, remove intermediate Pagestream types and inline the handlers.
+/// TODO: Tower middleware for timeline handle, rate limiting, and timing.
 #[tonic::async_trait]
 impl proto::PageService for GrpcPageServiceHandler {
    type GetBaseBackupStream = Pin<
@@ -3553,7 +3596,7 @@ impl proto::PageService for GrpcPageServiceHandler {
    type GetPagesStream =
        Pin<Box<dyn Stream<Item = Result<proto::GetPageResponse, tonic::Status>> + Send>>;

-    #[instrument(skip_all, fields(rel, lsn))]
+    #[instrument(skip_all, fields(rel, req_lsn))]
    async fn check_rel_exists(
        &self,
        req: tonic::Request<proto::CheckRelExistsRequest>,
@@ -3562,11 +3605,11 @@ impl proto::PageService for GrpcPageServiceHandler {
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

-        // Validate the request, decorate the span, and convert it to a Pagestream request.
-        Self::ensure_shard_zero(&timeline)?;
-        let req: page_api::CheckRelExistsRequest = req.into_inner().try_into()?;
+        // Validate the request and convert it to a Pagestream request.
+        Self::ensure_shard_zero(&req)?;

-        span_record!(rel=%req.rel, lsn=%req.read_lsn);
+        let req: page_api::CheckRelExistsRequest = req.into_inner().try_into()?;
+        span_record!(rel = %req.rel, req_lsn = %req.read_lsn.request_lsn);

        let req = PagestreamExistsRequest {
            hdr: Self::make_hdr(req.read_lsn, 0),
@@ -3586,94 +3629,14 @@ impl proto::PageService for GrpcPageServiceHandler {
        Ok(tonic::Response::new(resp.into()))
    }

-    // TODO: ensure clients use gzip compression for the stream.
-    #[instrument(skip_all, fields(lsn))]
+    #[instrument(skip_all)]
    async fn get_base_backup(
        &self,
-        req: tonic::Request<proto::GetBaseBackupRequest>,
+        _: tonic::Request<proto::GetBaseBackupRequest>,
    ) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
-        // Send 64 KB chunks to avoid large memory allocations.
-        const CHUNK_SIZE: usize = 64 * 1024;
-
-        let timeline = self.get_request_timeline(&req).await?;
-        let ctx = self.ctx.with_scope_timeline(&timeline);
-
-        // Validate the request, decorate the span, and wait for the LSN to arrive.
-        //
-        // TODO: this requires a read LSN, is that ok?
-        Self::ensure_shard_zero(&timeline)?;
-        if timeline.is_archived() == Some(true) {
-            return Err(tonic::Status::failed_precondition("timeline is archived"));
-        }
-        let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
-
-        span_record!(lsn=%req.read_lsn);
-
-        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
-        timeline
-            .wait_lsn(
-                req.read_lsn.request_lsn,
-                WaitLsnWaiter::PageService,
-                WaitLsnTimeout::Default,
-                &ctx,
-            )
-            .await?;
-        timeline
-            .check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn)
-            .map_err(|err| {
-                tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
-            })?;
-
-        // Spawn a task to run the basebackup.
-        //
-        // TODO: do we need to support full base backups, for debugging?
-        let span = Span::current();
-        let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
-        let jh = tokio::spawn(async move {
-            let result = basebackup::send_basebackup_tarball(
-                &mut simplex_write,
-                &timeline,
-                Some(req.read_lsn.request_lsn),
-                None,
-                false,
-                req.replica,
-                &ctx,
-            )
-            .instrument(span) // propagate request span
-            .await;
-            simplex_write.shutdown().await.map_err(|err| {
-                BasebackupError::Server(anyhow!("simplex shutdown failed: {err}"))
-            })?;
-            result
-        });
-
-        // Emit chunks of size CHUNK_SIZE.
-        let chunks = async_stream::try_stream! {
-            let mut chunk = BytesMut::with_capacity(CHUNK_SIZE);
-            loop {
-                let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
-                    tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
-                })?;
-
-                // If we read 0 bytes, either the chunk is full or the stream is closed.
-                if n == 0 {
-                    if chunk.is_empty() {
-                        break;
-                    }
-                    yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze());
-                    chunk.clear();
-                }
-            }
-            // Wait for the basebackup task to exit and check for errors.
-            jh.await.map_err(|err| {
-                tonic::Status::internal(format!("basebackup failed: {err}"))
-            })??;
-        };
-
-        Ok(tonic::Response::new(Box::pin(chunks)))
+        Err(tonic::Status::unimplemented("not implemented")) // TODO
    }

-    #[instrument(skip_all, fields(db_oid, lsn))]
    async fn get_db_size(
        &self,
        req: tonic::Request<proto::GetDbSizeRequest>,
@@ -3682,12 +3645,9 @@ impl proto::PageService for GrpcPageServiceHandler {
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

-        // Validate the request, decorate the span, and convert it to a Pagestream request.
-        Self::ensure_shard_zero(&timeline)?;
+        // Validate the request and convert it to a Pagestream request.
+        Self::ensure_shard_zero(&req)?;
        let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?;
-
-        span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
-
        let req = PagestreamDbSizeRequest {
            hdr: Self::make_hdr(req.read_lsn, 0),
            dbnode: req.db_oid,
@@ -3706,7 +3666,6 @@ impl proto::PageService for GrpcPageServiceHandler {
        Ok(tonic::Response::new(resp.into()))
    }

-    // NB: don't instrument this, instrument each streamed request.
    async fn get_pages(
        &self,
        req: tonic::Request<tonic::Streaming<proto::GetPageRequest>>,
@@ -3721,8 +3680,8 @@ impl proto::PageService for GrpcPageServiceHandler {
            .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
            .await?;

-        let span = Span::current();
        let ctx = self.ctx.attached_child();
+        let span = tracing::Span::current(); // propagate span into the stream future
        let mut reqs = req.into_inner();

        let resps = async_stream::try_stream! {
@@ -3732,16 +3691,20 @@ impl proto::PageService for GrpcPageServiceHandler {
                .downgrade();
            while let Some(req) = reqs.message().await? {
                // TODO: implement IoConcurrency sidecar.
-                yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential)
-                    .instrument(span.clone()) // propagate request span
-                    .await?
+                yield Self::get_page(
+                    &ctx,
+                    &timeline,
+                    req,
+                    IoConcurrency::Sequential,
+                )
+                .instrument(span.clone())
+                .await?
            }
        };

        Ok(tonic::Response::new(Box::pin(resps)))
    }

-    #[instrument(skip_all, fields(rel, lsn))]
    async fn get_rel_size(
        &self,
        req: tonic::Request<proto::GetRelSizeRequest>,
@@ -3750,12 +3713,9 @@ impl proto::PageService for GrpcPageServiceHandler {
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

-        // Validate the request, decorate the span, and convert it to a Pagestream request.
-        Self::ensure_shard_zero(&timeline)?;
+        // Validate the request and convert it to a Pagestream request.
+        Self::ensure_shard_zero(&req)?;
        let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?;
-
-        span_record!(rel=%req.rel, lsn=%req.read_lsn);
-
        let req = PagestreamNblocksRequest {
            hdr: Self::make_hdr(req.read_lsn, 0),
            rel: req.rel,
@@ -3774,7 +3734,6 @@ impl proto::PageService for GrpcPageServiceHandler {
        Ok(tonic::Response::new(resp.into()))
    }

-    #[instrument(skip_all, fields(kind, segno, lsn))]
    async fn get_slru_segment(
        &self,
        req: tonic::Request<proto::GetSlruSegmentRequest>,
@@ -3783,12 +3742,9 @@ impl proto::PageService for GrpcPageServiceHandler {
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);

-        // Validate the request, decorate the span, and convert it to a Pagestream request.
-        Self::ensure_shard_zero(&timeline)?;
+        // Validate the request and convert it to a Pagestream request.
+        Self::ensure_shard_zero(&req)?;
        let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?;
-
-        span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
-
        let req = PagestreamGetSlruSegmentRequest {
            hdr: Self::make_hdr(req.read_lsn, 0),
            kind: req.kind as u8,
@@ -3806,42 +3762,40 @@ impl proto::PageService for GrpcPageServiceHandler {
        let resp =
            PageServerHandler::handle_get_slru_segment_request(&timeline, &req, &ctx).await?;
        let resp: page_api::GetSlruSegmentResponse = resp.segment;
-        Ok(tonic::Response::new(resp.into()))
+        Ok(tonic::Response::new(resp.try_into()?))
    }
 }

-/// gRPC middleware layer that handles observability concerns:
+/// Returns the request extension of type `T`. Panics if no such extension was attached.
+fn extract<T: Send + Sync + 'static>(req: &tonic::Request<impl Any>) -> &T {
+    req.extensions().get::<T>().unwrap_or_else(|| {
+        // Only reached when nothing upstream stored a `T` on the request.
+        let name = std::any::type_name::<T>();
+        panic!("extension {name} should be set by interceptor or layer");
+    })
+}
+
+/// gRPC layer that handles observability concerns:
 ///
-/// * Creates and enters a tracing span.
-/// * Records the request start time as a ReceivedAt request extension.
-///
-/// TODO: add perf tracing.
-/// TODO: add timing and metrics.
-/// TODO: add logging.
+/// * Records the initial request timestamp as a ReceivedAt extension.
+/// * Creates a tracing span with request metadata and instruments the future.
 #[derive(Clone)]
 struct ObservabilityLayer;

 impl<S: tonic::server::NamedService> tower::Layer<S> for ObservabilityLayer {
-    type Service = ObservabilityLayerService<S>;
+    type Service = ObservabilityService<S>;

    fn layer(&self, inner: S) -> Self::Service {
-        Self::Service { inner }
+        ObservabilityService { inner }
    }
 }

 #[derive(Clone)]
-struct ObservabilityLayerService<S> {
+struct ObservabilityService<S> {
    inner: S,
 }

-#[derive(Clone, Copy)]
-struct ReceivedAt(Instant);
-
-impl<S: tonic::server::NamedService> tonic::server::NamedService for ObservabilityLayerService<S> {
-    const NAME: &'static str = S::NAME; // propagate inner service name
-}
-
-impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
+impl<S, B> tower::Service<http::Request<B>> for ObservabilityService<S>
 where
    S: tower::Service<http::Request<B>>,
    S::Future: Send + 'static,
@@ -3851,23 +3805,16 @@ where
    type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;

    fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
-        // Record the request start time as a request extension.
-        //
-        // TODO: we should start a timer here instead, but it currently requires a timeline handle
-        // and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
+        // Stash the request timestamp as a ReceivedAt extension.
+        // TODO: start a timer here instead.
        req.extensions_mut().insert(ReceivedAt(Instant::now()));

        // Create a basic tracing span. Enter the span for the current thread (to use it for inner
-        // sync code like interceptors), and instrument the future (to use it for inner async code
-        // like the page service itself).
-        //
-        // The instrument() call below is not sufficient. It only affects the returned future, and
-        // only takes effect when the caller polls it. Any sync code executed when we call
-        // self.inner.call() below (such as interceptors) runs outside of the returned future, and
-        // is not affected by it. We therefore have to enter the span on the current thread too.
+        // non-async code like interceptors), and instrument the future (to use it for inner async
+        // code like the service itself).
        let span = info_span!(
            "grpc:pageservice",
-            // Set by TenantMetadataInterceptor.
+            // These are set later by TenantMetadataInterceptor.
            tenant_id = field::Empty,
            timeline_id = field::Empty,
            shard_id = field::Empty,
@@ -3882,7 +3829,14 @@ where
    }
 }

-/// gRPC interceptor that decodes tenant metadata and stores it as request extensions of type
+impl<S: tonic::server::NamedService> tonic::server::NamedService for ObservabilityService<S> {
+    const NAME: &'static str = S::NAME; // propagate service name
+}
+
+/// Timestamp that ObservabilityService stores as a request extension when it receives a request.
+/// Wraps a single Instant, so it is Copy as well as Clone.
+#[derive(Clone, Copy)]
+struct ReceivedAt(Instant);
+
+/// gRPC interceptor that decodes tenant metadata and stores it as extensions of type
 /// TenantTimelineId and ShardIndex.
 #[derive(Clone)]
 struct TenantMetadataInterceptor;
@@ -3910,23 +3864,33 @@ impl tonic::service::Interceptor for TenantMetadataInterceptor {
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id"))?;

        // Decode the shard ID.
-        let shard_id = req
+        let shard_index = req
            .metadata()
            .get("neon-shard-id")
            .ok_or_else(|| tonic::Status::invalid_argument("missing neon-shard-id"))?
            .to_str()
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-shard-id"))?;
-        let shard_id = ShardIndex::from_str(shard_id)
+        let shard_index = ShardIndex::from_str(shard_index)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-shard-id"))?;

        // Stash them in the request.
        let extensions = req.extensions_mut();
        extensions.insert(TenantTimelineId::new(tenant_id, timeline_id));
-        extensions.insert(shard_id);
+        extensions.insert(shard_index);
+
+        // Decorate the tracing span. This doesn't run in an async context, so it can't use
+        // tracing::Span::current().
+        let tsid = TenantShardId {
+            tenant_id,
+            shard_number: shard_index.shard_number,
+            shard_count: shard_index.shard_count,
+        };
+        let shard_id = tsid.shard_slug();

-        // Decorate the tracing span.
        span_record!(%tenant_id, %timeline_id, %shard_id);

        Ok(req)
    }
 }
@@ -3950,7 +3914,6 @@ impl tonic::service::Interceptor for TenantAuthInterceptor {
            return Ok(req);
        };

-        // Fetch the tenant ID from the request extensions (set by TenantMetadataInterceptor).
        let TenantTimelineId { tenant_id, .. } = *extract::<TenantTimelineId>(&req);

        // Fetch and decode the JWT token.
@@ -3978,21 +3941,6 @@ impl tonic::service::Interceptor for TenantAuthInterceptor {
    }
 }

-/// Extracts the given type from the request extensions, or panics if it is missing.
-fn extract<T: Send + Sync + 'static>(req: &tonic::Request<impl Any>) -> &T {
-    extract_from(req.extensions())
-}
-
-/// Extract the given type from the request extensions, or panics if it is missing. This variant
-/// can extract both from a tonic::Request and http::Request.
-fn extract_from<T: Send + Sync + 'static>(ext: &http::Extensions) -> &T {
-    let Some(value) = ext.get::<T>() else {
-        let name = std::any::type_name::<T>();
-        panic!("extension {name} should be set by middleware");
-    };
-    value
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum GetActiveTimelineError {
    #[error(transparent)]
@@ -4011,60 +3959,6 @@ impl From<GetActiveTimelineError> for QueryError {
    }
 }

-impl From<GetActiveTimelineError> for tonic::Status {
-    fn from(err: GetActiveTimelineError) -> Self {
-        let message = err.to_string();
-        let code = match err {
-            GetActiveTimelineError::Tenant(err) => tonic::Status::from(err).code(),
-            GetActiveTimelineError::Timeline(err) => tonic::Status::from(err).code(),
-        };
-        tonic::Status::new(code, message)
-    }
-}
-
-impl From<GetTimelineError> for tonic::Status {
-    fn from(err: GetTimelineError) -> Self {
-        use tonic::Code;
-        let code = match &err {
-            GetTimelineError::NotFound { .. } => Code::NotFound,
-            GetTimelineError::NotActive { .. } => Code::Unavailable,
-            GetTimelineError::ShuttingDown => Code::Unavailable,
-        };
-        tonic::Status::new(code, err.to_string())
-    }
-}
-
-impl From<GetActiveTenantError> for QueryError {
-    fn from(e: GetActiveTenantError) -> Self {
-        match e {
-            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
-                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
-            ),
-            GetActiveTenantError::Cancelled
-            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
-                QueryError::Shutdown
-            }
-            e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
-            e => QueryError::Other(anyhow::anyhow!(e)),
-        }
-    }
-}
-
-impl From<GetActiveTenantError> for tonic::Status {
-    fn from(err: GetActiveTenantError) -> Self {
-        use tonic::Code;
-        let code = match &err {
-            GetActiveTenantError::Broken(_) => Code::Internal,
-            GetActiveTenantError::Cancelled => Code::Unavailable,
-            GetActiveTenantError::NotFound(_) => Code::NotFound,
-            GetActiveTenantError::SwitchedTenant => Code::Unavailable,
-            GetActiveTenantError::WaitForActiveTimeout { .. } => Code::Unavailable,
-            GetActiveTenantError::WillNotBecomeActive(_) => Code::Unavailable,
-        };
-        tonic::Status::new(code, err.to_string())
-    }
-}
-
 impl From<HandleUpgradeError> for QueryError {
    fn from(e: HandleUpgradeError) -> Self {
        match e {
@@ -4073,11 +3967,25 @@ impl From<HandleUpgradeError> for QueryError {
    }
 }

-impl From<HandleUpgradeError> for tonic::Status {
-    fn from(err: HandleUpgradeError) -> Self {
-        match err {
-            HandleUpgradeError::ShutDown => tonic::Status::unavailable("timeline is shutting down"),
-        }
+impl From<GetActiveTimelineError> for tonic::Status {
+    fn from(err: GetActiveTimelineError) -> Self {
+        use tonic::Code;
+        let code = match &err {
+            GetActiveTimelineError::Tenant(err) => match err {
+                GetActiveTenantError::Broken(_) => Code::Internal,
+                GetActiveTenantError::Cancelled => Code::Unavailable,
+                GetActiveTenantError::NotFound(_) => Code::NotFound,
+                GetActiveTenantError::SwitchedTenant => Code::Unavailable,
+                GetActiveTenantError::WaitForActiveTimeout { .. } => Code::Unavailable,
+                GetActiveTenantError::WillNotBecomeActive(_) => Code::Unavailable,
+            },
+            GetActiveTimelineError::Timeline(err) => match err {
+                GetTimelineError::NotFound { .. } => Code::NotFound,
+                GetTimelineError::NotActive { .. } => Code::Unavailable,
+                GetTimelineError::ShuttingDown => Code::Unavailable,
+            },
+        };
+        tonic::Status::new(code, format!("{err}"))
    }
 }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -431,10 +431,10 @@ impl Timeline {
                        GetVectoredError::InvalidLsn(e) => {
                            Err(anyhow::anyhow!("invalid LSN: {e:?}").into())
                        }
-                        // NB: this should never happen in practice because we limit batch size to be smaller than max_get_vectored_keys
+                        // NB: this should never happen in practice because we limit batch size to at most MAX_GET_VECTORED_KEYS
                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::Oversized(err, max) => {
-                            Err(anyhow::anyhow!("batching oversized: {err} > {max}").into())
+                        GetVectoredError::Oversized(err) => {
+                            Err(anyhow::anyhow!("batching oversized: {err:?}").into())
                        }
                    };

@@ -471,19 +471,8 @@ impl Timeline {

        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

-        if rels.is_empty() {
-            return Ok(0);
-        }
-
-        // Pre-deserialize the rel directory to avoid duplicated work in `get_relsize_cached`.
-        let reldir_key = rel_dir_to_key(spcnode, dbnode);
-        let buf = version.get(self, reldir_key, ctx).await?;
-        let reldir = RelDirectory::des(&buf)?;
-
        for rel in rels {
-            let n_blocks = self
-                .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), ctx)
-                .await?;
+            let n_blocks = self.get_rel_size(rel, version, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
@@ -498,19 +487,6 @@ impl Timeline {
        tag: RelTag,
        version: Version<'_>,
        ctx: &RequestContext,
-    ) -> Result<BlockNumber, PageReconstructError> {
-        self.get_rel_size_in_reldir(tag, version, None, ctx).await
-    }
-
-    /// Get size of a relation file. The relation must exist, otherwise an error is returned.
-    ///
-    /// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`.
-    pub(crate) async fn get_rel_size_in_reldir(
-        &self,
-        tag: RelTag,
-        version: Version<'_>,
-        deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
-        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
            return Err(PageReconstructError::Other(
@@ -523,9 +499,7 @@ impl Timeline {
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self
-                .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx)
-                .await?
+            && !self.get_rel_exists(tag, version, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -547,28 +521,11 @@ impl Timeline {
    ///
    /// Only shard 0 has a full view of the relations. Other shards only know about relations that
    /// the shard stores pages for.
-    ///
    pub(crate) async fn get_rel_exists(
        &self,
        tag: RelTag,
        version: Version<'_>,
        ctx: &RequestContext,
-    ) -> Result<bool, PageReconstructError> {
-        self.get_rel_exists_in_reldir(tag, version, None, ctx).await
-    }
-
-    /// Does the relation exist? With a cached deserialized `RelDirectory`.
-    ///
-    /// There are some cases where the caller loops across all relations. In that specific case,
-    /// the caller should obtain the deserialized `RelDirectory` first and then call this function
-    /// to avoid duplicated work of deserliazation. This is a hack and should be removed by introducing
-    /// a new API (e.g., `get_rel_exists_batched`).
-    pub(crate) async fn get_rel_exists_in_reldir(
-        &self,
-        tag: RelTag,
-        version: Version<'_>,
-        deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
-        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
            return Err(PageReconstructError::Other(
@@ -611,17 +568,6 @@ impl Timeline {
        // fetch directory listing (old)

        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-
-        if let Some((cached_key, dir)) = deserialized_reldir_v1 {
-            if cached_key == key {
-                return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
-            } else if cfg!(test) || cfg!(feature = "testing") {
-                panic!("cached reldir key mismatch: {cached_key} != {key}");
-            } else {
-                warn!("cached reldir key mismatch: {cached_key} != {key}");
-            }
-            // Fallback to reading the directory from the datadir.
-        }
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
@@ -719,7 +665,7 @@ impl Timeline {

        let batches = keyspace.partition(
            self.get_shard_identity(),
-            self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
+            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
@@ -959,7 +905,7 @@ impl Timeline {

            let batches = keyspace.partition(
                self.get_shard_identity(),
-                self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
+                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -300,7 +300,7 @@ pub struct TenantShard {
    ///   as in progress.
    /// * Imported timelines are removed when the storage controller calls the post timeline
    ///   import activation endpoint.
-    timelines_importing: std::sync::Mutex<HashMap<TimelineId, Arc<ImportingTimeline>>>,
+    timelines_importing: std::sync::Mutex<HashMap<TimelineId, ImportingTimeline>>,

    /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
    /// been either downloaded or uploaded. Always Some after tenant attach.
@@ -383,7 +383,7 @@ pub struct TenantShard {

    l0_flush_global_state: L0FlushGlobalState,

-    pub(crate) feature_resolver: FeatureResolver,
+    feature_resolver: FeatureResolver,
 }
 impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -672,7 +672,6 @@ pub enum MaybeOffloaded {
 pub enum TimelineOrOffloaded {
    Timeline(Arc<Timeline>),
    Offloaded(Arc<OffloadedTimeline>),
-    Importing(Arc<ImportingTimeline>),
 }

 impl TimelineOrOffloaded {
@@ -684,9 +683,6 @@ impl TimelineOrOffloaded {
            TimelineOrOffloaded::Offloaded(offloaded) => {
                TimelineOrOffloadedArcRef::Offloaded(offloaded)
            }
-            TimelineOrOffloaded::Importing(importing) => {
-                TimelineOrOffloadedArcRef::Importing(importing)
-            }
        }
    }
    pub fn tenant_shard_id(&self) -> TenantShardId {
@@ -699,16 +695,12 @@ impl TimelineOrOffloaded {
        match self {
            TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress,
            TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
-            TimelineOrOffloaded::Importing(importing) => &importing.delete_progress,
        }
    }
    fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
        match self {
            TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
            TimelineOrOffloaded::Offloaded(_offloaded) => None,
-            TimelineOrOffloaded::Importing(importing) => {
-                Some(importing.timeline.remote_client.clone())
-            }
        }
    }
 }
@@ -716,7 +708,6 @@ impl TimelineOrOffloaded {
 pub enum TimelineOrOffloadedArcRef<'a> {
    Timeline(&'a Arc<Timeline>),
    Offloaded(&'a Arc<OffloadedTimeline>),
-    Importing(&'a Arc<ImportingTimeline>),
 }

 impl TimelineOrOffloadedArcRef<'_> {
@@ -724,14 +715,12 @@ impl TimelineOrOffloadedArcRef<'_> {
        match self {
            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
-            TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.tenant_shard_id,
        }
    }
    pub fn timeline_id(&self) -> TimelineId {
        match self {
            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
-            TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.timeline_id,
        }
    }
 }
@@ -748,12 +737,6 @@ impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
    }
 }

-impl<'a> From<&'a Arc<ImportingTimeline>> for TimelineOrOffloadedArcRef<'a> {
-    fn from(timeline: &'a Arc<ImportingTimeline>) -> Self {
-        Self::Importing(timeline)
-    }
-}
-
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
    #[error("Timeline is shutting down")]
@@ -1806,25 +1789,20 @@ impl TenantShard {
                    },
                ) => {
                    let timeline_id = timeline.timeline_id;
-                    let import_task_gate = Gate::default();
-                    let import_task_guard = import_task_gate.enter().unwrap();
                    let import_task_handle =
                        tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
                            timeline.clone(),
                            import_pgdata,
                            guard,
-                            import_task_guard,
                            ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
                        ));

                    let prev = self.timelines_importing.lock().unwrap().insert(
                        timeline_id,
-                        Arc::new(ImportingTimeline {
+                        ImportingTimeline {
                            timeline: timeline.clone(),
                            import_task_handle,
-                            import_task_gate,
-                            delete_progress: TimelineDeleteProgress::default(),
-                        }),
+                        },
                    );

                    assert!(prev.is_none());
@@ -2442,17 +2420,6 @@ impl TenantShard {
            .collect()
    }

-    /// Lists timelines the tenant contains.
-    /// It's up to callers to omit certain timelines that are not considered ready for use.
-    pub fn list_importing_timelines(&self) -> Vec<Arc<ImportingTimeline>> {
-        self.timelines_importing
-            .lock()
-            .unwrap()
-            .values()
-            .map(Arc::clone)
-            .collect()
-    }
-
    /// Lists timelines the tenant manages, including offloaded ones.
    ///
    /// It's up to callers to omit certain timelines that are not considered ready for use.
@@ -2886,25 +2853,19 @@ impl TenantShard {

        let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();

-        let import_task_gate = Gate::default();
-        let import_task_guard = import_task_gate.enter().unwrap();
-
        let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task(
            timeline.clone(),
            index_part,
            timeline_create_guard,
-            import_task_guard,
            timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
        ));

        let prev = self.timelines_importing.lock().unwrap().insert(
            timeline.timeline_id,
-            Arc::new(ImportingTimeline {
+            ImportingTimeline {
                timeline: timeline.clone(),
                import_task_handle,
-                import_task_gate,
-                delete_progress: TimelineDeleteProgress::default(),
-            }),
+            },
        );

        // Idempotency is enforced higher up the stack
@@ -2963,7 +2924,6 @@ impl TenantShard {
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        timeline_create_guard: TimelineCreateGuard,
-        _import_task_guard: GateGuard,
        ctx: RequestContext,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
@@ -3875,9 +3835,6 @@ impl TenantShard {
                        .build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
                    Arc::new(remote_client)
                }
-                TimelineOrOffloadedArcRef::Importing(_) => {
-                    unreachable!("Importing timelines are not included in the iterator")
-                }
            };

            // Shut down the timeline's remote client: this means that the indices we write
@@ -5087,14 +5044,6 @@ impl TenantShard {
                info!("timeline already exists but is offloaded");
                Err(CreateTimelineError::Conflict)
            }
-            Err(TimelineExclusionError::AlreadyExists {
-                existing: TimelineOrOffloaded::Importing(_existing),
-                ..
-            }) => {
-                // If there's a timeline already importing, then we would hit
-                // the [`TimelineExclusionError::AlreadyCreating`] branch above.
-                unreachable!("Importing timelines hold the creation guard")
-            }
            Err(TimelineExclusionError::AlreadyExists {
                existing: TimelineOrOffloaded::Timeline(existing),
                arg,
@@ -5832,7 +5781,6 @@ pub(crate) mod harness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: pageserver_api::models::TenantConfig,
        pub tenant_shard_id: TenantShardId,
-        pub shard_identity: ShardIdentity,
        pub generation: Generation,
        pub shard: ShardIndex,
        pub remote_storage: GenericRemoteStorage,
@@ -5900,7 +5848,6 @@ pub(crate) mod harness {
                conf,
                tenant_conf,
                tenant_shard_id,
-                shard_identity,
                generation,
                shard,
                remote_storage,
@@ -5962,7 +5909,8 @@ pub(crate) mod harness {
                    &ShardParameters::default(),
                ))
                .unwrap(),
-                self.shard_identity,
+                // This is a legacy/test code path: sharding isn't supported here.
+                ShardIdentity::unsharded(),
                Some(walredo_mgr),
                self.tenant_shard_id,
                self.remote_storage.clone(),
@@ -6084,7 +6032,6 @@ mod tests {
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
    use utils::id::TenantId;
-    use utils::shard::{ShardCount, ShardNumber};

    use super::*;
    use crate::DEFAULT_PG_VERSION;
@@ -7197,7 +7144,7 @@ mod tests {
            let end = desc
                .key_range
                .start
-                .add(tenant.conf.max_get_vectored_keys.get() as u32);
+                .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
            reads.push(KeySpace {
                ranges: vec![start..end],
            });
@@ -9420,77 +9367,6 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn test_failed_flush_should_not_update_disk_consistent_lsn() -> anyhow::Result<()> {
-        //
-        // Setup
-        //
-        let harness = TenantHarness::create_custom(
-            "test_failed_flush_should_not_upload_disk_consistent_lsn",
-            pageserver_api::models::TenantConfig::default(),
-            TenantId::generate(),
-            ShardIdentity::new(ShardNumber(0), ShardCount(4), ShardStripeSize(128)).unwrap(),
-            Generation::new(1),
-        )
-        .await?;
-        let (tenant, ctx) = harness.load().await;
-
-        let timeline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-        assert_eq!(timeline.get_shard_identity().count, ShardCount(4));
-        let mut writer = timeline.writer().await;
-        writer
-            .put(
-                *TEST_KEY,
-                Lsn(0x20),
-                &Value::Image(test_img("foo at 0x20")),
-                &ctx,
-            )
-            .await?;
-        writer.finish_write(Lsn(0x20));
-        drop(writer);
-        timeline.freeze_and_flush().await.unwrap();
-
-        timeline.remote_client.wait_completion().await.unwrap();
-        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
-        let remote_consistent_lsn = timeline.get_remote_consistent_lsn_projected();
-        assert_eq!(Some(disk_consistent_lsn), remote_consistent_lsn);
-
-        //
-        // Test
-        //
-
-        let mut writer = timeline.writer().await;
-        writer
-            .put(
-                *TEST_KEY,
-                Lsn(0x30),
-                &Value::Image(test_img("foo at 0x30")),
-                &ctx,
-            )
-            .await?;
-        writer.finish_write(Lsn(0x30));
-        drop(writer);
-
-        fail::cfg(
-            "flush-layer-before-update-remote-consistent-lsn",
-            "return()",
-        )
-        .unwrap();
-
-        let flush_res = timeline.freeze_and_flush().await;
-        // if flush failed, the disk/remote consistent LSN should not be updated
-        assert!(flush_res.is_err());
-        assert_eq!(disk_consistent_lsn, timeline.get_disk_consistent_lsn());
-        assert_eq!(
-            remote_consistent_lsn,
-            timeline.get_remote_consistent_lsn_projected()
-        );
-
-        Ok(())
-    }
-
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
@@ -11260,11 +11136,11 @@ mod tests {
                let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
                let mut used_keys: HashSet<Key> = HashSet::default();

-                while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
+                while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));

-                    while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
+                    while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
                        if used_keys.contains(&selected_key)
                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
                        {
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1348,21 +1348,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    pub(crate) fn schedule_unlinking_of_layers_from_index_part<I>(
-        self: &Arc<Self>,
-        names: I,
-    ) -> Result<(), NotInitialized>
-    where
-        I: IntoIterator<Item = LayerName>,
-    {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
-
-        Ok(())
-    }
-
    /// Update the remote index file, removing the to-be-deleted files from the index,
    /// allowing scheduling of actual deletions later.
    fn schedule_unlinking_of_layers_from_index_part0<I>(
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -817,8 +817,8 @@ pub(crate) enum GetVectoredError {
    #[error("timeline shutting down")]
    Cancelled,

-    #[error("requested too many keys: {0} > {1}")]
-    Oversized(u64, u64),
+    #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
+    Oversized(u64),

    #[error("requested at invalid LSN: {0}")]
    InvalidLsn(Lsn),
@@ -950,18 +950,6 @@ pub(crate) enum WaitLsnError {
    Timeout(String),
 }

-impl From<WaitLsnError> for tonic::Status {
-    fn from(err: WaitLsnError) -> Self {
-        use tonic::Code;
-        let code = match &err {
-            WaitLsnError::Timeout(_) => Code::Internal,
-            WaitLsnError::BadState(_) => Code::Internal,
-            WaitLsnError::Shutdown => Code::Unavailable,
-        };
-        tonic::Status::new(code, err.to_string())
-    }
-}
-
 // The impls below achieve cancellation mapping for errors.
 // Perhaps there's a way of achieving this with less cruft.

@@ -1019,7 +1007,7 @@ impl From<GetVectoredError> for PageReconstructError {
        match e {
            GetVectoredError::Cancelled => PageReconstructError::Cancelled,
            GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")),
-            err @ GetVectoredError::Oversized(_, _) => PageReconstructError::Other(err.into()),
+            err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()),
            GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err),
            GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err),
            GetVectoredError::Other(err) => PageReconstructError::Other(err),
@@ -1199,6 +1187,7 @@ impl Timeline {
        }
    }

+    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;

    /// Look up multiple page versions at a given LSN
@@ -1213,12 +1202,9 @@ impl Timeline {
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        let total_keyspace = query.total_keyspace();

-        let key_count = total_keyspace.total_raw_size();
-        if key_count > self.conf.max_get_vectored_keys.get() {
-            return Err(GetVectoredError::Oversized(
-                key_count as u64,
-                self.conf.max_get_vectored_keys.get() as u64,
-            ));
+        let key_count = total_keyspace.total_raw_size().try_into().unwrap();
+        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
+            return Err(GetVectoredError::Oversized(key_count));
        }

        for range in &total_keyspace.ranges {
@@ -4781,10 +4767,7 @@ impl Timeline {
                    || !flushed_to_lsn.is_valid()
            );

-            if flushed_to_lsn < frozen_to_lsn
-                && self.shard_identity.count.count() > 1
-                && result.is_ok()
-            {
+            if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
                // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
                // to us via layer_flush_start_rx, then advance it here.
                //
@@ -4963,10 +4946,6 @@ impl Timeline {
            return Err(FlushLayerError::Cancelled);
        }

-        fail_point!("flush-layer-before-update-remote-consistent-lsn", |_| {
-            Err(FlushLayerError::Other(anyhow!("failpoint").into()))
-        });
-
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);

        // The new on-disk layers are now in the layer map. We can remove the
@@ -5272,7 +5251,7 @@ impl Timeline {
                key = key.next();

                // Maybe flush `key_rest_accum`
-                if key_request_accum.raw_size() >= self.conf.max_get_vectored_keys.get() as u64
+                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
                    || (last_key_in_range && key_request_accum.raw_size() > 0)
                {
                    let query =
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -206,8 +206,8 @@ pub struct GcCompactionQueue {
 }

 static CONCURRENT_GC_COMPACTION_TASKS: Lazy<Arc<Semaphore>> = Lazy::new(|| {
-    // Only allow one timeline on one pageserver to run gc compaction at a time.
-    Arc::new(Semaphore::new(1))
+    // Only allow two timelines on one pageserver to run gc compaction at a time.
+    Arc::new(Semaphore::new(2))
 });

 impl GcCompactionQueue {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -121,7 +121,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
    // This observes the locking order between timelines and timelines_offloaded
    let mut timelines = tenant.timelines.lock().unwrap();
    let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-    let mut timelines_importing = tenant.timelines_importing.lock().unwrap();
    let offloaded_children_exist = timelines_offloaded
        .iter()
        .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id()));
@@ -151,12 +150,8 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
                .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
            offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
        }
-        TimelineOrOffloaded::Importing(importing) => {
-            timelines_importing.remove(&importing.timeline.timeline_id);
-        }
    }

-    drop(timelines_importing);
    drop(timelines_offloaded);
    drop(timelines);

@@ -208,17 +203,8 @@ impl DeleteTimelineFlow {
        guard.mark_in_progress()?;

        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-        // TODO(vlad): shut down imported timeline here
-        match &timeline {
-            TimelineOrOffloaded::Timeline(timeline) => {
-                timeline.shutdown(super::ShutdownMode::Hard).await;
-            }
-            TimelineOrOffloaded::Importing(importing) => {
-                importing.shutdown().await;
-            }
-            TimelineOrOffloaded::Offloaded(_offloaded) => {
-                // Nothing to shut down in this case
-            }
+        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
+            timeline.shutdown(super::ShutdownMode::Hard).await;
        }

        tenant.gc_block.before_delete(&timeline.timeline_id());
@@ -403,18 +389,10 @@ impl DeleteTimelineFlow {
            Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
        });

-        match timeline {
-            TimelineOrOffloaded::Timeline(timeline) => {
-                delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
-            }
-            TimelineOrOffloaded::Importing(importing) => {
-                delete_local_timeline_directory(conf, tenant.tenant_shard_id, &importing.timeline)
-                    .await;
-            }
-            TimelineOrOffloaded::Offloaded(_offloaded) => {
-                // Offloaded timelines have no local state
-                // TODO: once we persist offloaded information, delete the timeline from there, too
-            }
+        // Offloaded timelines have no local state
+        // TODO: once we persist offloaded information, delete the timeline from there, too
+        if let TimelineOrOffloaded::Timeline(timeline) = timeline {
+            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
        }

        fail::fail_point!("timeline-delete-after-rm", |_| {
@@ -473,16 +451,12 @@ pub(super) fn make_timeline_delete_guard(
    // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
    let timelines = tenant.timelines.lock().unwrap();
    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-    let timelines_importing = tenant.timelines_importing.lock().unwrap();

    let timeline = match timelines.get(&timeline_id) {
        Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
        None => match timelines_offloaded.get(&timeline_id) {
            Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
-            None => match timelines_importing.get(&timeline_id) {
-                Some(t) => TimelineOrOffloaded::Importing(Arc::clone(t)),
-                None => return Err(DeleteTimelineError::NotFound),
-            },
+            None => return Err(DeleteTimelineError::NotFound),
        },
    };

--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -8,10 +8,8 @@ use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use utils::lsn::Lsn;
-use utils::pausable_failpoint;
-use utils::sync::gate::Gate;

-use super::{Timeline, TimelineDeleteProgress};
+use super::Timeline;
 use crate::context::RequestContext;
 use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::tenant::metadata::TimelineMetadata;
@@ -21,23 +19,15 @@ mod importbucket_client;
 mod importbucket_format;
 pub(crate) mod index_part_format;

-pub struct ImportingTimeline {
+pub(crate) struct ImportingTimeline {
    pub import_task_handle: JoinHandle<()>,
-    pub import_task_gate: Gate,
    pub timeline: Arc<Timeline>,
-    pub delete_progress: TimelineDeleteProgress,
-}
-
-impl std::fmt::Debug for ImportingTimeline {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "ImportingTimeline<{}>", self.timeline.timeline_id)
-    }
 }

 impl ImportingTimeline {
-    pub async fn shutdown(&self) {
+    pub(crate) async fn shutdown(self) {
        self.import_task_handle.abort();
-        self.import_task_gate.close().await;
+        let _ = self.import_task_handle.await;

        self.timeline.remote_client.shutdown().await;
    }
@@ -106,15 +96,11 @@ pub async fn doit(
                );
            }

-            tracing::info!("Import plan executed. Flushing remote changes and notifying storcon");
-
            timeline
                .remote_client
                .schedule_index_upload_for_file_changes()?;
            timeline.remote_client.wait_completion().await?;

-            pausable_failpoint!("import-timeline-pre-success-notify-pausable");
-
            // Communicate that shard is done.
            // Ensure at-least-once delivery of the upcall to storage controller
            // before we mark the task as done and never come here again.
@@ -201,8 +187,8 @@ async fn prepare_import(
        .await;
        match res {
            Ok(_) => break,
-            Err(_err) => {
-                info!("indefinitely waiting for pgdata to finish");
+            Err(err) => {
+                info!(?err, "indefinitely waiting for pgdata to finish");
                if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
                    .await
                    .is_ok()
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -11,14 +11,25 @@
 //! - => S3 as the source for the PGDATA instead of local filesystem
 //!
 //! TODOs before productionization:
+//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
+//!   => produced image layers likely too small.
 //! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
+//! - asserts / unwraps need to be replaced with errors
+//! - don't assume that remote objects will be small (=> prevent OOMs in those cases)
+//!     - limit all in-memory buffers in size, or download to disk and read from there
+//! - limit task concurrency
+//! - generally play nice with other tenants in the system
+//!   - importbucket is a different bucket than the main pageserver storage, so we should be fine wrt S3 rate limits
+//!   - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc. remain
+//! - integrate with layer eviction system
+//! - audit for Tenant::cancel and Timeline::cancel responsiveness
+//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
 //!
 //! An incomplete set of TODOs from the Hackathon:
 //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)

 use std::collections::HashSet;
 use std::hash::{Hash, Hasher};
-use std::num::NonZeroUsize;
 use std::ops::Range;
 use std::sync::Arc;

@@ -32,7 +43,7 @@ use pageserver_api::key::{
    rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
    slru_segment_size_to_key,
 };
-use pageserver_api::keyspace::{ShardedRange, singleton_range};
+use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
 use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -89,24 +100,8 @@ async fn run_v1(
        tasks: Vec::default(),
    };

-    // Use the job size limit encoded in the progress if we are resuming an import.
-    // This ensures that imports have stable plans even if the pageserver config changes.
-    let import_config = {
-        match &import_progress {
-            Some(progress) => {
-                let base = &timeline.conf.timeline_import_config;
-                TimelineImportConfig {
-                    import_job_soft_size_limit: NonZeroUsize::new(progress.job_soft_size_limit)
-                        .unwrap(),
-                    import_job_concurrency: base.import_job_concurrency,
-                    import_job_checkpoint_threshold: base.import_job_checkpoint_threshold,
-                }
-            }
-            None => timeline.conf.timeline_import_config.clone(),
-        }
-    };
-
-    let plan = planner.plan(&import_config).await?;
+    let import_config = &timeline.conf.timeline_import_config;
+    let plan = planner.plan(import_config).await?;

    // Hash the plan and compare with the hash of the plan we got back from the storage controller.
    // If the two match, it means that the planning stage had the same output.
@@ -130,16 +125,8 @@ async fn run_v1(

    pausable_failpoint!("import-timeline-pre-execute-pausable");

-    let jobs_count = import_progress.as_ref().map(|p| p.jobs);
    let start_from_job_idx = import_progress.map(|progress| progress.completed);
-
-    tracing::info!(
-        start_from_job_idx=?start_from_job_idx,
-        jobs=?jobs_count,
-        "Executing import plan"
-    );
-
-    plan.execute(timeline, start_from_job_idx, plan_hash, &import_config, ctx)
+    plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx)
        .await
 }

@@ -163,7 +150,6 @@ impl Planner {
    /// This function is and must remain pure: given the same input, it will generate the same import plan.
    async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result<Plan> {
        let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
-        anyhow::ensure!(pgdata_lsn.is_valid());

        let datadir = PgDataDir::new(&self.storage).await?;

@@ -246,22 +232,14 @@ impl Planner {
        });

        // Assigns parts of key space to later parallel jobs
-        // Note: The image layers produced here may have gaps, meaning,
-        //       there is not an image for each key in the layer's key range.
-        //       The read path stops traversal at the first image layer, regardless
-        //       of whether a base image has been found for a key or not.
-        //       (Concept of sparse image layers doesn't exist.)
-        //       This behavior is exactly right for the base image layers we're producing here.
-        //       But, since no other place in the code currently produces image layers with gaps,
-        //       it seems noteworthy.
        let mut last_end_key = Key::MIN;
        let mut current_chunk = Vec::new();
        let mut current_chunk_size: usize = 0;
        let mut jobs = Vec::new();
        for task in std::mem::take(&mut self.tasks).into_iter() {
-            let task_size = task.total_size(&self.shard);
-            let projected_chunk_size = current_chunk_size.saturating_add(task_size);
-            if projected_chunk_size > import_config.import_job_soft_size_limit.into() {
+            if current_chunk_size + task.total_size()
+                > import_config.import_job_soft_size_limit.into()
+            {
                let key_range = last_end_key..task.key_range().start;
                jobs.push(ChunkProcessingJob::new(
                    key_range.clone(),
@@ -271,7 +249,7 @@ impl Planner {
                last_end_key = key_range.end;
                current_chunk_size = 0;
            }
-            current_chunk_size = current_chunk_size.saturating_add(task_size);
+            current_chunk_size += task.total_size();
            current_chunk.push(task);
        }
        jobs.push(ChunkProcessingJob::new(
@@ -471,13 +449,10 @@ impl Plan {
                            last_completed_job_idx = job_idx;

                            if last_completed_job_idx % checkpoint_every == 0 {
-                                tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status");
-
                                let progress = ShardImportProgressV1 {
                                    jobs: jobs_in_plan,
                                    completed: last_completed_job_idx,
                                    import_plan_hash,
-                                    job_soft_size_limit: import_config.import_job_soft_size_limit.into(),
                                };

                                timeline.remote_client.schedule_index_upload_for_file_changes()?;
@@ -611,18 +586,18 @@ impl PgDataDirDb {
                };

                let path = datadir_path.join(rel_tag.to_segfile_name(segno));
-                anyhow::ensure!(filesize % BLCKSZ as usize == 0);
+                assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error
                let nblocks = filesize / BLCKSZ as usize;

-                Ok(PgDataDirDbFile {
+                PgDataDirDbFile {
                    path,
                    filesize,
                    rel_tag,
                    segno,
                    nblocks: Some(nblocks), // first non-cummulative sizes
-                })
+                }
            })
-            .collect::<anyhow::Result<_, _>>()?;
+            .collect();

        // Set cummulative sizes. Do all of that math here, so that later we could easier
        // parallelize over segments and know with which segments we need to write relsize
@@ -657,22 +632,12 @@ impl PgDataDirDb {
 trait ImportTask {
    fn key_range(&self) -> Range<Key>;

-    fn total_size(&self, shard_identity: &ShardIdentity) -> usize {
-        let range = ShardedRange::new(self.key_range(), shard_identity);
-        let page_count = range.page_count();
-        if page_count == u32::MAX {
-            tracing::warn!(
-                "Import task has non contiguous key range: {}..{}",
-                self.key_range().start,
-                self.key_range().end
-            );
-
-            // Tasks should operate on contiguous ranges. It is unexpected for
-            // ranges to violate this assumption. Calling code handles this by mapping
-            // any task on a non contiguous range to its own image layer.
-            usize::MAX
+    fn total_size(&self) -> usize {
+        // TODO: revisit this
+        if is_contiguous_range(&self.key_range()) {
+            contiguous_range_len(&self.key_range()) as usize * 8192
        } else {
-            page_count as usize * 8192
+            u32::MAX as usize
        }
    }

@@ -770,8 +735,6 @@ impl ImportTask for ImportRelBlocksTask {
        layer_writer: &mut ImageLayerWriter,
        ctx: &RequestContext,
    ) -> anyhow::Result<usize> {
-        const MAX_BYTE_RANGE_SIZE: usize = 4 * 1024 * 1024;
-
        debug!("Importing relation file");

        let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
@@ -796,7 +759,7 @@ impl ImportTask for ImportRelBlocksTask {
                assert_eq!(key.len(), 1);
                assert!(!acc.is_empty());
                assert!(acc_end > acc_start);
-                if acc_end == start && end - acc_start <= MAX_BYTE_RANGE_SIZE {
+                if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
                    acc.push(key.pop().unwrap());
                    Ok((acc, acc_start, end))
                } else {
@@ -811,8 +774,8 @@ impl ImportTask for ImportRelBlocksTask {
                .get_range(&self.path, range_start.into_u64(), range_end.into_u64())
                .await?;
            let mut buf = Bytes::from(range_buf);
+            // TODO: batched writes
            for key in keys {
-                // The writer buffers writes internally
                let image = buf.split_to(8192);
                layer_writer.put_image(key, image, ctx).await?;
                nimages += 1;
@@ -865,9 +828,6 @@ impl ImportTask for ImportSlruBlocksTask {
        debug!("Importing SLRU segment file {}", self.path);
        let buf = self.storage.get(&self.path).await?;

-        // TODO(vlad): Does timestamp to LSN work for imported timelines?
-        // Probably not since we don't append the `xact_time` to it as in
-        // [`WalIngest::ingest_xact_record`].
        let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
        let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
        let mut blknum = start_blk;
@@ -1004,15 +964,6 @@ impl ChunkProcessingJob {
            .cloned();
        match existing_layer {
            Some(existing) => {
-                // Unlink the remote layer from the index without scheduling its deletion.
-                // When `existing_layer` drops [`LayerInner::drop`] will schedule its deletion from
-                // remote storage, but that assumes that the layer was unlinked from the index first.
-                timeline
-                    .remote_client
-                    .schedule_unlinking_of_layers_from_index_part(std::iter::once(
-                        existing.layer_desc().layer_name(),
-                    ))?;
-
                guard.open_mut()?.rewrite_layers(
                    &[(existing.clone(), resident_layer.clone())],
                    &[],
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -6,7 +6,7 @@ use bytes::Bytes;
 use postgres_ffi::ControlFileData;
 use remote_storage::{
    Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
-    ListingObject, RemotePath, RemoteStorageConfig,
+    ListingObject, RemotePath,
 };
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
@@ -22,9 +22,11 @@ pub async fn new(
    location: &index_part_format::Location,
    cancel: CancellationToken,
 ) -> Result<RemoteStorageWrapper, anyhow::Error> {
-    // Downloads should be reasonably sized. We do ranged reads for relblock raw data
-    // and full reads for SLRU segments which are bounded by Postgres.
-    let timeout = RemoteStorageConfig::DEFAULT_TIMEOUT;
+    // FIXME: we probably want some timeout, and we might be able to assume the max file
+    // size on S3 is 1GiB (postgres segment size). But the problem is that the individual
+    // downloaders don't know enough about concurrent downloads to make a guess on the
+    // expected bandwidth and resulting best timeout.
+    let timeout = std::time::Duration::from_secs(24 * 60 * 60);
    let location_storage = match location {
        #[cfg(feature = "testing")]
        index_part_format::Location::LocalFs { path } => {
@@ -48,12 +50,9 @@ pub async fn new(
                            .import_pgdata_aws_endpoint_url
                            .clone()
                            .map(|url| url.to_string()), //  by specifying None here, remote_storage/aws-sdk-rust will infer from env
-                        // This matches the default import job concurrency. This is managed
-                        // separately from the usual S3 client, but the concern here is bandwidth
-                        // usage.
-                        concurrency_limit: 128.try_into().unwrap(),
-                        max_keys_per_list_response: Some(1000),
-                        upload_storage_class: None, // irrelevant
+                        concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
+                        max_keys_per_list_response: Some(1000),     // TODO: think about this
+                        upload_storage_class: None,                 // irrelevant
                    },
                    timeout,
                )
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -113,7 +113,7 @@ impl WalReceiver {
                }
                connection_manager_state.shutdown().await;
                *loop_status.write().unwrap() = None;
-                info!("task exits");
+                debug!("task exits");
            }
            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
        });
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -297,7 +297,6 @@ pub(super) async fn handle_walreceiver_connection(
    let mut expected_wal_start = startpoint;
    while let Some(replication_message) = {
        select! {
-            biased;
            _ = cancellation.cancelled() => {
                debug!("walreceiver interrupted");
                None
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -17,23 +17,35 @@ pub(super) async fn authenticate(
    config: &'static AuthenticationConfig,
    secret: AuthSecret,
 ) -> auth::Result<ComputeCredentials> {
+    let flow = AuthFlow::new(client);
    let scram_keys = match secret {
        #[cfg(any(test, feature = "testing"))]
        AuthSecret::Md5(_) => {
            debug!("auth endpoint chooses MD5");
-            return Err(auth::AuthError::MalformedPassword("MD5 not supported"));
+            return Err(auth::AuthError::bad_auth_method("MD5"));
        }
        AuthSecret::Scram(secret) => {
            debug!("auth endpoint chooses SCRAM");
+            let scram = auth::Scram(&secret, ctx);

            let auth_outcome = tokio::time::timeout(
                config.scram_protocol_timeout,
-                AuthFlow::new(client, auth::Scram(&secret, ctx)).authenticate(),
+                async {
+
+                    flow.begin(scram).await.map_err(|error| {
+                        warn!(?error, "error sending scram acknowledgement");
+                        error
+                    })?.authenticate().await.map_err(|error| {
+                        warn!(?error, "error processing scram messages");
+                        error
+                    })
+                }
            )
            .await
-            .inspect_err(|_| warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs()))
-            .map_err(auth::AuthError::user_timeout)?
-            .inspect_err(|error| warn!(?error, "error processing scram messages"))?;
+            .map_err(|e| {
+                warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs());
+                auth::AuthError::user_timeout(e)
+            })??;

            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -2,6 +2,7 @@ use std::fmt;

 use async_trait::async_trait;
 use postgres_client::config::SslMode;
+use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span};
@@ -15,7 +16,6 @@ use crate::context::RequestContext;
 use crate::control_plane::client::cplane_proxy_v1;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::{ReportableError, UserFacingError};
-use crate::pqproto::BeMessage;
 use crate::proxy::NeonOptions;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::stream::PqStream;
@@ -154,13 +154,11 @@ async fn authenticate(

    // Give user a URL to spawn a new database.
    info!(parent: &span, "sending the auth URL to the user");
-    client.write_message(BeMessage::AuthenticationOk);
-    client.write_message(BeMessage::ParameterStatus {
-        name: b"client_encoding",
-        value: b"UTF8",
-    });
-    client.write_message(BeMessage::NoticeResponse(&greeting));
-    client.flush().await?;
+    client
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&Be::CLIENT_ENCODING)?
+        .write_message(&Be::NoticeResponse(&greeting))
+        .await?;

    // Wait for console response via control plane (see `mgmt`).
    info!(parent: &span, "waiting for console's reply...");
@@ -190,7 +188,7 @@ async fn authenticate(
        }
    }

-    client.write_message(BeMessage::NoticeResponse("Connecting to database."));
+    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

    // This config should be self-contained, because we won't
    // take username or dbname from client's startup message.
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -24,25 +24,23 @@ pub(crate) async fn authenticate_cleartext(
    debug!("cleartext auth flow override is enabled, proceeding");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

+    // pause the timer while we communicate with the client
+    let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+
    let ep = EndpointIdInt::from(&info.endpoint);

-    let auth_flow = AuthFlow::new(
-        client,
-        auth::CleartextPassword {
+    let auth_flow = AuthFlow::new(client)
+        .begin(auth::CleartextPassword {
            secret,
            endpoint: ep,
            pool: config.thread_pool.clone(),
-        },
-    );
-    let auth_outcome = {
-        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
-
-        // cleartext auth is only allowed to the ws/http protocol.
-        // If we're here, we already received the password in the first message.
-        // Scram protocol will be executed on the proxy side.
-        auth_flow.authenticate().await?
-    };
+        })
+        .await?;
+    drop(paused);
+    // cleartext auth is only allowed to the ws/http protocol.
+    // If we're here, we already received the password in the first message.
+    // Scram protocol will be executed on the proxy side.
+    let auth_outcome = auth_flow.authenticate().await?;

    let keys = match auth_outcome {
        sasl::Outcome::Success(key) => key,
@@ -69,7 +67,9 @@ pub(crate) async fn password_hack_no_authentication(
    // pause the timer while we communicate with the client
    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

-    let payload = AuthFlow::new(client, auth::PasswordHack)
+    let payload = AuthFlow::new(client)
+        .begin(auth::PasswordHack)
+        .await?
        .get_password()
        .await?;

--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -4,31 +4,37 @@ mod hacks;
 pub mod jwt;
 pub mod local;

+use std::net::IpAddr;
 use std::sync::Arc;

 pub use console_redirect::ConsoleRedirectBackend;
 pub(crate) use console_redirect::ConsoleRedirectError;
+use ipnet::{Ipv4Net, Ipv6Net};
 use local::LocalBackend;
 use postgres_client::config::AuthKeys;
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};

-use crate::auth::{self, AuthError, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange};
+use crate::auth::credentials::check_peer_addr_is_in_list;
+use crate::auth::{
+    self, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, validate_password_and_exchange,
+};
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
 use crate::context::RequestContext;
 use crate::control_plane::client::ControlPlaneClient;
 use crate::control_plane::errors::GetAuthInfoError;
 use crate::control_plane::{
-    self, AccessBlockerFlags, AuthSecret, CachedNodeInfo, ControlPlaneApi, EndpointAccessControl,
-    RoleAccessControl,
+    self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps,
+    CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi,
 };
 use crate::intern::EndpointIdInt;
-use crate::pqproto::BeMessage;
+use crate::metrics::Metrics;
+use crate::protocol2::ConnectionInfoExtra;
 use crate::proxy::NeonOptions;
 use crate::proxy::connect_compute::ComputeConnectBackend;
-use crate::rate_limiter::EndpointRateLimiter;
+use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter};
 use crate::stream::Stream;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{scram, stream};
@@ -194,6 +200,78 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    }
 }

+#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
+pub struct MaskedIp(IpAddr);
+
+impl MaskedIp {
+    fn new(value: IpAddr, prefix: u8) -> Self {
+        match value {
+            IpAddr::V4(v4) => Self(IpAddr::V4(
+                Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()),
+            )),
+            IpAddr::V6(v6) => Self(IpAddr::V6(
+                Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()),
+            )),
+        }
+    }
+}
+
+// This can't be just per IP because that would limit some PaaS that share IP addresses
+pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
+
+impl AuthenticationConfig {
+    pub(crate) fn check_rate_limit(
+        &self,
+        ctx: &RequestContext,
+        secret: AuthSecret,
+        endpoint: &EndpointId,
+        is_cleartext: bool,
+    ) -> auth::Result<AuthSecret> {
+        // we have validated the endpoint exists, so let's intern it.
+        let endpoint_int = EndpointIdInt::from(endpoint.normalize());
+
+        // only charge the full hash iteration count for the password hack or websocket flow,
+        // in other words, when the proxy itself needs to run the hashing.
+        let password_weight = if is_cleartext {
+            match &secret {
+                #[cfg(any(test, feature = "testing"))]
+                AuthSecret::Md5(_) => 1,
+                AuthSecret::Scram(s) => s.iterations + 1,
+            }
+        } else {
+            // validating scram takes just 1 hmac_sha_256 operation.
+            1
+        };
+
+        let limit_not_exceeded = self.rate_limiter.check(
+            (
+                endpoint_int,
+                MaskedIp::new(ctx.peer_addr(), self.rate_limit_ip_subnet),
+            ),
+            password_weight,
+        );
+
+        if !limit_not_exceeded {
+            warn!(
+                enabled = self.rate_limiter_enabled,
+                "rate limiting authentication"
+            );
+            Metrics::get().proxy.requests_auth_rate_limits_total.inc();
+            Metrics::get()
+                .proxy
+                .endpoints_auth_rate_limits
+                .get_metric()
+                .measure(endpoint);
+
+            if self.rate_limiter_enabled {
+                return Err(auth::AuthError::too_many_connections());
+            }
+        }
+
+        Ok(secret)
+    }
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 ///
@@ -206,7 +284,7 @@ async fn auth_quirks(
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-) -> auth::Result<ComputeCredentials> {
+) -> auth::Result<(ComputeCredentials, Option<Vec<IpPattern>>)> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
@@ -222,27 +300,55 @@ async fn auth_quirks(

    debug!("fetching authentication info and allowlists");

-    let access_controls = api
-        .get_endpoint_access_control(ctx, &info.endpoint, &info.user)
-        .await?;
+    // check allowed list
+    let allowed_ips = if config.ip_allowlist_check_enabled {
+        let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
+        if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+            return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+        }
+        allowed_ips
+    } else {
+        Cached::new_uncached(Arc::new(vec![]))
+    };

-    access_controls.check(
-        ctx,
-        config.ip_allowlist_check_enabled,
-        config.is_vpc_acccess_proxy,
-    )?;
+    // check whether a VPC endpoint ID is coming in and, if so, whether it's allowed
+    let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?;
+    if config.is_vpc_acccess_proxy {
+        if access_blocks.vpc_access_blocked {
+            return Err(AuthError::NetworkNotAllowed);
+        }

-    let endpoint = EndpointIdInt::from(&info.endpoint);
-    let rate_limit_config = None;
-    if !endpoint_rate_limiter.check(endpoint, rate_limit_config, 1) {
+        let incoming_vpc_endpoint_id = match ctx.extra() {
+            None => return Err(AuthError::MissingEndpointName),
+            Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(),
+            Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+        };
+        let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?;
+        // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+        if !allowed_vpc_endpoint_ids.is_empty()
+            && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id)
+        {
+            return Err(AuthError::vpc_endpoint_id_not_allowed(
+                incoming_vpc_endpoint_id,
+            ));
+        }
+    } else if access_blocks.public_access_blocked {
+        return Err(AuthError::NetworkNotAllowed);
+    }
+
+    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
        return Err(AuthError::too_many_connections());
    }
-    let role_access = api
-        .get_role_access_control(ctx, &info.endpoint, &info.user)
-        .await?;
+    let cached_secret = api.get_role_secret(ctx, &info).await?;
+    let (cached_entry, secret) = cached_secret.take_value();

-    let secret = if let Some(secret) = role_access.secret {
-        secret
+    let secret = if let Some(secret) = secret {
+        config.check_rate_limit(
+            ctx,
+            secret,
+            &info.endpoint,
+            unauthenticated_password.is_some() || allow_cleartext,
+        )?
    } else {
        // If we don't have an authentication secret, we mock one to
        // prevent malicious probing (possible due to missing protocol steps).
@@ -262,8 +368,14 @@ async fn auth_quirks(
    )
    .await
    {
-        Ok(keys) => Ok(keys),
-        Err(e) => Err(e),
+        Ok(keys) => Ok((keys, Some(allowed_ips.as_ref().clone()))),
+        Err(e) => {
+            if e.is_password_failed() {
+                // The password could have been changed, so we invalidate the cache.
+                cached_entry.invalidate();
+            }
+            Err(e)
+        }
    }
 }

@@ -290,7 +402,7 @@ async fn authenticate_with_secret(
        };

        // we have authenticated the password
-        client.write_message(BeMessage::AuthenticationOk);
+        client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;

        return Ok(ComputeCredentials { info, keys });
    }
@@ -326,7 +438,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> auth::Result<Backend<'a, ComputeCredentials>> {
+    ) -> auth::Result<(Backend<'a, ComputeCredentials>, Option<Vec<IpPattern>>)> {
        let res = match self {
            Self::ControlPlane(api, user_info) => {
                debug!(
@@ -335,35 +447,17 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
                    "performing authentication using the console"
                );

-                let auth_res = auth_quirks(
+                let (credentials, ip_allowlist) = auth_quirks(
                    ctx,
                    &*api,
-                    user_info.clone(),
+                    user_info,
                    client,
                    allow_cleartext,
                    config,
                    endpoint_rate_limiter,
                )
-                .await;
-                match auth_res {
-                    Ok(credentials) => Ok(Backend::ControlPlane(api, credentials)),
-                    Err(e) => {
-                        // The password could have been changed, so we invalidate the cache.
-                        // We should only invalidate the cache if the TTL might have expired.
-                        if e.is_password_failed() {
-                            #[allow(irrefutable_let_patterns)]
-                            if let ControlPlaneClient::ProxyV1(api) = &*api {
-                                if let Some(ep) = &user_info.endpoint_id {
-                                    api.caches
-                                        .project_info
-                                        .maybe_invalidate_role_secret(ep, &user_info.user);
-                                }
-                            }
-                        }
-
-                        Err(e)
-                    }
-                }
+                .await?;
+                Ok((Backend::ControlPlane(api, credentials), ip_allowlist))
            }
            Self::Local(_) => {
                return Err(auth::AuthError::bad_auth_method("invalid for local proxy"));
@@ -380,30 +474,44 @@ impl Backend<'_, ComputeUserInfo> {
    pub(crate) async fn get_role_secret(
        &self,
        ctx: &RequestContext,
-    ) -> Result<RoleAccessControl, GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        match self {
-            Self::ControlPlane(api, user_info) => {
-                api.get_role_access_control(ctx, &user_info.endpoint, &user_info.user)
-                    .await
-            }
-            Self::Local(_) => Ok(RoleAccessControl { secret: None }),
+            Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Self::Local(_) => Ok(Cached::new_uncached(None)),
        }
    }

-    pub(crate) async fn get_endpoint_access_control(
+    pub(crate) async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
-    ) -> Result<EndpointAccessControl, GetAuthInfoError> {
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+        match self {
+            Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
+            Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
+        }
+    }
+
+    pub(crate) async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<CachedAllowedVpcEndpointIds, GetAuthInfoError> {
        match self {
            Self::ControlPlane(api, user_info) => {
-                api.get_endpoint_access_control(ctx, &user_info.endpoint, &user_info.user)
-                    .await
+                api.get_allowed_vpc_endpoint_ids(ctx, user_info).await
            }
-            Self::Local(_) => Ok(EndpointAccessControl {
-                allowed_ips: Arc::new(vec![]),
-                allowed_vpce: Arc::new(vec![]),
-                flags: AccessBlockerFlags::default(),
-            }),
+            Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
+        }
+    }
+
+    pub(crate) async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<CachedAccessBlockerFlags, GetAuthInfoError> {
+        match self {
+            Self::ControlPlane(api, user_info) => {
+                api.get_block_public_or_vpc_access(ctx, user_info).await
+            }
+            Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())),
        }
    }
 }
@@ -432,7 +540,9 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
 mod tests {
    #![allow(clippy::unimplemented, clippy::unwrap_used)]

+    use std::net::IpAddr;
    use std::sync::Arc;
+    use std::time::Duration;

    use bytes::BytesMut;
    use control_plane::AuthSecret;
@@ -443,16 +553,18 @@ mod tests {
    use postgres_protocol::message::frontend;
    use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};

-    use super::auth_quirks;
    use super::jwt::JwkCache;
+    use super::{AuthRateLimiter, auth_quirks};
+    use crate::auth::backend::MaskedIp;
    use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
    use crate::config::AuthenticationConfig;
    use crate::context::RequestContext;
    use crate::control_plane::{
-        self, AccessBlockerFlags, CachedNodeInfo, EndpointAccessControl, RoleAccessControl,
+        self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps,
+        CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret,
    };
    use crate::proxy::NeonOptions;
-    use crate::rate_limiter::EndpointRateLimiter;
+    use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
    use crate::scram::ServerSecret;
    use crate::scram::threadpool::ThreadPool;
    use crate::stream::{PqStream, Stream};
@@ -465,34 +577,46 @@ mod tests {
    }

    impl control_plane::ControlPlaneApi for Auth {
-        async fn get_role_access_control(
+        async fn get_role_secret(
            &self,
            _ctx: &RequestContext,
-            _endpoint: &crate::types::EndpointId,
-            _role: &crate::types::RoleName,
-        ) -> Result<RoleAccessControl, control_plane::errors::GetAuthInfoError> {
-            Ok(RoleAccessControl {
-                secret: Some(self.secret.clone()),
-            })
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedRoleSecret, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
        }

-        async fn get_endpoint_access_control(
+        async fn get_allowed_ips(
            &self,
            _ctx: &RequestContext,
-            _endpoint: &crate::types::EndpointId,
-            _role: &crate::types::RoleName,
-        ) -> Result<EndpointAccessControl, control_plane::errors::GetAuthInfoError> {
-            Ok(EndpointAccessControl {
-                allowed_ips: Arc::new(self.ips.clone()),
-                allowed_vpce: Arc::new(self.vpc_endpoint_ids.clone()),
-                flags: self.access_blocker_flags,
-            })
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedAllowedIps, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())))
+        }
+
+        async fn get_allowed_vpc_endpoint_ids(
+            &self,
+            _ctx: &RequestContext,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedAllowedVpcEndpointIds, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new(
+                self.vpc_endpoint_ids.clone(),
+            )))
+        }
+
+        async fn get_block_public_or_vpc_access(
+            &self,
+            _ctx: &RequestContext,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedAccessBlockerFlags, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAccessBlockerFlags::new_uncached(
+                self.access_blocker_flags.clone(),
+            ))
        }

        async fn get_endpoint_jwks(
            &self,
            _ctx: &RequestContext,
-            _endpoint: &crate::types::EndpointId,
+            _endpoint: crate::types::EndpointId,
        ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
        {
            unimplemented!()
@@ -511,6 +635,9 @@ mod tests {
        jwks_cache: JwkCache::default(),
        thread_pool: ThreadPool::new(1),
        scram_protocol_timeout: std::time::Duration::from_secs(5),
+        rate_limiter_enabled: true,
+        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
+        rate_limit_ip_subnet: 64,
        ip_allowlist_check_enabled: true,
        is_vpc_acccess_proxy: false,
        is_auth_broker: false,
@@ -527,10 +654,55 @@ mod tests {
        }
    }

+    #[test]
+    fn masked_ip() {
+        let ip_a = IpAddr::V4([127, 0, 0, 1].into());
+        let ip_b = IpAddr::V4([127, 0, 0, 2].into());
+        let ip_c = IpAddr::V4([192, 168, 1, 101].into());
+        let ip_d = IpAddr::V4([192, 168, 1, 102].into());
+        let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap());
+        let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap());
+
+        assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64));
+        assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32));
+        assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30));
+        assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30));
+
+        assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128));
+        assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64));
+    }
+
+    #[test]
+    fn test_default_auth_rate_limit_set() {
+        // these values used to exceed u32::MAX
+        assert_eq!(
+            RateBucketInfo::DEFAULT_AUTH_SET,
+            [
+                RateBucketInfo {
+                    interval: Duration::from_secs(1),
+                    max_rpi: 1000 * 4096,
+                },
+                RateBucketInfo {
+                    interval: Duration::from_secs(60),
+                    max_rpi: 600 * 4096 * 60,
+                },
+                RateBucketInfo {
+                    interval: Duration::from_secs(600),
+                    max_rpi: 300 * 4096 * 600,
+                }
+            ]
+        );
+
+        for x in RateBucketInfo::DEFAULT_AUTH_SET {
+            let y = x.to_string().parse().unwrap();
+            assert_eq!(x, y);
+        }
+    }
+
    #[tokio::test]
    async fn auth_quirks_scram() {
        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server));
+        let mut stream = PqStream::new(Stream::from_raw(server));

        let ctx = RequestContext::test();
        let api = Auth {
@@ -612,7 +784,7 @@ mod tests {
    #[tokio::test]
    async fn auth_quirks_cleartext() {
        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server));
+        let mut stream = PqStream::new(Stream::from_raw(server));

        let ctx = RequestContext::test();
        let api = Auth {
@@ -666,7 +838,7 @@ mod tests {
    #[tokio::test]
    async fn auth_quirks_password_hack() {
        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server));
+        let mut stream = PqStream::new(Stream::from_raw(server));

        let ctx = RequestContext::test();
        let api = Auth {
@@ -715,7 +887,7 @@ mod tests {
        .await
        .unwrap();

-        assert_eq!(creds.info.endpoint, "my-endpoint");
+        assert_eq!(creds.0.info.endpoint, "my-endpoint");

        handle.await.unwrap();
    }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -5,6 +5,7 @@ use std::net::IpAddr;
 use std::str::FromStr;

 use itertools::Itertools;
+use pq_proto::StartupMessageParams;
 use thiserror::Error;
 use tracing::{debug, warn};

@@ -12,7 +13,6 @@ use crate::auth::password_hack::parse_endpoint_param;
 use crate::context::RequestContext;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, SniGroup, SniKind};
-use crate::pqproto::StartupMessageParams;
 use crate::proxy::NeonOptions;
 use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI};
 use crate::types::{EndpointId, RoleName};
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -1,8 +1,10 @@
 //! Main authentication flow.

+use std::io;
 use std::sync::Arc;

 use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
+use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;

@@ -11,26 +13,35 @@ use super::{AuthError, PasswordHackPayload};
 use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
-use crate::pqproto::{BeAuthenticationSaslMessage, BeMessage};
 use crate::sasl;
 use crate::scram::threadpool::ThreadPool;
 use crate::scram::{self};
 use crate::stream::{PqStream, Stream};
 use crate::tls::TlsServerEndPoint;

+/// Every authentication selector is supposed to implement this trait.
+pub(crate) trait AuthMethod {
+    /// Any authentication selector should provide the initial backend message
+    /// containing the auth method name and parameters, e.g. md5 salt.
+    fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
+}
+
+/// Initial state of [`AuthFlow`].
+pub(crate) struct Begin;
+
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
 pub(crate) struct Scram<'a>(
    pub(crate) &'a scram::ServerSecret,
    pub(crate) &'a RequestContext,
 );

-impl Scram<'_> {
+impl AuthMethod for Scram<'_> {
    #[inline(always)]
    fn first_message(&self, channel_binding: bool) -> BeMessage<'_> {
        if channel_binding {
-            BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
+            Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
        } else {
-            BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(
+            Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(
                scram::METHODS_WITHOUT_PLUS,
            ))
        }
@@ -41,6 +52,13 @@ impl Scram<'_> {
 /// <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
 pub(crate) struct PasswordHack;

+impl AuthMethod for PasswordHack {
+    #[inline(always)]
+    fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
+        Be::AuthenticationCleartextPassword
+    }
+}
+
 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
 pub(crate) struct CleartextPassword {
@@ -49,37 +67,53 @@ pub(crate) struct CleartextPassword {
    pub(crate) secret: AuthSecret,
 }

+impl AuthMethod for CleartextPassword {
+    #[inline(always)]
+    fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
+        Be::AuthenticationCleartextPassword
+    }
+}
+
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
 pub(crate) struct AuthFlow<'a, S, State> {
    /// The underlying stream which implements libpq's protocol.
    stream: &'a mut PqStream<Stream<S>>,
-    /// State might contain ancillary data.
+    /// State might contain ancillary data (see [`Self::begin`]).
    state: State,
    tls_server_end_point: TlsServerEndPoint,
 }

 /// Initial state of the stream wrapper.
-impl<'a, S: AsyncRead + AsyncWrite + Unpin, M> AuthFlow<'a, S, M> {
+impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
    /// Create a new wrapper for client authentication.
-    pub(crate) fn new(stream: &'a mut PqStream<Stream<S>>, method: M) -> Self {
+    pub(crate) fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
        let tls_server_end_point = stream.get_ref().tls_server_end_point();

        Self {
            stream,
-            state: method,
+            state: Begin,
            tls_server_end_point,
        }
    }
+
+    /// Move to the next step by sending auth method's name & params to client.
+    pub(crate) async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
+        self.stream
+            .write_message(&method.first_message(self.tls_server_end_point.supported()))
+            .await?;
+
+        Ok(AuthFlow {
+            stream: self.stream,
+            state: method,
+            tls_server_end_point: self.tls_server_end_point,
+        })
+    }
 }

 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
    /// Perform user authentication. Raise an error in case authentication failed.
    pub(crate) async fn get_password(self) -> super::Result<PasswordHackPayload> {
-        self.stream
-            .write_message(BeMessage::AuthenticationCleartextPassword);
-        self.stream.flush().await?;
-
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
@@ -99,10 +133,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
    /// Perform user authentication. Raise an error in case authentication failed.
    pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
-        self.stream
-            .write_message(BeMessage::AuthenticationCleartextPassword);
-        self.stream.flush().await?;
-
        let msg = self.stream.read_password_message().await?;
        let password = msg
            .strip_suffix(&[0])
@@ -117,7 +147,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
        .await?;

        if let sasl::Outcome::Success(_) = &outcome {
-            self.stream.write_message(BeMessage::AuthenticationOk);
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
        }

        Ok(outcome)
@@ -129,36 +159,42 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
    /// Perform user authentication. Raise an error in case authentication failed.
    pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
        let Scram(secret, ctx) = self.state;
-        let channel_binding = self.tls_server_end_point;

-        // send sasl message.
-        {
-            // pause the timer while we communicate with the client
-            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+        // pause the timer while we communicate with the client
+        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

-            let sasl = self.state.first_message(channel_binding.supported());
-            self.stream.write_message(sasl);
-            self.stream.flush().await?;
+        // Initial client message contains the chosen auth method's name.
+        let msg = self.stream.read_password_message().await?;
+        let sasl = sasl::FirstMessage::parse(&msg)
+            .ok_or(AuthError::MalformedPassword("bad sasl message"))?;
+
+        // Currently, the only supported SASL method is SCRAM.
+        if !scram::METHODS.contains(&sasl.method) {
+            return Err(super::AuthError::bad_auth_method(sasl.method));
        }

-        // complete sasl handshake.
-        sasl::authenticate(ctx, self.stream, |method| {
-            // Currently, the only supported SASL method is SCRAM.
-            match method {
-                SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
-                SCRAM_SHA_256_PLUS => {
-                    ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus);
-                }
-                method => return Err(sasl::Error::BadAuthMethod(method.into())),
-            }
+        match sasl.method {
+            SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
+            _ => {}
+        }

-            // TODO: make this a metric instead
-            info!("client chooses {}", method);
+        // TODO: make this a metric instead
+        info!("client chooses {}", sasl.method);

-            Ok(scram::Exchange::new(secret, rand::random, channel_binding))
-        })
-        .await
-        .map_err(AuthError::Sasl)
+        let outcome = sasl::SaslStream::new(self.stream, sasl.message)
+            .authenticate(scram::Exchange::new(
+                secret,
+                rand::random,
+                self.tls_server_end_point,
+            ))
+            .await?;
+
+        if let sasl::Outcome::Success(_) = &outcome {
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
+        Ok(outcome)
    }
 }

--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -32,7 +32,9 @@ use crate::ext::TaskExt;
 use crate::http::health_server::AppMetrics;
 use crate::intern::RoleNameInt;
 use crate::metrics::{Metrics, ThreadPoolMetrics};
-use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
+use crate::rate_limiter::{
+    BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo,
+};
 use crate::scram::threadpool::ThreadPool;
 use crate::serverless::cancel_set::CancelSet;
 use crate::serverless::{self, GlobalConnPoolOptions};
@@ -67,6 +69,15 @@ struct LocalProxyCliArgs {
    /// Can be given multiple times for different bucket sizes.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    user_rps_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet to use when considering whether two IP addresses are considered the same.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
    /// Whether to retry the connection to the compute node
    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
    connect_to_compute_retry: String,
@@ -271,6 +282,9 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
            jwks_cache: JwkCache::default(),
            thread_pool: ThreadPool::new(0),
            scram_protocol_timeout: Duration::from_secs(10),
+            rate_limiter_enabled: false,
+            rate_limiter: BucketRateLimiter::new(vec![]),
+            rate_limit_ip_subnet: 64,
            ip_allowlist_check_enabled: true,
            is_vpc_acccess_proxy: false,
            is_auth_broker: false,
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -4,9 +4,8 @@
 //! This allows connecting to pods/services running in the same Kubernetes cluster from
 //! the outside. Similar to an ingress controller for HTTPS.

-use std::net::SocketAddr;
 use std::path::Path;
-use std::sync::Arc;
+use std::{net::SocketAddr, sync::Arc};

 use anyhow::{Context, anyhow, bail, ensure};
 use clap::Arg;
@@ -18,7 +17,6 @@ use rustls::pki_types::{DnsName, PrivateKeyDer};
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio::net::TcpListener;
 use tokio_rustls::TlsConnector;
-use tokio_rustls::server::TlsStream;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, error, info};
 use utils::project_git_version;
@@ -26,12 +24,10 @@ use utils::sentry_init::init_sentry;

 use crate::context::RequestContext;
 use crate::metrics::{Metrics, ThreadPoolMetrics};
-use crate::pqproto::FeStartupPacket;
 use crate::protocol2::ConnectionInfo;
-use crate::proxy::{
-    ErrorSource, TlsRequired, copy_bidirectional_client_compute, run_until_cancelled,
-};
+use crate::proxy::{ErrorSource, copy_bidirectional_client_compute, run_until_cancelled};
 use crate::stream::{PqStream, Stream};
+use crate::tls::TlsServerEndPoint;

 project_git_version!(GIT_VERSION);

@@ -88,7 +84,7 @@ pub async fn run() -> anyhow::Result<()> {
        .parse()?;

    // Configure TLS
-    let tls_config = match (
+    let (tls_config, tls_server_end_point): (Arc<rustls::ServerConfig>, TlsServerEndPoint) = match (
        args.get_one::<String>("tls-key"),
        args.get_one::<String>("tls-cert"),
    ) {
@@ -121,6 +117,7 @@ pub async fn run() -> anyhow::Result<()> {
        dest.clone(),
        tls_config.clone(),
        None,
+        tls_server_end_point,
        proxy_listener,
        cancellation_token.clone(),
    ))
@@ -130,6 +127,7 @@ pub async fn run() -> anyhow::Result<()> {
        dest,
        tls_config,
        Some(compute_tls_config),
+        tls_server_end_point,
        proxy_listener_compute_tls,
        cancellation_token.clone(),
    ))
@@ -156,7 +154,7 @@ pub async fn run() -> anyhow::Result<()> {
 pub(super) fn parse_tls(
    key_path: &Path,
    cert_path: &Path,
-) -> anyhow::Result<Arc<rustls::ServerConfig>> {
+) -> anyhow::Result<(Arc<rustls::ServerConfig>, TlsServerEndPoint)> {
    let key = {
        let key_bytes = std::fs::read(key_path).context("TLS key file")?;

@@ -189,6 +187,10 @@ pub(super) fn parse_tls(
            })?
    };

+    // needed for channel bindings
+    let first_cert = cert_chain.first().context("missing certificate")?;
+    let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
+
    let tls_config =
        rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
            .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
@@ -197,13 +199,14 @@ pub(super) fn parse_tls(
            .with_single_cert(cert_chain, key)?
            .into();

-    Ok(tls_config)
+    Ok((tls_config, tls_server_end_point))
 }

 pub(super) async fn task_main(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    compute_tls_config: Option<Arc<rustls::ClientConfig>>,
+    tls_server_end_point: TlsServerEndPoint,
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
 ) -> anyhow::Result<()> {
@@ -239,7 +242,15 @@ pub(super) async fn task_main(
                    crate::metrics::Protocol::SniRouter,
                    "sni",
                );
-                handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await
+                handle_client(
+                    ctx,
+                    dest_suffix,
+                    tls_config,
+                    compute_tls_config,
+                    tls_server_end_point,
+                    socket,
+                )
+                .await
            }
            .unwrap_or_else(|e| {
                // Acknowledge that the task has finished with an error.
@@ -258,26 +269,55 @@ pub(super) async fn task_main(
    Ok(())
 }

+const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
+
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
    ctx: &RequestContext,
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
-) -> anyhow::Result<TlsStream<S>> {
-    let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)).await?;
-    match msg {
-        FeStartupPacket::SslRequest { direct: None } => {
-            let raw = stream.accept_tls().await?;
+    tls_server_end_point: TlsServerEndPoint,
+) -> anyhow::Result<Stream<S>> {
+    let mut stream = PqStream::new(Stream::from_raw(raw_stream));

-            Ok(raw
-                .upgrade(tls_config, !ctx.has_private_peer_addr())
-                .await?)
+    let msg = stream.read_startup_packet().await?;
+    use pq_proto::FeStartupPacket::SslRequest;
+
+    match msg {
+        SslRequest { direct: false } => {
+            stream
+                .write_message(&pq_proto::BeMessage::EncryptionResponse(true))
+                .await?;
+
+            // Upgrade raw stream into a secure TLS-backed stream.
+            // NOTE: `tls_config` is moved into the `upgrade` call below.
+
+            let (raw, read_buf) = stream.into_inner();
+            // TODO: Normally, the client doesn't send any data before the
+            // server replies with EncryptionResponse, so read_buf is empty.
+            // However, you could imagine pipelining of postgres
+            // SSLRequest + TLS ClientHello in one chunk, similar to
+            // pipelining in our node js driver. We should probably
+            // support that by chaining read_buf with the stream.
+            if !read_buf.is_empty() {
+                bail!("data is sent before server replied with EncryptionResponse");
+            }
+
+            Ok(Stream::Tls {
+                tls: Box::new(
+                    raw.upgrade(tls_config, !ctx.has_private_peer_addr())
+                        .await?,
+                ),
+                tls_server_end_point,
+            })
        }
        unexpected => {
            info!(
                ?unexpected,
                "unexpected startup packet, rejecting connection"
            );
-            Err(stream.throw_error(TlsRequired, None).await)?
+            stream
+                .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User, None)
+                .await?
        }
    }
 }
@@ -287,18 +327,15 @@ async fn handle_client(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    compute_tls_config: Option<Arc<rustls::ClientConfig>>,
+    tls_server_end_point: TlsServerEndPoint,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let mut tls_stream = ssl_handshake(&ctx, stream, tls_config).await?;
+    let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
    //   `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain`
-    let sni = tls_stream
-        .get_ref()
-        .1
-        .server_name()
-        .ok_or(anyhow!("SNI missing"))?;
+    let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?;
    let dest: Vec<&str> = sni
        .split_once('.')
        .context("invalid SNI")?
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -20,7 +20,7 @@ use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version};

 use crate::auth::backend::jwt::JwkCache;
-use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
+use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
 use crate::cancellation::{CancellationHandler, handle_cancel_messages};
 use crate::config::{
    self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
@@ -29,7 +29,9 @@ use crate::config::{
 use crate::context::parquet::ParquetUploadArgs;
 use crate::http::health_server::AppMetrics;
 use crate::metrics::Metrics;
-use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
+use crate::rate_limiter::{
+    EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter,
+};
 use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::redis::kv_ops::RedisKVClient;
 use crate::redis::{elasticache, notifications};
@@ -152,6 +154,15 @@ struct ProxyCliArgs {
    /// Wake compute rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
    wake_compute_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet to use when considering whether two IP addresses are considered the same.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
    /// Redis rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
@@ -399,9 +410,22 @@ pub async fn run() -> anyhow::Result<()> {
        Some(tx_cancel),
    ));

+    // bit of a hack - find the min and max rps among the configured buckets and
+    // use them as the leaky bucket config's `rps` and `max` respectively
+    let max = args
+        .endpoint_rps_limit
+        .iter()
+        .map(|x| x.rps())
+        .max_by(f64::total_cmp)
+        .unwrap_or(EndpointRateLimiter::DEFAULT.max);
+    let rps = args
+        .endpoint_rps_limit
+        .iter()
+        .map(|x| x.rps())
+        .min_by(f64::total_cmp)
+        .unwrap_or(EndpointRateLimiter::DEFAULT.rps);
    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-        RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
-            .unwrap_or(EndpointRateLimiter::DEFAULT),
+        LeakyBucketConfig { rps, max },
        64,
    ));

@@ -452,7 +476,8 @@ pub async fn run() -> anyhow::Result<()> {
        let key_path = args.tls_key.expect("already asserted it is set");
        let cert_path = args.tls_cert.expect("already asserted it is set");

-        let tls_config = super::pg_sni_router::parse_tls(&key_path, &cert_path)?;
+        let (tls_config, tls_server_end_point) =
+            super::pg_sni_router::parse_tls(&key_path, &cert_path)?;

        let dest = Arc::new(dest);

@@ -460,6 +485,7 @@ pub async fn run() -> anyhow::Result<()> {
            dest.clone(),
            tls_config.clone(),
            None,
+            tls_server_end_point,
            listen,
            cancellation_token.clone(),
        ));
@@ -468,6 +494,7 @@ pub async fn run() -> anyhow::Result<()> {
            dest,
            tls_config,
            Some(config.connect_to_compute.tls.clone()),
+            tls_server_end_point,
            listen_tls,
            cancellation_token.clone(),
        ));
@@ -654,6 +681,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        jwks_cache: JwkCache::default(),
        thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
+        rate_limiter_enabled: args.auth_rate_limit_enabled,
+        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
+        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
        ip_allowlist_check_enabled: !args.is_private_access_proxy,
        is_vpc_acccess_proxy: args.is_private_access_proxy,
        is_auth_broker: args.is_auth_broker,
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -1,25 +1,30 @@
-use std::collections::{HashMap, HashSet, hash_map};
+use std::collections::HashSet;
 use std::convert::Infallible;
+use std::sync::Arc;
 use std::sync::atomic::AtomicU64;
 use std::time::Duration;

 use async_trait::async_trait;
 use clashmap::ClashMap;
-use clashmap::mapref::one::Ref;
 use rand::{Rng, thread_rng};
+use smol_str::SmolStr;
 use tokio::sync::Mutex;
 use tokio::time::Instant;
 use tracing::{debug, info};

+use super::{Cache, Cached};
+use crate::auth::IpPattern;
 use crate::config::ProjectInfoCacheOptions;
-use crate::control_plane::{EndpointAccessControl, RoleAccessControl};
+use crate::control_plane::{AccessBlockerFlags, AuthSecret};
 use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::types::{EndpointId, RoleName};

 #[async_trait]
 pub(crate) trait ProjectInfoCache {
-    fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt);
-    fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt);
+    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
+    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>);
+    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt);
+    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt);
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
    async fn decrement_active_listeners(&self);
    async fn increment_active_listeners(&self);
@@ -37,10 +42,6 @@ impl<T> Entry<T> {
            value,
        }
    }
-
-    pub(crate) fn get(&self, valid_since: Instant) -> Option<&T> {
-        (valid_since < self.created_at).then_some(&self.value)
-    }
 }

 impl<T> From<T> for Entry<T> {
@@ -49,32 +50,101 @@ impl<T> From<T> for Entry<T> {
    }
 }

+#[derive(Default)]
 struct EndpointInfo {
-    role_controls: HashMap<RoleNameInt, Entry<RoleAccessControl>>,
-    controls: Option<Entry<EndpointAccessControl>>,
+    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
+    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
+    block_public_or_vpc_access: Option<Entry<AccessBlockerFlags>>,
+    allowed_vpc_endpoint_ids: Option<Entry<Arc<Vec<String>>>>,
 }

 impl EndpointInfo {
+    fn check_ignore_cache(ignore_cache_since: Option<Instant>, created_at: Instant) -> bool {
+        match ignore_cache_since {
+            None => false,
+            Some(t) => t < created_at,
+        }
+    }
    pub(crate) fn get_role_secret(
        &self,
        role_name: RoleNameInt,
        valid_since: Instant,
-    ) -> Option<RoleAccessControl> {
-        let controls = self.role_controls.get(&role_name)?;
-        controls.get(valid_since).cloned()
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(Option<AuthSecret>, bool)> {
+        if let Some(secret) = self.secret.get(&role_name) {
+            if valid_since < secret.created_at {
+                return Some((
+                    secret.value.clone(),
+                    Self::check_ignore_cache(ignore_cache_since, secret.created_at),
+                ));
+            }
+        }
+        None
    }

-    pub(crate) fn get_controls(&self, valid_since: Instant) -> Option<EndpointAccessControl> {
-        let controls = self.controls.as_ref()?;
-        controls.get(valid_since).cloned()
+    pub(crate) fn get_allowed_ips(
+        &self,
+        valid_since: Instant,
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(Arc<Vec<IpPattern>>, bool)> {
+        if let Some(allowed_ips) = &self.allowed_ips {
+            if valid_since < allowed_ips.created_at {
+                return Some((
+                    allowed_ips.value.clone(),
+                    Self::check_ignore_cache(ignore_cache_since, allowed_ips.created_at),
+                ));
+            }
+        }
+        None
+    }
+    pub(crate) fn get_allowed_vpc_endpoint_ids(
+        &self,
+        valid_since: Instant,
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(Arc<Vec<String>>, bool)> {
+        if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids {
+            if valid_since < allowed_vpc_endpoint_ids.created_at {
+                return Some((
+                    allowed_vpc_endpoint_ids.value.clone(),
+                    Self::check_ignore_cache(
+                        ignore_cache_since,
+                        allowed_vpc_endpoint_ids.created_at,
+                    ),
+                ));
+            }
+        }
+        None
+    }
+    pub(crate) fn get_block_public_or_vpc_access(
+        &self,
+        valid_since: Instant,
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(AccessBlockerFlags, bool)> {
+        if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access {
+            if valid_since < block_public_or_vpc_access.created_at {
+                return Some((
+                    block_public_or_vpc_access.value.clone(),
+                    Self::check_ignore_cache(
+                        ignore_cache_since,
+                        block_public_or_vpc_access.created_at,
+                    ),
+                ));
+            }
+        }
+        None
    }

-    pub(crate) fn invalidate_endpoint(&mut self) {
-        self.controls = None;
+    pub(crate) fn invalidate_allowed_ips(&mut self) {
+        self.allowed_ips = None;
+    }
+    pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) {
+        self.allowed_vpc_endpoint_ids = None;
+    }
+    pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) {
+        self.block_public_or_vpc_access = None;
    }
-
    pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
-        self.role_controls.remove(&role_name);
+        self.secret.remove(&role_name);
    }
 }

@@ -100,22 +170,34 @@ pub struct ProjectInfoCacheImpl {

 #[async_trait]
 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt) {
-        info!("invalidating endpoint access for project `{project_id}`");
-        let endpoints = self
-            .project2ep
-            .get(&project_id)
-            .map(|kv| kv.value().clone())
-            .unwrap_or_default();
-        for endpoint_id in endpoints {
-            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_endpoint();
+    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>) {
+        info!(
+            "invalidating allowed vpc endpoint ids for projects `{}`",
+            project_ids
+                .iter()
+                .map(|id| id.to_string())
+                .collect::<Vec<_>>()
+                .join(", ")
+        );
+        for project_id in project_ids {
+            let endpoints = self
+                .project2ep
+                .get(&project_id)
+                .map(|kv| kv.value().clone())
+                .unwrap_or_default();
+            for endpoint_id in endpoints {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                    endpoint_info.invalidate_allowed_vpc_endpoint_ids();
+                }
            }
        }
    }

-    fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt) {
-        info!("invalidating endpoint access for org `{account_id}`");
+    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) {
+        info!(
+            "invalidating allowed vpc endpoint ids for org `{}`",
+            account_id
+        );
        let endpoints = self
            .account2ep
            .get(&account_id)
@@ -123,11 +205,41 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            .unwrap_or_default();
        for endpoint_id in endpoints {
            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_endpoint();
+                endpoint_info.invalidate_allowed_vpc_endpoint_ids();
            }
        }
    }

+    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) {
+        info!(
+            "invalidating block public or vpc access for project `{}`",
+            project_id
+        );
+        let endpoints = self
+            .project2ep
+            .get(&project_id)
+            .map(|kv| kv.value().clone())
+            .unwrap_or_default();
+        for endpoint_id in endpoints {
+            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                endpoint_info.invalidate_block_public_or_vpc_access();
+            }
+        }
+    }
+
+    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
+        info!("invalidating allowed ips for project `{}`", project_id);
+        let endpoints = self
+            .project2ep
+            .get(&project_id)
+            .map(|kv| kv.value().clone())
+            .unwrap_or_default();
+        for endpoint_id in endpoints {
+            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                endpoint_info.invalidate_allowed_ips();
+            }
+        }
+    }
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) {
        info!(
            "invalidating role secret for project_id `{}` and role_name `{}`",
@@ -144,7 +256,6 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            }
        }
    }
-
    async fn decrement_active_listeners(&self) {
        let mut listeners_guard = self.active_listeners_lock.lock().await;
        if *listeners_guard == 0 {
@@ -182,72 +293,156 @@ impl ProjectInfoCacheImpl {
        }
    }

-    fn get_endpoint_cache(
-        &self,
-        endpoint_id: &EndpointId,
-    ) -> Option<Ref<'_, EndpointIdInt, EndpointInfo>> {
-        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
-        self.cache.get(&endpoint_id)
-    }
-
    pub(crate) fn get_role_secret(
        &self,
        endpoint_id: &EndpointId,
        role_name: &RoleName,
-    ) -> Option<RoleAccessControl> {
-        let valid_since = self.get_cache_times();
+    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
        let role_name = RoleNameInt::get(role_name)?;
-        let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_role_secret(role_name, valid_since)
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let (value, ignore_cache) =
+            endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((
+                    self,
+                    CachedLookupInfo::new_role_secret(endpoint_id, role_name),
+                )),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
    }
-
-    pub(crate) fn get_endpoint_access(
+    pub(crate) fn get_allowed_ips(
        &self,
        endpoint_id: &EndpointId,
-    ) -> Option<EndpointAccessControl> {
-        let valid_since = self.get_cache_times();
-        let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_controls(valid_since)
+    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
+        let (value, ignore_cache) = value?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
+    }
+    pub(crate) fn get_allowed_vpc_endpoint_ids(
+        &self,
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, Arc<Vec<String>>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since);
+        let (value, ignore_cache) = value?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((
+                    self,
+                    CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id),
+                )),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
+    }
+    pub(crate) fn get_block_public_or_vpc_access(
+        &self,
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, AccessBlockerFlags>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since);
+        let (value, ignore_cache) = value?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((
+                    self,
+                    CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id),
+                )),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
    }

-    pub(crate) fn insert_endpoint_access(
+    pub(crate) fn insert_role_secret(
        &self,
-        account_id: Option<AccountIdInt>,
        project_id: ProjectIdInt,
        endpoint_id: EndpointIdInt,
        role_name: RoleNameInt,
-        controls: EndpointAccessControl,
-        role_controls: RoleAccessControl,
+        secret: Option<AuthSecret>,
    ) {
-        if let Some(account_id) = account_id {
-            self.insert_account2endpoint(account_id, endpoint_id);
-        }
-        self.insert_project2endpoint(project_id, endpoint_id);
-
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
        }
-
-        let controls = Entry::from(controls);
-        let role_controls = Entry::from(role_controls);
-
-        match self.cache.entry(endpoint_id) {
-            clashmap::Entry::Vacant(e) => {
-                e.insert(EndpointInfo {
-                    role_controls: HashMap::from_iter([(role_name, role_controls)]),
-                    controls: Some(controls),
-                });
-            }
-            clashmap::Entry::Occupied(mut e) => {
-                let ep = e.get_mut();
-                ep.controls = Some(controls);
-                if ep.role_controls.len() < self.config.max_roles {
-                    ep.role_controls.insert(role_name, role_controls);
-                }
-            }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        let mut entry = self.cache.entry(endpoint_id).or_default();
+        if entry.secret.len() < self.config.max_roles {
+            entry.secret.insert(role_name, secret.into());
        }
    }
+    pub(crate) fn insert_allowed_ips(
+        &self,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        allowed_ips: Arc<Vec<IpPattern>>,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
+    }
+    pub(crate) fn insert_allowed_vpc_endpoint_ids(
+        &self,
+        account_id: Option<AccountIdInt>,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        allowed_vpc_endpoint_ids: Arc<Vec<String>>,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+        if let Some(account_id) = account_id {
+            self.insert_account2endpoint(account_id, endpoint_id);
+        }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache
+            .entry(endpoint_id)
+            .or_default()
+            .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into());
+    }
+    pub(crate) fn insert_block_public_or_vpc_access(
+        &self,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        access_blockers: AccessBlockerFlags,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache
+            .entry(endpoint_id)
+            .or_default()
+            .block_public_or_vpc_access = Some(access_blockers.into());
+    }

    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
@@ -257,7 +452,6 @@ impl ProjectInfoCacheImpl {
                .insert(project_id, HashSet::from([endpoint_id]));
        }
    }
-
    fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) {
        if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) {
            endpoints.insert(endpoint_id);
@@ -266,57 +460,21 @@ impl ProjectInfoCacheImpl {
                .insert(account_id, HashSet::from([endpoint_id]));
        }
    }
-
-    fn ignore_ttl_since(&self) -> Option<Instant> {
+    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
+        let mut valid_since = Instant::now() - self.config.ttl;
+        // Only ignore cache if ttl is disabled.
        let ttl_disabled_since_us = self
            .ttl_disabled_since_us
            .load(std::sync::atomic::Ordering::Relaxed);
-
-        if ttl_disabled_since_us == u64::MAX {
-            return None;
-        }
-
-        Some(self.start_time + Duration::from_micros(ttl_disabled_since_us))
-    }
-
-    fn get_cache_times(&self) -> Instant {
-        let mut valid_since = Instant::now() - self.config.ttl;
-        if let Some(ignore_ttl_since) = self.ignore_ttl_since() {
+        let ignore_cache_since = if ttl_disabled_since_us == u64::MAX {
+            None
+        } else {
+            let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
            // We are fine if entry is not older than ttl or was added before we are getting notifications.
-            valid_since = valid_since.min(ignore_ttl_since);
-        }
-        valid_since
-    }
-
-    pub fn maybe_invalidate_role_secret(&self, endpoint_id: &EndpointId, role_name: &RoleName) {
-        let Some(endpoint_id) = EndpointIdInt::get(endpoint_id) else {
-            return;
+            valid_since = valid_since.min(ignore_cache_since);
+            Some(ignore_cache_since)
        };
-        let Some(role_name) = RoleNameInt::get(role_name) else {
-            return;
-        };
-
-        let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) else {
-            return;
-        };
-
-        let entry = endpoint_info.role_controls.entry(role_name);
-        let hash_map::Entry::Occupied(role_controls) = entry else {
-            return;
-        };
-
-        let created_at = role_controls.get().created_at;
-        let expire = match self.ignore_ttl_since() {
-            // if ignoring TTL, we should still try and roll the password if it's old
-            // and we the client gave an incorrect password. There could be some lag on the redis channel.
-            Some(_) => created_at + self.config.ttl < Instant::now(),
-            // edge case: redis is down, let's be generous and invalidate the cache immediately.
-            None => true,
-        };
-
-        if expire {
-            role_controls.remove();
-        }
+        (valid_since, ignore_cache_since)
    }

    pub async fn gc_worker(&self) -> anyhow::Result<Infallible> {
@@ -351,12 +509,84 @@ impl ProjectInfoCacheImpl {
    }
 }

+/// Lookup info for project info cache.
+/// This is used to invalidate cache entries.
+pub(crate) struct CachedLookupInfo {
+    /// Search by this key.
+    endpoint_id: EndpointIdInt,
+    lookup_type: LookupType,
+}
+
+impl CachedLookupInfo {
+    pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::RoleSecret(role_name),
+        }
+    }
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::AllowedIps,
+        }
+    }
+    pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::AllowedVpcEndpointIds,
+        }
+    }
+    pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::BlockPublicOrVpcAccess,
+        }
+    }
+}
+
+enum LookupType {
+    RoleSecret(RoleNameInt),
+    AllowedIps,
+    AllowedVpcEndpointIds,
+    BlockPublicOrVpcAccess,
+}
+
+impl Cache for ProjectInfoCacheImpl {
+    type Key = SmolStr;
+    // Value is not really used here, but we need to specify it.
+    type Value = SmolStr;
+
+    type LookupInfo<Key> = CachedLookupInfo;
+
+    fn invalidate(&self, key: &Self::LookupInfo<SmolStr>) {
+        match &key.lookup_type {
+            LookupType::RoleSecret(role_name) => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_role_secret(*role_name);
+                }
+            }
+            LookupType::AllowedIps => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_allowed_ips();
+                }
+            }
+            LookupType::AllowedVpcEndpointIds => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_allowed_vpc_endpoint_ids();
+                }
+            }
+            LookupType::BlockPublicOrVpcAccess => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_block_public_or_vpc_access();
+                }
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
    use super::*;
-    use crate::control_plane::{AccessBlockerFlags, AuthSecret};
    use crate::scram::ServerSecret;
    use crate::types::ProjectId;

@@ -371,8 +601,6 @@ mod tests {
        });
        let project_id: ProjectId = "project".into();
        let endpoint_id: EndpointId = "endpoint".into();
-        let account_id: Option<AccountIdInt> = None;
-
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -381,73 +609,183 @@ mod tests {
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
        ]);
-
-        cache.insert_endpoint_access(
-            account_id,
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user1).into(),
-            EndpointAccessControl {
-                allowed_ips: allowed_ips.clone(),
-                allowed_vpce: Arc::new(vec![]),
-                flags: AccessBlockerFlags::default(),
-            },
-            RoleAccessControl {
-                secret: secret1.clone(),
-            },
+            secret1.clone(),
        );
-
-        cache.insert_endpoint_access(
-            account_id,
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user2).into(),
-            EndpointAccessControl {
-                allowed_ips: allowed_ips.clone(),
-                allowed_vpce: Arc::new(vec![]),
-                flags: AccessBlockerFlags::default(),
-            },
-            RoleAccessControl {
-                secret: secret2.clone(),
-            },
+            secret2.clone(),
+        );
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
        );

        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert_eq!(cached.secret, secret1);
-
+        assert!(cached.cached());
+        assert_eq!(cached.value, secret1);
        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert_eq!(cached.secret, secret2);
+        assert!(cached.cached());
+        assert_eq!(cached.value, secret2);

        // Shouldn't add more than 2 roles.
        let user3: RoleName = "user3".into();
        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
-
-        cache.insert_endpoint_access(
-            account_id,
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user3).into(),
-            EndpointAccessControl {
-                allowed_ips: allowed_ips.clone(),
-                allowed_vpce: Arc::new(vec![]),
-                flags: AccessBlockerFlags::default(),
-            },
-            RoleAccessControl {
-                secret: secret3.clone(),
-            },
+            secret3.clone(),
        );
-
        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());

-        let cached = cache.get_endpoint_access(&endpoint_id).unwrap();
-        assert_eq!(cached.allowed_ips, allowed_ips);
+        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
+        assert!(cached.cached());
+        assert_eq!(cached.value, allowed_ips);

        tokio::time::advance(Duration::from_secs(2)).await;
        let cached = cache.get_role_secret(&endpoint_id, &user1);
        assert!(cached.is_none());
        let cached = cache.get_role_secret(&endpoint_id, &user2);
        assert!(cached.is_none());
-        let cached = cache.get_endpoint_access(&endpoint_id);
+        let cached = cache.get_allowed_ips(&endpoint_id);
        assert!(cached.is_none());
    }
+
+    #[tokio::test]
+    async fn test_project_info_cache_invalidations() {
+        tokio::time::pause();
+        let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
+            size: 2,
+            max_roles: 2,
+            ttl: Duration::from_secs(1),
+            gc_interval: Duration::from_secs(600),
+        }));
+        cache.clone().increment_active_listeners().await;
+        tokio::time::advance(Duration::from_secs(2)).await;
+
+        let project_id: ProjectId = "project".into();
+        let endpoint_id: EndpointId = "endpoint".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user1).into(),
+            secret1.clone(),
+        );
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user2).into(),
+            secret2.clone(),
+        );
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
+        );
+
+        tokio::time::advance(Duration::from_secs(2)).await;
+        // Nothing should be invalidated.
+
+        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
+        // TTL is disabled, so it should be impossible to invalidate this value.
+        assert!(!cached.cached());
+        assert_eq!(cached.value, secret1);
+
+        cached.invalidate(); // Shouldn't do anything.
+        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
+        assert_eq!(cached.value, secret1);
+
+        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
+        assert!(!cached.cached());
+        assert_eq!(cached.value, secret2);
+
+        // The only way to invalidate this value is to invalidate via the api.
+        cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into());
+        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
+
+        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
+        assert!(!cached.cached());
+        assert_eq!(cached.value, allowed_ips);
+    }
+
+    #[tokio::test]
+    async fn test_increment_active_listeners_invalidate_added_before() {
+        tokio::time::pause();
+        let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
+            size: 2,
+            max_roles: 2,
+            ttl: Duration::from_secs(1),
+            gc_interval: Duration::from_secs(600),
+        }));
+
+        let project_id: ProjectId = "project".into();
+        let endpoint_id: EndpointId = "endpoint".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user1).into(),
+            secret1.clone(),
+        );
+        cache.clone().increment_active_listeners().await;
+        tokio::time::advance(Duration::from_millis(100)).await;
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user2).into(),
+            secret2.clone(),
+        );
+
+        // Added before ttl was disabled + ttl should still be cached.
+        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
+        assert!(cached.cached());
+        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
+        assert!(cached.cached());
+
+        tokio::time::advance(Duration::from_secs(1)).await;
+        // Added before ttl was disabled + ttl should expire.
+        assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
+        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
+
+        // Added after ttl was disabled + ttl should not be cached.
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
+        );
+        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
+        assert!(!cached.cached());
+
+        tokio::time::advance(Duration::from_secs(1)).await;
+        // Added before ttl was disabled + ttl still should expire.
+        assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
+        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
+        // Shouldn't be invalidated.
+
+        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
+        assert!(!cached.cached());
+        assert_eq!(cached.value, allowed_ips);
+    }
 }
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -5,6 +5,7 @@ use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
+use pq_proto::CancelKeyData;
 use redis::{Cmd, FromRedisValue, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
@@ -12,15 +13,15 @@ use tokio::net::TcpStream;
 use tokio::sync::{mpsc, oneshot};
 use tracing::{debug, error, info, warn};

-use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
+use crate::auth::{AuthError, check_peer_addr_is_in_list};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
 use crate::error::ReportableError;
 use crate::ext::LockExt;
 use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind};
-use crate::pqproto::CancelKeyData;
+use crate::protocol2::ConnectionInfoExtra;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
 use crate::redis::kv_ops::RedisKVClient;
@@ -271,7 +272,13 @@ pub(crate) enum CancelError {
    #[error("rate limit exceeded")]
    RateLimit,

-    #[error("Authentication error")]
+    #[error("IP is not allowed")]
+    IpNotAllowed,
+
+    #[error("VPC endpoint id is not allowed to connect")]
+    VpcEndpointIdNotAllowed,
+
+    #[error("Authentication backend error")]
    AuthError(#[from] AuthError),

    #[error("key not found")]
@@ -290,7 +297,10 @@ impl ReportableError for CancelError {
            }
            CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
            CancelError::RateLimit => crate::error::ErrorKind::RateLimit,
-            CancelError::NotFound | CancelError::AuthError(_) => crate::error::ErrorKind::User,
+            CancelError::IpNotAllowed
+            | CancelError::VpcEndpointIdNotAllowed
+            | CancelError::NotFound => crate::error::ErrorKind::User,
+            CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane,
            CancelError::InternalError => crate::error::ErrorKind::Service,
        }
    }
@@ -412,13 +422,7 @@ impl CancellationHandler {
            IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here
            IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
        };
-
-        let allowed = {
-            let rate_limit_config = None;
-            let limiter = self.limiter.lock_propagate_poison();
-            limiter.check(subnet_key, rate_limit_config, 1)
-        };
-        if !allowed {
+        if !self.limiter.lock_propagate_poison().check(subnet_key, 1) {
            // log only the subnet part of the IP address to know which subnet is rate limited
            tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
            Metrics::get()
@@ -446,13 +450,52 @@ impl CancellationHandler {
            return Err(CancelError::NotFound);
        };

-        let info = &cancel_closure.user_info;
-        let access_controls = auth_backend
-            .get_endpoint_access_control(&ctx, &info.endpoint, &info.user)
+        if check_ip_allowed {
+            let ip_allowlist = auth_backend
+                .get_allowed_ips(&ctx, &cancel_closure.user_info)
+                .await
+                .map_err(|e| CancelError::AuthError(e.into()))?;
+
+            if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) {
+                // log it here since cancel_session could be spawned in a task
+                tracing::warn!(
+                    "IP is not allowed to cancel the query: {key}, address: {}",
+                    ctx.peer_addr()
+                );
+                return Err(CancelError::IpNotAllowed);
+            }
+        }
+
+        // check if a VPC endpoint ID is coming in and if yes, if it's allowed
+        let access_blocks = auth_backend
+            .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info)
            .await
            .map_err(|e| CancelError::AuthError(e.into()))?;

-        access_controls.check(&ctx, check_ip_allowed, check_vpc_allowed)?;
+        if check_vpc_allowed {
+            if access_blocks.vpc_access_blocked {
+                return Err(CancelError::AuthError(AuthError::NetworkNotAllowed));
+            }
+
+            let incoming_vpc_endpoint_id = match ctx.extra() {
+                None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)),
+                Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(),
+                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+            };
+
+            let allowed_vpc_endpoint_ids = auth_backend
+                .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info)
+                .await
+                .map_err(|e| CancelError::AuthError(e.into()))?;
+            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+            if !allowed_vpc_endpoint_ids.is_empty()
+                && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id)
+            {
+                return Err(CancelError::VpcEndpointIdNotAllowed);
+            }
+        } else if access_blocks.public_access_blocked {
+            return Err(CancelError::VpcEndpointIdNotAllowed);
+        }

        Metrics::get()
            .proxy
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -8,6 +8,7 @@ use itertools::Itertools;
 use postgres_client::tls::MakeTlsConnect;
 use postgres_client::{CancelToken, RawConnection};
 use postgres_protocol::message::backend::NoticeResponseBody;
+use pq_proto::StartupMessageParams;
 use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
 use tokio::net::{TcpStream, lookup_host};
@@ -23,7 +24,6 @@ use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::messages::MetricsAuxInfo;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, NumDbConnectionsGuard};
-use crate::pqproto::StartupMessageParams;
 use crate::proxy::neon_option;
 use crate::tls::postgres_rustls::MakeRustlsConnect;
 use crate::types::Host;
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -7,6 +7,7 @@ use arc_swap::ArcSwapOption;
 use clap::ValueEnum;
 use remote_storage::RemoteStorageConfig;

+use crate::auth::backend::AuthRateLimiter;
 use crate::auth::backend::jwt::JwkCache;
 use crate::control_plane::locks::ApiLocks;
 use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig};
@@ -64,6 +65,9 @@ pub struct HttpConfig {
 pub struct AuthenticationConfig {
    pub thread_pool: Arc<ThreadPool>,
    pub scram_protocol_timeout: tokio::time::Duration,
+    pub rate_limiter_enabled: bool,
+    pub rate_limiter: AuthRateLimiter,
+    pub rate_limit_ip_subnet: u8,
    pub ip_allowlist_check_enabled: bool,
    pub is_vpc_acccess_proxy: bool,
    pub jwks_cache: JwkCache,
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -1,7 +1,7 @@
 use std::sync::Arc;

 use futures::{FutureExt, TryFutureExt};
-use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info};

@@ -159,7 +159,7 @@ pub async fn task_main(
 }

 #[allow(clippy::too_many_arguments)]
-pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
+pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    backend: &'static ConsoleRedirectBackend,
    ctx: &RequestContext,
@@ -221,10 +221,12 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        .await
    {
        Ok(auth_result) => auth_result,
-        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
+        Err(e) => {
+            return stream.throw_error(e, Some(ctx)).await?;
+        }
    };

-    let node = connect_to_compute(
+    let mut node = connect_to_compute(
        ctx,
        &TcpMechanism {
            user_info,
@@ -236,7 +238,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
-    .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
+    .or_else(|e| stream.throw_error(e, Some(ctx)))
    .await?;

    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
@@ -244,8 +246,14 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(

    session.write_cancel_key(node.cancel_closure.clone())?;

-    prepare_client_connection(&node, *session.key(), &mut stream);
-    let stream = stream.flush_and_into_inner().await?;
+    prepare_client_connection(&node, *session.key(), &mut stream).await?;
+
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;

    Ok(Some(ProxyPassthrough {
        client: stream,
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -4,6 +4,7 @@ use std::net::IpAddr;

 use chrono::Utc;
 use once_cell::sync::OnceCell;
+use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use tokio::sync::mpsc;
 use tracing::field::display;
@@ -19,7 +20,6 @@ use crate::metrics::{
    ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
    Waiting,
 };
-use crate::pqproto::StartupMessageParams;
 use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};

@@ -370,18 +370,6 @@ impl RequestContext {
        }
    }

-    pub(crate) fn latency_timer_pause_at(
-        &self,
-        at: tokio::time::Instant,
-        waiting_for: Waiting,
-    ) -> LatencyTimerPause<'_> {
-        LatencyTimerPause {
-            ctx: self,
-            start: at,
-            waiting_for,
-        }
-    }
-
    pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated {
        self.0
            .try_lock()
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -11,6 +11,7 @@ use parquet::file::metadata::RowGroupMetaDataPtr;
 use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr};
 use parquet::file::writer::SerializedFileWriter;
 use parquet::record::RecordWriter;
+use pq_proto::StartupMessageParams;
 use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
 use serde::ser::SerializeMap;
 use tokio::sync::mpsc;
@@ -23,7 +24,6 @@ use super::{LOG_CHAN, RequestContextInner};
 use crate::config::remote_storage_from_toml;
 use crate::context::LOG_CHAN_DISCONNECT;
 use crate::ext::TaskExt;
-use crate::pqproto::StartupMessageParams;

 #[derive(clap::Args, Clone, Debug)]
 pub struct ParquetUploadArgs {
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -7,9 +7,7 @@ use std::time::Duration;

 use ::http::HeaderName;
 use ::http::header::AUTHORIZATION;
-use bytes::Bytes;
 use futures::TryFutureExt;
-use hyper::StatusCode;
 use postgres_client::config::SslMode;
 use tokio::time::Instant;
 use tracing::{Instrument, debug, info, info_span, warn};
@@ -17,6 +15,7 @@ use tracing::{Instrument, debug, info, info_span, warn};
 use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute};
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::AuthRule;
+use crate::cache::Cached;
 use crate::context::RequestContext;
 use crate::control_plane::caches::ApiCaches;
 use crate::control_plane::errors::{
@@ -25,12 +24,12 @@ use crate::control_plane::errors::{
 use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
 use crate::control_plane::{
-    AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
-    RoleAccessControl,
+    AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps,
+    CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo,
 };
-use crate::metrics::Metrics;
+use crate::metrics::{CacheOutcome, Metrics};
 use crate::rate_limiter::WakeComputeRateLimiter;
-use crate::types::{EndpointCacheKey, EndpointId, RoleName};
+use crate::types::{EndpointCacheKey, EndpointId};
 use crate::{compute, http, scram};

 pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
@@ -67,41 +66,66 @@ impl NeonControlPlaneClient {
        self.endpoint.url().as_str()
    }

-    async fn do_get_auth_req(
+    async fn do_get_auth_info(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
+        user_info: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
+        if !self
+            .caches
+            .endpoints_cache
+            .is_valid(ctx, &user_info.endpoint.normalize())
+        {
+            // TODO: refactor this. An invalid endpoint is a failure to
+            // authenticate, yet we return Ok with a default AuthInfo.
+            info!("endpoint is not valid, skipping the request");
+            return Ok(AuthInfo::default());
+        }
+        self.do_get_auth_req(user_info, &ctx.session_id(), Some(ctx))
+            .await
+    }
+
+    async fn do_get_auth_req(
+        &self,
+        user_info: &ComputeUserInfo,
+        session_id: &uuid::Uuid,
+        ctx: Option<&RequestContext>,
+    ) -> Result<AuthInfo, GetAuthInfoError> {
+        let request_id: String = session_id.to_string();
+        let application_name = if let Some(ctx) = ctx {
+            ctx.console_application_name()
+        } else {
+            "auth_cancellation".to_string()
+        };
+
        async {
-            let response = {
-                let request = self
-                    .endpoint
-                    .get_path("get_endpoint_access_control")
-                    .header(X_REQUEST_ID, ctx.session_id().to_string())
-                    .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
-                    .query(&[("session_id", ctx.session_id())])
-                    .query(&[
-                        ("application_name", ctx.console_application_name().as_str()),
-                        ("endpointish", endpoint.as_str()),
-                        ("role", role.as_str()),
-                    ])
-                    .build()?;
+            let request = self
+                .endpoint
+                .get_path("get_endpoint_access_control")
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", session_id)])
+                .query(&[
+                    ("application_name", application_name.as_str()),
+                    ("endpointish", user_info.endpoint.as_str()),
+                    ("role", user_info.user.as_str()),
+                ])
+                .build()?;

-                debug!(url = request.url().as_str(), "sending http request");
-                let start = Instant::now();
-                let _pause = ctx.latency_timer_pause_at(start, crate::metrics::Waiting::Cplane);
-                let response = self.endpoint.execute(request).await?;
-
-                info!(duration = ?start.elapsed(), "received http response");
-
-                response
+            debug!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let response = match ctx {
+                Some(ctx) => {
+                    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+                    let rsp = self.endpoint.execute(request).await;
+                    drop(pause);
+                    rsp?
+                }
+                None => self.endpoint.execute(request).await?,
            };

-            let body = match parse_body::<GetEndpointAccessControl>(
-                response.status(),
-                response.bytes().await?,
-            ) {
+            info!(duration = ?start.elapsed(), "received http response");
+            let body = match parse_body::<GetEndpointAccessControl>(response).await {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
                // TODO(anna): retry
@@ -156,7 +180,7 @@ impl NeonControlPlaneClient {
    async fn do_get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
        if !self
            .caches
@@ -192,10 +216,7 @@ impl NeonControlPlaneClient {
            drop(pause);
            info!(duration = ?start.elapsed(), "received http response");

-            let body = parse_body::<EndpointJwksResponse>(
-                response.status(),
-                response.bytes().await.map_err(ControlPlaneError::from)?,
-            )?;
+            let body = parse_body::<EndpointJwksResponse>(response).await?;

            let rules = body
                .jwks
@@ -247,7 +268,7 @@ impl NeonControlPlaneClient {
            let response = self.endpoint.execute(request).await?;
            drop(pause);
            info!(duration = ?start.elapsed(), "received http response");
-            let body = parse_body::<WakeCompute>(response.status(), response.bytes().await?)?;
+            let body = parse_body::<WakeCompute>(response).await?;

            // Unfortunately, ownership won't let us use `Option::ok_or` here.
            let (host, port) = match parse_host_port(&body.address) {
@@ -292,104 +313,225 @@ impl NeonControlPlaneClient {

 impl super::ControlPlaneApi for NeonControlPlaneClient {
    #[tracing::instrument(skip_all)]
-    async fn get_role_access_control(
+    async fn get_role_secret(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<RoleAccessControl, crate::control_plane::errors::GetAuthInfoError> {
-        let normalized_ep = &endpoint.normalize();
-        if let Some(secret) = self
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        let user = &user_info.user;
+        if let Some(role_secret) = self
            .caches
            .project_info
-            .get_role_secret(normalized_ep, role)
+            .get_role_secret(normalized_ep, user)
        {
-            return Ok(secret);
+            return Ok(role_secret);
        }
-
-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
-        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
-
-        let control = EndpointAccessControl {
-            allowed_ips: Arc::new(auth_info.allowed_ips),
-            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
-            flags: auth_info.access_blocker_flags,
-        };
-        let role_control = RoleAccessControl {
-            secret: auth_info.secret,
-        };
-
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let account_id = auth_info.account_id;
        if let Some(project_id) = auth_info.project_id {
            let normalized_ep_int = normalized_ep.into();
-
-            self.caches.project_info.insert_endpoint_access(
-                auth_info.account_id,
+            self.caches.project_info.insert_role_secret(
                project_id,
                normalized_ep_int,
-                role.into(),
-                control,
-                role_control.clone(),
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                Arc::new(auth_info.allowed_ips),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                Arc::new(auth_info.allowed_vpc_endpoint_ids),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                auth_info.access_blocker_flags,
            );
            ctx.set_project_id(project_id);
        }
-
-        Ok(role_control)
+        // When we just got a secret, we don't need to invalidate it.
+        Ok(Cached::new_uncached(auth_info.secret))
    }

-    #[tracing::instrument(skip_all)]
-    async fn get_endpoint_access_control(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<EndpointAccessControl, GetAuthInfoError> {
-        let normalized_ep = &endpoint.normalize();
-        if let Some(control) = self.caches.project_info.get_endpoint_access(normalized_ep) {
-            return Ok(control);
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
+            Metrics::get()
+                .proxy
+                .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats?
+                .inc(CacheOutcome::Hit);
+            return Ok(allowed_ips);
        }
-
-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
-        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
-
-        let control = EndpointAccessControl {
-            allowed_ips: Arc::new(auth_info.allowed_ips),
-            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
-            flags: auth_info.access_blocker_flags,
-        };
-        let role_control = RoleAccessControl {
-            secret: auth_info.secret,
-        };
-
+        Metrics::get()
+            .proxy
+            .allowed_ips_cache_misses
+            .inc(CacheOutcome::Miss);
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
+        let user = &user_info.user;
+        let account_id = auth_info.account_id;
        if let Some(project_id) = auth_info.project_id {
            let normalized_ep_int = normalized_ep.into();
-
-            self.caches.project_info.insert_endpoint_access(
-                auth_info.account_id,
+            self.caches.project_info.insert_role_secret(
                project_id,
                normalized_ep_int,
-                role.into(),
-                control.clone(),
-                role_control,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags,
            );
            ctx.set_project_id(project_id);
        }
+        Ok(Cached::new_uncached(allowed_ips))
+    }

-        Ok(control)
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(allowed_vpc_endpoint_ids) = self
+            .caches
+            .project_info
+            .get_allowed_vpc_endpoint_ids(normalized_ep)
+        {
+            Metrics::get()
+                .proxy
+                .vpc_endpoint_id_cache_stats
+                .inc(CacheOutcome::Hit);
+            return Ok(allowed_vpc_endpoint_ids);
+        }
+
+        Metrics::get()
+            .proxy
+            .vpc_endpoint_id_cache_stats
+            .inc(CacheOutcome::Miss);
+
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
+        let user = &user_info.user;
+        let account_id = auth_info.account_id;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags,
+            );
+            ctx.set_project_id(project_id);
+        }
+        Ok(Cached::new_uncached(allowed_vpc_endpoint_ids))
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(access_blocker_flags) = self
+            .caches
+            .project_info
+            .get_block_public_or_vpc_access(normalized_ep)
+        {
+            Metrics::get()
+                .proxy
+                .access_blocker_flags_cache_stats
+                .inc(CacheOutcome::Hit);
+            return Ok(access_blocker_flags);
+        }
+
+        Metrics::get()
+            .proxy
+            .access_blocker_flags_cache_stats
+            .inc(CacheOutcome::Miss);
+
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
+        let user = &user_info.user;
+        let account_id = auth_info.account_id;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags.clone(),
+            );
+            ctx.set_project_id(project_id);
+        }
+        Ok(Cached::new_uncached(access_blocker_flags))
    }

    #[tracing::instrument(skip_all)]
    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
        self.do_get_endpoint_jwks(ctx, endpoint).await
    }
@@ -498,33 +640,33 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 }

 /// Parse http response body, taking status code into account.
-fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    status: StatusCode,
-    body: Bytes,
+async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
+    response: http::Response,
 ) -> Result<T, ControlPlaneError> {
+    let status = response.status();
    if status.is_success() {
        // We shouldn't log raw body because it may contain secrets.
        info!("request succeeded, processing the body");
-        return Ok(serde_json::from_slice(&body).map_err(std::io::Error::other)?);
+        return Ok(response.json().await?);
    }
-
+    let s = response.bytes().await?;
    // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
-    info!("response_error plaintext: {:?}", body);
+    info!("response_error plaintext: {:?}", s);

    // Don't throw an error here because it's not as important
    // as the fact that the request itself has failed.
-    let mut body = serde_json::from_slice(&body).unwrap_or_else(|e| {
+    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
        warn!("failed to parse error body: {e}");
-        Box::new(ControlPlaneErrorMessage {
+        ControlPlaneErrorMessage {
            error: "reason unclear (malformed error message)".into(),
            http_status_code: status,
            status: None,
-        })
+        }
    });
    body.http_status_code = status;

    warn!("console responded with an error ({status}): {body:?}");
-    Err(ControlPlaneError::Message(body))
+    Err(ControlPlaneError::Message(Box::new(body)))
 }

 fn parse_host_port(input: &str) -> Option<(&str, u16)> {
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -15,14 +15,14 @@ use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::AuthRule;
 use crate::cache::Cached;
 use crate::context::RequestContext;
+use crate::control_plane::client::{
+    CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret,
+};
 use crate::control_plane::errors::{
    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::messages::MetricsAuxInfo;
-use crate::control_plane::{
-    AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
-    RoleAccessControl,
-};
+use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
 use crate::intern::RoleNameInt;
 use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
 use crate::url::ApiUrl;
@@ -66,8 +66,7 @@ impl MockControlPlane {

    async fn do_get_auth_info(
        &self,
-        endpoint: &EndpointId,
-        role: &RoleName,
+        user_info: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        let (secret, allowed_ips) = async {
            // Perhaps we could persist this connection, but then we'd have to
@@ -81,7 +80,7 @@ impl MockControlPlane {
            let secret = if let Some(entry) = get_execute_postgres_query(
                &client,
                "select rolpassword from pg_catalog.pg_authid where rolname = $1",
-                &[&role.as_str()],
+                &[&&*user_info.user],
                "rolpassword",
            )
            .await?
@@ -90,7 +89,7 @@ impl MockControlPlane {
                let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
                secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
            } else {
-                warn!("user '{role}' does not exist");
+                warn!("user '{}' does not exist", user_info.user);
                None
            };

@@ -98,7 +97,7 @@ impl MockControlPlane {
                match get_execute_postgres_query(
                    &client,
                    "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                    &[&endpoint.as_str()],
+                    &[&user_info.endpoint.as_str()],
                    "allowed_ips",
                )
                .await?
@@ -134,7 +133,7 @@ impl MockControlPlane {

    async fn do_get_endpoint_jwks(
        &self,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
        let (client, connection) =
            tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
@@ -223,36 +222,53 @@ async fn get_execute_postgres_query(
 }

 impl super::ControlPlaneApi for MockControlPlane {
-    async fn get_endpoint_access_control(
+    #[tracing::instrument(skip_all)]
+    async fn get_role_secret(
        &self,
        _ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<EndpointAccessControl, GetAuthInfoError> {
-        let info = self.do_get_auth_info(endpoint, role).await?;
-        Ok(EndpointAccessControl {
-            allowed_ips: Arc::new(info.allowed_ips),
-            allowed_vpce: Arc::new(info.allowed_vpc_endpoint_ids),
-            flags: info.access_blocker_flags,
-        })
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(user_info).await?.secret,
+        ))
    }

-    async fn get_role_access_control(
+    async fn get_allowed_ips(
        &self,
        _ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<RoleAccessControl, GetAuthInfoError> {
-        let info = self.do_get_auth_info(endpoint, role).await?;
-        Ok(RoleAccessControl {
-            secret: info.secret,
-        })
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+        Ok(Cached::new_uncached(Arc::new(
+            self.do_get_auth_info(user_info).await?.allowed_ips,
+        )))
+    }
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        _ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, super::errors::GetAuthInfoError> {
+        Ok(Cached::new_uncached(Arc::new(
+            self.do_get_auth_info(user_info)
+                .await?
+                .allowed_vpc_endpoint_ids,
+        )))
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        _ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<super::CachedAccessBlockerFlags, super::errors::GetAuthInfoError> {
+        Ok(Cached::new_uncached(
+            self.do_get_auth_info(user_info).await?.access_blocker_flags,
+        ))
    }

    async fn get_endpoint_jwks(
        &self,
        _ctx: &RequestContext,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
        self.do_get_endpoint_jwks(endpoint).await
    }
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -16,14 +16,15 @@ use crate::cache::endpoints::EndpointsCache;
 use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
 use crate::context::RequestContext;
-use crate::control_plane::{CachedNodeInfo, ControlPlaneApi, NodeInfoCache, errors};
+use crate::control_plane::{
+    CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo,
+    CachedRoleSecret, ControlPlaneApi, NodeInfoCache, errors,
+};
 use crate::error::ReportableError;
 use crate::metrics::ApiLockMetrics;
 use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
 use crate::types::EndpointId;

-use super::{EndpointAccessControl, RoleAccessControl};
-
 #[non_exhaustive]
 #[derive(Clone)]
 pub enum ControlPlaneClient {
@@ -39,42 +40,68 @@ pub enum ControlPlaneClient {
 }

 impl ControlPlaneApi for ControlPlaneClient {
-    async fn get_role_access_control(
+    async fn get_role_secret(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &crate::types::RoleName,
-    ) -> Result<RoleAccessControl, errors::GetAuthInfoError> {
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
        match self {
-            Self::ProxyV1(api) => api.get_role_access_control(ctx, endpoint, role).await,
+            Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Self::PostgresMock(api) => api.get_role_access_control(ctx, endpoint, role).await,
+            Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(test)]
-            Self::Test(_api) => {
+            Self::Test(_) => {
                unreachable!("this function should never be called in the test backend")
            }
        }
    }

-    async fn get_endpoint_access_control(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &crate::types::RoleName,
-    ) -> Result<EndpointAccessControl, errors::GetAuthInfoError> {
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
        match self {
-            Self::ProxyV1(api) => api.get_endpoint_access_control(ctx, endpoint, role).await,
+            Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Self::PostgresMock(api) => api.get_endpoint_access_control(ctx, endpoint, role).await,
+            Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await,
            #[cfg(test)]
-            Self::Test(api) => api.get_access_control(),
+            Self::Test(api) => api.get_allowed_ips(),
+        }
+    }
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError> {
+        match self {
+            Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await,
+            #[cfg(test)]
+            Self::Test(api) => api.get_allowed_vpc_endpoint_ids(),
+        }
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError> {
+        match self {
+            Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await,
+            #[cfg(test)]
+            Self::Test(api) => api.get_block_public_or_vpc_access(),
        }
    }

    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
        match self {
            Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
@@ -104,7 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient {
 pub(crate) trait TestControlPlaneClient: Send + Sync + 'static {
    fn wake_compute(&self) -> Result<CachedNodeInfo, errors::WakeComputeError>;

-    fn get_access_control(&self) -> Result<EndpointAccessControl, errors::GetAuthInfoError>;
+    fn get_allowed_ips(&self) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
+
+    fn get_allowed_vpc_endpoint_ids(
+        &self,
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError>;
+
+    fn get_block_public_or_vpc_access(
+        &self,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError>;

    fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient>;
 }
@@ -274,7 +309,7 @@ impl FetchAuthRules for ControlPlaneClient {
        ctx: &RequestContext,
        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
-        self.get_endpoint_jwks(ctx, &endpoint)
+        self.get_endpoint_jwks(ctx, endpoint)
            .await
            .map_err(FetchAuthRulesError::GetEndpointJwks)
    }
--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -99,10 +99,6 @@ pub(crate) enum GetAuthInfoError {

    #[error(transparent)]
    ApiError(ControlPlaneError),
-
-    /// Proxy does not know about the endpoint in advanced
-    #[error("endpoint not found in endpoint cache")]
-    UnknownEndpoint,
 }

 // This allows more useful interactions than `#[from]`.
@@ -119,8 +115,6 @@ impl UserFacingError for GetAuthInfoError {
            Self::BadSecret => REQUEST_FAILED.to_owned(),
            // However, API might return a meaningful error.
            Self::ApiError(e) => e.to_string_client(),
-            // pretend like control plane returned an error.
-            Self::UnknownEndpoint => REQUEST_FAILED.to_owned(),
        }
    }
 }
@@ -130,8 +124,6 @@ impl ReportableError for GetAuthInfoError {
        match self {
            Self::BadSecret => crate::error::ErrorKind::ControlPlane,
            Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
-            // we only apply endpoint filtering if control plane is under high load.
-            Self::UnknownEndpoint => crate::error::ErrorKind::ServiceRateLimit,
        }
    }
 }
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -11,16 +11,16 @@ pub(crate) mod errors;

 use std::sync::Arc;

+use crate::auth::IpPattern;
 use crate::auth::backend::jwt::AuthRule;
 use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
-use crate::auth::{AuthError, IpPattern, check_peer_addr_is_in_list};
+use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::cache::{Cached, TimedLru};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
 use crate::intern::{AccountIdInt, ProjectIdInt};
-use crate::protocol2::ConnectionInfoExtra;
-use crate::types::{EndpointCacheKey, EndpointId, RoleName};
+use crate::types::{EndpointCacheKey, EndpointId};
 use crate::{compute, scram};

 /// Various cache-related types.
@@ -101,7 +101,7 @@ impl NodeInfo {
    }
 }

-#[derive(Copy, Clone, Default)]
+#[derive(Clone, Default, Eq, PartialEq, Debug)]
 pub(crate) struct AccessBlockerFlags {
    pub public_access_blocked: bool,
    pub vpc_access_blocked: bool,
@@ -110,78 +110,47 @@ pub(crate) struct AccessBlockerFlags {
 pub(crate) type NodeInfoCache =
    TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneErrorMessage>>>;
 pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
-
-#[derive(Clone)]
-pub struct RoleAccessControl {
-    pub secret: Option<AuthSecret>,
-}
-
-#[derive(Clone)]
-pub struct EndpointAccessControl {
-    pub allowed_ips: Arc<Vec<IpPattern>>,
-    pub allowed_vpce: Arc<Vec<String>>,
-    pub flags: AccessBlockerFlags,
-}
-
-impl EndpointAccessControl {
-    pub fn check(
-        &self,
-        ctx: &RequestContext,
-        check_ip_allowed: bool,
-        check_vpc_allowed: bool,
-    ) -> Result<(), AuthError> {
-        if check_ip_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &self.allowed_ips) {
-            return Err(AuthError::IpAddressNotAllowed(ctx.peer_addr()));
-        }
-
-        // check if a VPC endpoint ID is coming in and if yes, if it's allowed
-        if check_vpc_allowed {
-            if self.flags.vpc_access_blocked {
-                return Err(AuthError::NetworkNotAllowed);
-            }
-
-            let incoming_vpc_endpoint_id = match ctx.extra() {
-                None => return Err(AuthError::MissingVPCEndpointId),
-                Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(),
-                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
-            };
-
-            let vpce = &self.allowed_vpce;
-            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
-            if !vpce.is_empty() && !vpce.contains(&incoming_vpc_endpoint_id) {
-                return Err(AuthError::vpc_endpoint_id_not_allowed(
-                    incoming_vpc_endpoint_id,
-                ));
-            }
-        } else if self.flags.public_access_blocked {
-            return Err(AuthError::NetworkNotAllowed);
-        }
-
-        Ok(())
-    }
-}
+pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
+pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
+pub(crate) type CachedAllowedVpcEndpointIds =
+    Cached<&'static ProjectInfoCacheImpl, Arc<Vec<String>>>;
+pub(crate) type CachedAccessBlockerFlags =
+    Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>;

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
 pub(crate) trait ControlPlaneApi {
-    async fn get_role_access_control(
+    /// Get the client's auth secret for authentication.
+    /// Returns an `Option` because the "user not found" case is special:
+    /// we still have to mock SCRAM to avoid leaking that the user doesn't exist.
+    async fn get_role_secret(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<RoleAccessControl, errors::GetAuthInfoError>;
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;

-    async fn get_endpoint_access_control(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        role: &RoleName,
-    ) -> Result<EndpointAccessControl, errors::GetAuthInfoError>;
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError>;
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError>;

    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
-        endpoint: &EndpointId,
+        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;

    /// Wake up the compute node and return the corresponding connection info.
--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -4,10 +4,9 @@

 pub mod health_server;

-use std::time::{Duration, Instant};
+use std::time::Duration;

 use bytes::Bytes;
-use futures::FutureExt;
 use http::Method;
 use http_body_util::BodyExt;
 use hyper::body::Body;
@@ -110,31 +109,15 @@ impl Endpoint {
    }

    /// Execute a [request](reqwest::Request).
-    pub(crate) fn execute(
-        &self,
-        request: Request,
-    ) -> impl Future<Output = Result<Response, Error>> {
-        let metric = Metrics::get()
+    pub(crate) async fn execute(&self, request: Request) -> Result<Response, Error> {
+        let _timer = Metrics::get()
            .proxy
            .console_request_latency
-            .with_labels(ConsoleRequest {
+            .start_timer(ConsoleRequest {
                request: request.url().path(),
            });

-        let req = self.client.execute(request).boxed();
-
-        async move {
-            let start = Instant::now();
-            scopeguard::defer!({
-                Metrics::get()
-                    .proxy
-                    .console_request_latency
-                    .get_metric(metric)
-                    .observe_duration_since(start);
-            });
-
-            req.await
-        }
+        self.client.execute(request).await
    }
 }

--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -92,7 +92,6 @@ mod logging;
 mod metrics;
 mod parse;
 mod pglb;
-mod pqproto;
 mod protocol2;
 mod proxy;
 mod rate_limiter;
--- a/proxy/src/pqproto.rs
+++ b/proxy/src/pqproto.rs
@@ -1,693 +0,0 @@
-//! Postgres protocol codec
-//!
-//! <https://www.postgresql.org/docs/current/protocol-message-formats.html>
-
-use std::fmt;
-use std::io::{self, Cursor};
-
-use bytes::{Buf, BufMut};
-use itertools::Itertools;
-use rand::distributions::{Distribution, Standard};
-use tokio::io::{AsyncRead, AsyncReadExt};
-use zerocopy::{FromBytes, Immutable, IntoBytes, big_endian};
-
-pub type ErrorCode = [u8; 5];
-
-pub const FE_PASSWORD_MESSAGE: u8 = b'p';
-
-pub const SQLSTATE_INTERNAL_ERROR: [u8; 5] = *b"XX000";
-
-/// The protocol version number.
-///
-/// The most significant 16 bits are the major version number (3 for the protocol described here).
-/// The least significant 16 bits are the minor version number (0 for the protocol described here).
-/// <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-STARTUPMESSAGE>
-#[derive(Clone, Copy, PartialEq, PartialOrd, FromBytes, IntoBytes, Immutable)]
-#[repr(C)]
-pub struct ProtocolVersion {
-    major: big_endian::U16,
-    minor: big_endian::U16,
-}
-
-impl ProtocolVersion {
-    pub const fn new(major: u16, minor: u16) -> Self {
-        Self {
-            major: big_endian::U16::new(major),
-            minor: big_endian::U16::new(minor),
-        }
-    }
-    pub const fn minor(self) -> u16 {
-        self.minor.get()
-    }
-    pub const fn major(self) -> u16 {
-        self.major.get()
-    }
-}
-
-impl fmt::Debug for ProtocolVersion {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list()
-            .entry(&self.major())
-            .entry(&self.minor())
-            .finish()
-    }
-}
-
-/// read the type from the stream using zerocopy.
-///
-/// not cancel safe.
-macro_rules! read {
-    ($s:expr => $t:ty) => {{
-        // cannot be implemented as a function due to lack of const-generic-expr
-        let mut buf = [0; size_of::<$t>()];
-        $s.read_exact(&mut buf).await?;
-        let res: $t = zerocopy::transmute!(buf);
-        res
-    }};
-}
-
-pub async fn read_startup<S>(stream: &mut S) -> io::Result<FeStartupPacket>
-where
-    S: AsyncRead + Unpin,
-{
-    /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
-    const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-    const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
-    /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
-    const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
-    /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
-    const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
-    /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
-    const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
-
-    /// This first reads the startup message header, is 8 bytes.
-    /// The first 4 bytes is a big-endian message length, and the next 4 bytes is a version number.
-    ///
-    /// The length value is inclusive of the header. For example,
-    /// an empty message will always have length 8.
-    #[derive(Clone, Copy, FromBytes, IntoBytes, Immutable)]
-    #[repr(C)]
-    struct StartupHeader {
-        len: big_endian::U32,
-        version: ProtocolVersion,
-    }
-
-    let header = read!(stream => StartupHeader);
-
-    // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
-    // First byte indicates standard SSL handshake message
-    // (It can't be a Postgres startup length because in network byte order
-    // that would be a startup packet hundreds of megabytes long)
-    if header.as_bytes()[0] == 0x16 {
-        return Ok(FeStartupPacket::SslRequest {
-            // The bytes we read for the header are actually part of a TLS ClientHello.
-            // In theory, if the ClientHello was < 8 bytes we would fail with EOF before we get here.
-            // In practice though, I see no world where a ClientHello is less than 8 bytes
-            // since it includes ephemeral keys etc.
-            direct: Some(zerocopy::transmute!(header)),
-        });
-    }
-
-    let Some(len) = (header.len.get() as usize).checked_sub(8) else {
-        return Err(io::Error::other(format!(
-            "invalid startup message length {}, must be at least 8.",
-            header.len,
-        )));
-    };
-
-    // TODO: add a histogram for startup packet lengths
-    if len > MAX_STARTUP_PACKET_LENGTH {
-        tracing::warn!("large startup message detected: {len} bytes");
-        return Err(io::Error::other(format!(
-            "invalid startup message length {len}"
-        )));
-    }
-
-    match header.version {
-        // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-CANCELREQUEST>
-        CANCEL_REQUEST_CODE => {
-            if len != 8 {
-                return Err(io::Error::other(
-                    "CancelRequest message is malformed, backend PID / secret key missing",
-                ));
-            }
-
-            Ok(FeStartupPacket::CancelRequest(
-                read!(stream => CancelKeyData),
-            ))
-        }
-        // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-SSLREQUEST>
-        NEGOTIATE_SSL_CODE => {
-            // Requested upgrade to SSL (aka TLS)
-            Ok(FeStartupPacket::SslRequest { direct: None })
-        }
-        NEGOTIATE_GSS_CODE => {
-            // Requested upgrade to GSSAPI
-            Ok(FeStartupPacket::GssEncRequest)
-        }
-        version if version.major() == RESERVED_INVALID_MAJOR_VERSION => Err(io::Error::other(
-            format!("Unrecognized request code {version:?}"),
-        )),
-        // StartupMessage
-        version => {
-            // The protocol version number is followed by one or more pairs of parameter name and value strings.
-            // A zero byte is required as a terminator after the last name/value pair.
-            // Parameters can appear in any order. user is required, others are optional.
-
-            let mut buf = vec![0; len];
-            stream.read_exact(&mut buf).await?;
-
-            if buf.pop() != Some(b'\0') {
-                return Err(io::Error::other(
-                    "StartupMessage params: missing null terminator",
-                ));
-            }
-
-            // TODO: Don't do this.
-            // There's no guarantee that these messages are utf8,
-            // but they usually happen to be simple ascii.
-            let params = String::from_utf8(buf)
-                .map_err(|_| io::Error::other("StartupMessage params: invalid utf-8"))?;
-
-            Ok(FeStartupPacket::StartupMessage {
-                version,
-                params: StartupMessageParams { params },
-            })
-        }
-    }
-}
-
-/// Read a raw postgres packet, which will respect the max length requested.
-///
-/// This returns the message tag, as well as the message body. The message
-/// body is written into `buf`, and it is otherwise completely overwritten.
-///
-/// This is not cancel safe.
-pub async fn read_message<'a, S>(
-    stream: &mut S,
-    buf: &'a mut Vec<u8>,
-    max: u32,
-) -> io::Result<(u8, &'a mut [u8])>
-where
-    S: AsyncRead + Unpin,
-{
-    /// This first reads the header, which for regular messages in the 3.0 protocol is 5 bytes.
-    /// The first byte is a message tag, and the next 4 bytes is a big-endian length.
-    ///
-    /// Awkwardly, the length value is inclusive of itself, but not of the tag. For example,
-    /// an empty message will always have length 4.
-    #[derive(Clone, Copy, FromBytes)]
-    #[repr(C)]
-    struct Header {
-        tag: u8,
-        len: big_endian::U32,
-    }
-
-    let header = read!(stream => Header);
-
-    // as described above, the length must be at least 4.
-    let Some(len) = header.len.get().checked_sub(4) else {
-        return Err(io::Error::other(format!(
-            "invalid startup message length {}, must be at least 4.",
-            header.len,
-        )));
-    };
-
-    // TODO: add a histogram for message lengths
-
-    // check if the message exceeds our desired max.
-    if len > max {
-        tracing::warn!("large postgres message detected: {len} bytes");
-        return Err(io::Error::other(format!("invalid message length {len}")));
-    }
-
-    // read in our entire message.
-    buf.resize(len as usize, 0);
-    stream.read_exact(buf).await?;
-
-    Ok((header.tag, buf))
-}
-
-pub struct WriteBuf(Cursor<Vec<u8>>);
-
-impl Buf for WriteBuf {
-    #[inline]
-    fn remaining(&self) -> usize {
-        self.0.remaining()
-    }
-
-    #[inline]
-    fn chunk(&self) -> &[u8] {
-        self.0.chunk()
-    }
-
-    #[inline]
-    fn advance(&mut self, cnt: usize) {
-        self.0.advance(cnt);
-    }
-}
-
-impl WriteBuf {
-    pub const fn new() -> Self {
-        Self(Cursor::new(Vec::new()))
-    }
-
-    /// Use a heuristic to determine if we should shrink the write buffer.
-    #[inline]
-    fn should_shrink(&self) -> bool {
-        let n = self.0.position() as usize;
-        let len = self.0.get_ref().len();
-
-        // the unused space at the front of our buffer is 2x the size of our filled portion.
-        n + n > len
-    }
-
-    /// Shrink the write buffer so that subsequent writes have more spare capacity.
-    #[cold]
-    fn shrink(&mut self) {
-        let n = self.0.position() as usize;
-        let buf = self.0.get_mut();
-
-        // buf repr:
-        // [----unused------|-----filled-----|-----uninit-----]
-        //                  ^ n              ^ buf.len()      ^ buf.capacity()
-        let filled = n..buf.len();
-        let filled_len = filled.len();
-        buf.copy_within(filled, 0);
-        buf.truncate(filled_len);
-        self.0.set_position(0);
-    }
-
-    /// clear the write buffer.
-    pub fn reset(&mut self) {
-        let buf = self.0.get_mut();
-        buf.clear();
-        self.0.set_position(0);
-    }
-
-    /// Write a raw message to the internal buffer.
-    ///
-    /// The size_hint value is only a hint for reserving space. It's ok if it's incorrect, since
-    /// we calculate the length after the fact.
-    pub fn write_raw(&mut self, size_hint: usize, tag: u8, f: impl FnOnce(&mut Vec<u8>)) {
-        if self.should_shrink() {
-            self.shrink();
-        }
-
-        let buf = self.0.get_mut();
-        buf.reserve(5 + size_hint);
-
-        buf.push(tag);
-        let start = buf.len();
-        buf.extend_from_slice(&[0, 0, 0, 0]);
-
-        f(buf);
-
-        let end = buf.len();
-        let len = (end - start) as u32;
-        buf[start..start + 4].copy_from_slice(&len.to_be_bytes());
-    }
-
-    /// Write an encryption response message.
-    pub fn encryption(&mut self, m: u8) {
-        self.0.get_mut().push(m);
-    }
-
-    pub fn write_error(&mut self, msg: &str, error_code: ErrorCode) {
-        self.shrink();
-
-        // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-ERRORRESPONSE>
-        // <https://www.postgresql.org/docs/current/protocol-error-fields.html>
-        // "SERROR\0CXXXXX\0M\0\0".len() == 17
-        self.write_raw(17 + msg.len(), b'E', |buf| {
-            // Severity: ERROR
-            buf.put_slice(b"SERROR\0");
-
-            // Code: error_code
-            buf.put_u8(b'C');
-            buf.put_slice(&error_code);
-            buf.put_u8(0);
-
-            // Message: msg
-            buf.put_u8(b'M');
-            buf.put_slice(msg.as_bytes());
-            buf.put_u8(0);
-
-            // End.
-            buf.put_u8(0);
-        });
-    }
-}
-
-#[derive(Debug)]
-pub enum FeStartupPacket {
-    CancelRequest(CancelKeyData),
-    SslRequest {
-        direct: Option<[u8; 8]>,
-    },
-    GssEncRequest,
-    StartupMessage {
-        version: ProtocolVersion,
-        params: StartupMessageParams,
-    },
-}
-
-#[derive(Debug, Clone, Default)]
-pub struct StartupMessageParams {
-    pub params: String,
-}
-
-impl StartupMessageParams {
-    /// Get parameter's value by its name.
-    pub fn get(&self, name: &str) -> Option<&str> {
-        self.iter().find_map(|(k, v)| (k == name).then_some(v))
-    }
-
-    /// Split command-line options according to PostgreSQL's logic,
-    /// taking into account all escape sequences but leaving them as-is.
-    /// [`None`] means that there's no `options` in [`Self`].
-    pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
-        self.get("options").map(Self::parse_options_raw)
-    }
-
-    /// Split command-line options according to PostgreSQL's logic,
-    /// taking into account all escape sequences but leaving them as-is.
-    pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
-        // See `postgres: pg_split_opts`.
-        let mut last_was_escape = false;
-        input
-            .split(move |c: char| {
-                // We split by non-escaped whitespace symbols.
-                let should_split = c.is_ascii_whitespace() && !last_was_escape;
-                last_was_escape = c == '\\' && !last_was_escape;
-                should_split
-            })
-            .filter(|s| !s.is_empty())
-    }
-
-    /// Iterate through key-value pairs in an arbitrary order.
-    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
-        self.params.split_terminator('\0').tuples()
-    }
-
-    // This function is mostly useful in tests.
-    #[cfg(test)]
-    pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
-        let mut b = Self {
-            params: String::new(),
-        };
-        for (k, v) in pairs {
-            b.insert(k, v);
-        }
-        b
-    }
-
-    /// Set parameter's value by its name.
-    /// name and value must not contain a \0 byte
-    pub fn insert(&mut self, name: &str, value: &str) {
-        self.params.reserve(name.len() + value.len() + 2);
-        self.params.push_str(name);
-        self.params.push('\0');
-        self.params.push_str(value);
-        self.params.push('\0');
-    }
-}
-
-/// Cancel keys usually are represented as PID+SecretKey, but to proxy they're just
-/// opaque bytes.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, FromBytes, IntoBytes, Immutable)]
-pub struct CancelKeyData(pub big_endian::U64);
-
-pub fn id_to_cancel_key(id: u64) -> CancelKeyData {
-    CancelKeyData(big_endian::U64::new(id))
-}
-
-impl fmt::Display for CancelKeyData {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let id = self.0;
-        f.debug_tuple("CancelKeyData")
-            .field(&format_args!("{id:x}"))
-            .finish()
-    }
-}
-impl Distribution<CancelKeyData> for Standard {
-    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
-        id_to_cancel_key(rng.r#gen())
-    }
-}
-
-pub enum BeMessage<'a> {
-    AuthenticationOk,
-    AuthenticationSasl(BeAuthenticationSaslMessage<'a>),
-    AuthenticationCleartextPassword,
-    BackendKeyData(CancelKeyData),
-    ParameterStatus {
-        name: &'a [u8],
-        value: &'a [u8],
-    },
-    ReadyForQuery,
-    NoticeResponse(&'a str),
-    NegotiateProtocolVersion {
-        version: ProtocolVersion,
-        options: &'a [&'a str],
-    },
-}
-
-#[derive(Debug)]
-pub enum BeAuthenticationSaslMessage<'a> {
-    Methods(&'a [&'a str]),
-    Continue(&'a [u8]),
-    Final(&'a [u8]),
-}
-
-impl BeMessage<'_> {
-    /// Write the message into an internal buffer
-    pub fn write_message(self, buf: &mut WriteBuf) {
-        match self {
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-AUTHENTICATIONCLEARTEXTPASSWORD>
-            BeMessage::AuthenticationOk => {
-                buf.write_raw(1, b'R', |buf| buf.put_i32(0));
-            }
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-AUTHENTICATIONCLEARTEXTPASSWORD>
-            BeMessage::AuthenticationCleartextPassword => {
-                buf.write_raw(1, b'R', |buf| buf.put_i32(3));
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-AUTHENTICATIONSASL>
-            BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(methods)) => {
-                let len: usize = methods.iter().map(|m| m.len() + 1).sum();
-                buf.write_raw(len + 2, b'R', |buf| {
-                    buf.put_i32(10); // Specifies that SASL auth method is used.
-                    for method in methods {
-                        buf.put_slice(method.as_bytes());
-                        buf.put_u8(0);
-                    }
-                    buf.put_u8(0); // zero terminator for the list
-                });
-            }
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-AUTHENTICATIONSASL>
-            BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Continue(extra)) => {
-                buf.write_raw(extra.len() + 1, b'R', |buf| {
-                    buf.put_i32(11); // Continue SASL auth.
-                    buf.put_slice(extra);
-                });
-            }
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-AUTHENTICATIONSASL>
-            BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Final(extra)) => {
-                buf.write_raw(extra.len() + 1, b'R', |buf| {
-                    buf.put_i32(12); // Send final SASL message.
-                    buf.put_slice(extra);
-                });
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-BACKENDKEYDATA>
-            BeMessage::BackendKeyData(key_data) => {
-                buf.write_raw(8, b'K', |buf| buf.put_slice(key_data.as_bytes()));
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-NOTICERESPONSE>
-            // <https://www.postgresql.org/docs/current/protocol-error-fields.html>
-            BeMessage::NoticeResponse(msg) => {
-                // 'N' signalizes NoticeResponse messages
-                buf.write_raw(18 + msg.len(), b'N', |buf| {
-                    // Severity: NOTICE
-                    buf.put_slice(b"SNOTICE\0");
-
-                    // Code: XX000 (ignored for notice, but still required)
-                    buf.put_slice(b"CXX000\0");
-
-                    // Message: msg
-                    buf.put_u8(b'M');
-                    buf.put_slice(msg.as_bytes());
-                    buf.put_u8(0);
-
-                    // End notice.
-                    buf.put_u8(0);
-                });
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-PARAMETERSTATUS>
-            BeMessage::ParameterStatus { name, value } => {
-                buf.write_raw(name.len() + value.len() + 2, b'S', |buf| {
-                    buf.put_slice(name.as_bytes());
-                    buf.put_u8(0);
-                    buf.put_slice(value.as_bytes());
-                    buf.put_u8(0);
-                });
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-NEGOTIATEPROTOCOLVERSION>
-            BeMessage::ReadyForQuery => {
-                buf.write_raw(1, b'Z', |buf| buf.put_u8(b'I'));
-            }
-
-            // <https://www.postgresql.org/docs/current/protocol-message-formats.html#PROTOCOL-MESSAGE-FORMATS-NEGOTIATEPROTOCOLVERSION>
-            BeMessage::NegotiateProtocolVersion { version, options } => {
-                let len: usize = options.iter().map(|o| o.len() + 1).sum();
-                buf.write_raw(8 + len, b'v', |buf| {
-                    buf.put_slice(version.as_bytes());
-                    buf.put_u32(options.len() as u32);
-                    for option in options {
-                        buf.put_slice(option.as_bytes());
-                        buf.put_u8(0);
-                    }
-                });
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io::Cursor;
-
-    use tokio::io::{AsyncWriteExt, duplex};
-    use zerocopy::IntoBytes;
-
-    use crate::pqproto::{FeStartupPacket, read_message, read_startup};
-
-    use super::ProtocolVersion;
-
-    #[tokio::test]
-    async fn reject_large_startup() {
-        // we're going to define a v3.0 startup message with far too many parameters.
-        let mut payload = vec![];
-        // 10001 + 8 bytes.
-        payload.extend_from_slice(&10009_u32.to_be_bytes());
-        payload.extend_from_slice(ProtocolVersion::new(3, 0).as_bytes());
-        payload.resize(10009, b'a');
-
-        let (mut server, mut client) = duplex(128);
-        #[rustfmt::skip]
-        let (server, client) = tokio::join!(
-            async move { read_startup(&mut server).await.unwrap_err() },
-            async move { client.write_all(&payload).await.unwrap_err() },
-        );
-
-        assert_eq!(server.to_string(), "invalid startup message length 10001");
-        assert_eq!(client.to_string(), "broken pipe");
-    }
-
-    #[tokio::test]
-    async fn reject_large_password() {
-        // we're going to define a password message that is far too long.
-        let mut payload = vec![];
-        payload.push(b'p');
-        payload.extend_from_slice(&517_u32.to_be_bytes());
-        payload.resize(518, b'a');
-
-        let (mut server, mut client) = duplex(128);
-        #[rustfmt::skip]
-        let (server, client) = tokio::join!(
-            async move { read_message(&mut server, &mut vec![], 512).await.unwrap_err() },
-            async move { client.write_all(&payload).await.unwrap_err() },
-        );
-
-        assert_eq!(server.to_string(), "invalid message length 513");
-        assert_eq!(client.to_string(), "broken pipe");
-    }
-
-    #[tokio::test]
-    async fn read_startup_message() {
-        let mut payload = vec![];
-        payload.extend_from_slice(&17_u32.to_be_bytes());
-        payload.extend_from_slice(ProtocolVersion::new(3, 0).as_bytes());
-        payload.extend_from_slice(b"abc\0def\0\0");
-
-        let startup = read_startup(&mut Cursor::new(&payload)).await.unwrap();
-        let FeStartupPacket::StartupMessage { version, params } = startup else {
-            panic!("unexpected startup message: {startup:?}");
-        };
-
-        assert_eq!(version.major(), 3);
-        assert_eq!(version.minor(), 0);
-        assert_eq!(params.params, "abc\0def\0");
-    }
-
-    #[tokio::test]
-    async fn read_ssl_message() {
-        let mut payload = vec![];
-        payload.extend_from_slice(&8_u32.to_be_bytes());
-        payload.extend_from_slice(ProtocolVersion::new(1234, 5679).as_bytes());
-
-        let startup = read_startup(&mut Cursor::new(&payload)).await.unwrap();
-        let FeStartupPacket::SslRequest { direct: None } = startup else {
-            panic!("unexpected startup message: {startup:?}");
-        };
-    }
-
-    #[tokio::test]
-    async fn read_tls_message() {
-        // sample client hello taken from <https://tls13.xargs.org/#client-hello>
-        let client_hello = [
-            0x16, 0x03, 0x01, 0x00, 0xf8, 0x01, 0x00, 0x00, 0xf4, 0x03, 0x03, 0x00, 0x01, 0x02,
-            0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
-            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e,
-            0x1f, 0x20, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
-            0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
-            0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x08, 0x13, 0x02, 0x13, 0x03, 0x13, 0x01,
-            0x00, 0xff, 0x01, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x18, 0x00, 0x16, 0x00, 0x00,
-            0x13, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x75, 0x6c, 0x66, 0x68, 0x65,
-            0x69, 0x6d, 0x2e, 0x6e, 0x65, 0x74, 0x00, 0x0b, 0x00, 0x04, 0x03, 0x00, 0x01, 0x02,
-            0x00, 0x0a, 0x00, 0x16, 0x00, 0x14, 0x00, 0x1d, 0x00, 0x17, 0x00, 0x1e, 0x00, 0x19,
-            0x00, 0x18, 0x01, 0x00, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03, 0x01, 0x04, 0x00, 0x23,
-            0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x1e,
-            0x00, 0x1c, 0x04, 0x03, 0x05, 0x03, 0x06, 0x03, 0x08, 0x07, 0x08, 0x08, 0x08, 0x09,
-            0x08, 0x0a, 0x08, 0x0b, 0x08, 0x04, 0x08, 0x05, 0x08, 0x06, 0x04, 0x01, 0x05, 0x01,
-            0x06, 0x01, 0x00, 0x2b, 0x00, 0x03, 0x02, 0x03, 0x04, 0x00, 0x2d, 0x00, 0x02, 0x01,
-            0x01, 0x00, 0x33, 0x00, 0x26, 0x00, 0x24, 0x00, 0x1d, 0x00, 0x20, 0x35, 0x80, 0x72,
-            0xd6, 0x36, 0x58, 0x80, 0xd1, 0xae, 0xea, 0x32, 0x9a, 0xdf, 0x91, 0x21, 0x38, 0x38,
-            0x51, 0xed, 0x21, 0xa2, 0x8e, 0x3b, 0x75, 0xe9, 0x65, 0xd0, 0xd2, 0xcd, 0x16, 0x62,
-            0x54,
-        ];
-
-        let mut cursor = Cursor::new(&client_hello);
-
-        let startup = read_startup(&mut cursor).await.unwrap();
-        let FeStartupPacket::SslRequest {
-            direct: Some(prefix),
-        } = startup
-        else {
-            panic!("unexpected startup message: {startup:?}");
-        };
-
-        // check that no data is lost.
-        assert_eq!(prefix, [0x16, 0x03, 0x01, 0x00, 0xf8, 0x01, 0x00, 0x00]);
-        assert_eq!(cursor.position(), 8);
-    }
-
-    #[tokio::test]
-    async fn read_message_success() {
-        let query = b"Q\0\0\0\x0cSELECT 1Q\0\0\0\x0cSELECT 2";
-        let mut cursor = Cursor::new(&query);
-
-        let mut buf = vec![];
-        let (tag, message) = read_message(&mut cursor, &mut buf, 100).await.unwrap();
-        assert_eq!(tag, b'Q');
-        assert_eq!(message, b"SELECT 1");
-
-        let (tag, message) = read_message(&mut cursor, &mut buf, 100).await.unwrap();
-        assert_eq!(tag, b'Q');
-        assert_eq!(message, b"SELECT 2");
-    }
-}
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -1,4 +1,5 @@
 use async_trait::async_trait;
+use pq_proto::StartupMessageParams;
 use tokio::time;
 use tracing::{debug, info, warn};

@@ -14,7 +15,6 @@ use crate::error::ReportableError;
 use crate::metrics::{
    ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType,
 };
-use crate::pqproto::StartupMessageParams;
 use crate::proxy::retry::{CouldRetry, retry_after, should_retry};
 use crate::proxy::wake_compute::wake_compute;
 use crate::types::Host;
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -1,4 +1,8 @@
-use futures::{FutureExt, TryFutureExt};
+use bytes::Buf;
+use pq_proto::framed::Framed;
+use pq_proto::{
+    BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams,
+};
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info, warn};
@@ -8,10 +12,7 @@ use crate::config::TlsConfig;
 use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::Metrics;
-use crate::pqproto::{
-    BeMessage, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams,
-};
-use crate::proxy::TlsRequired;
+use crate::proxy::ERR_INSECURE_CONNECTION;
 use crate::stream::{PqStream, Stream, StreamUpgradeError};
 use crate::tls::PG_ALPN_PROTOCOL;

@@ -58,7 +59,7 @@ pub(crate) enum HandshakeData<S> {
 /// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
 /// we also take an extra care of propagating only the select handshake errors to client.
 #[tracing::instrument(skip_all)]
-pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
+pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    ctx: &RequestContext,
    stream: S,
    mut tls: Option<&TlsConfig>,
@@ -70,25 +71,33 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
    const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0);
    const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0);

-    let (mut stream, mut msg) = PqStream::parse_startup(Stream::from_raw(stream)).await?;
+    let mut stream = PqStream::new(Stream::from_raw(stream));
    loop {
+        let msg = stream.read_startup_packet().await?;
        match msg {
            FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
                Stream::Raw { .. } if !tried_ssl => {
                    tried_ssl = true;

+                    // We can't perform TLS handshake without a config
+                    let have_tls = tls.is_some();
+                    if !direct {
+                        stream
+                            .write_message(&Be::EncryptionResponse(have_tls))
+                            .await?;
+                    } else if !have_tls {
+                        return Err(HandshakeError::ProtocolViolation);
+                    }
+
                    if let Some(tls) = tls.take() {
                        // Upgrade raw stream into a secure TLS-backed stream.
                        // NOTE: We've consumed `tls`; this fact will be used later.

-                        let mut read_buf;
-                        let raw = if let Some(direct) = &direct {
-                            read_buf = &direct[..];
-                            stream.accept_direct_tls()
-                        } else {
-                            read_buf = &[];
-                            stream.accept_tls().await?
-                        };
+                        let Framed {
+                            stream: raw,
+                            read_buf,
+                            write_buf,
+                        } = stream.framed;

                        let Stream::Raw { raw } = raw else {
                            return Err(HandshakeError::StreamUpgradeError(
@@ -96,11 +105,12 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                            ));
                        };

+                        let mut read_buf = read_buf.reader();
                        let mut res = Ok(());
                        let accept = tokio_rustls::TlsAcceptor::from(tls.pg_config.clone())
                            .accept_with(raw, |session| {
                                // push the early data to the tls session
-                                while !read_buf.is_empty() {
+                                while !read_buf.get_ref().is_empty() {
                                    match session.read_tls(&mut read_buf) {
                                        Ok(_) => {}
                                        Err(e) => {
@@ -109,12 +119,11 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                                        }
                                    }
                                }
-                            })
-                            .map_ok(Box::new)
-                            .boxed();
+                            });

                        res?;

+                        let read_buf = read_buf.into_inner();
                        if !read_buf.is_empty() {
                            return Err(HandshakeError::EarlyData);
                        }
@@ -148,17 +157,16 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                        let (_, tls_server_end_point) =
                            tls.cert_resolver.resolve(conn_info.server_name());

-                        let tls = Stream::Tls {
-                            tls: tls_stream,
-                            tls_server_end_point,
+                        stream = PqStream {
+                            framed: Framed {
+                                stream: Stream::Tls {
+                                    tls: Box::new(tls_stream),
+                                    tls_server_end_point,
+                                },
+                                read_buf,
+                                write_buf,
+                            },
                        };
-                        (stream, msg) = PqStream::parse_startup(tls).await?;
-                    } else {
-                        if direct.is_some() {
-                            // client sent us a ClientHello already, we can't do anything with it.
-                            return Err(HandshakeError::ProtocolViolation);
-                        }
-                        msg = stream.reject_encryption().await?;
                    }
                }
                _ => return Err(HandshakeError::ProtocolViolation),
@@ -168,7 +176,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                    tried_gss = true;

                    // Currently, we don't support GSSAPI
-                    msg = stream.reject_encryption().await?;
+                    stream.write_message(&Be::EncryptionResponse(false)).await?;
                }
                _ => return Err(HandshakeError::ProtocolViolation),
            },
@@ -178,7 +186,13 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                // Check that the config has been consumed during upgrade
                // OR we didn't provide it at all (for dev purposes).
                if tls.is_some() {
-                    Err(stream.throw_error(TlsRequired, None).await)?;
+                    return stream
+                        .throw_error_str(
+                            ERR_INSECURE_CONNECTION,
+                            crate::error::ErrorKind::User,
+                            None,
+                        )
+                        .await?;
                }

                // This log highlights the start of the connection.
@@ -200,21 +214,20 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                // no protocol extensions are supported.
                // <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
                let mut unsupported = vec![];
-                let mut supported = StartupMessageParams::default();
-
-                for (k, v) in params.iter() {
+                for (k, _) in params.iter() {
                    if k.starts_with("_pq_.") {
                        unsupported.push(k);
-                    } else {
-                        supported.insert(k, v);
                    }
                }

-                stream.write_message(BeMessage::NegotiateProtocolVersion {
-                    version: PG_PROTOCOL_LATEST,
-                    options: &unsupported,
-                });
-                stream.flush().await?;
+                // TODO: remove unsupported options so we don't send them to compute.
+
+                stream
+                    .write_message(&Be::NegotiateProtocolVersion {
+                        version: PG_PROTOCOL_LATEST,
+                        options: &unsupported,
+                    })
+                    .await?;

                info!(
                    ?version,
@@ -222,7 +235,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin + Send>(
                    session_type = "normal",
                    "successful handshake; unsupported minor version requested"
                );
-                break Ok(HandshakeData::Startup(stream, supported));
+                break Ok(HandshakeData::Startup(stream, params));
            }
            FeStartupPacket::StartupMessage { version, params } => {
                warn!(
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -10,14 +10,15 @@ pub(crate) mod wake_compute;
 use std::sync::Arc;

 pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
-use futures::FutureExt;
+use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
+use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams};
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use smol_str::{SmolStr, ToSmolStr, format_smolstr};
 use thiserror::Error;
-use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, warn};

@@ -26,9 +27,8 @@ use self::passthrough::ProxyPassthrough;
 use crate::cancellation::{self, CancellationHandler};
 use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
 use crate::context::RequestContext;
-use crate::error::{ReportableError, UserFacingError};
+use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
-use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
 use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
 use crate::proxy::handshake::{HandshakeData, handshake};
 use crate::rate_limiter::EndpointRateLimiter;
@@ -38,18 +38,6 @@ use crate::{auth, compute};

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";

-#[derive(Error, Debug)]
-#[error("{ERR_INSECURE_CONNECTION}")]
-pub struct TlsRequired;
-
-impl ReportableError for TlsRequired {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        crate::error::ErrorKind::User
-    }
-}
-
-impl UserFacingError for TlsRequired {}
-
 pub async fn run_until_cancelled<F: std::future::Future>(
    f: F,
    cancellation_token: &CancellationToken,
@@ -270,7 +258,7 @@ impl ReportableError for ClientRequestError {
 }

 #[allow(clippy::too_many_arguments)]
-pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
+pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    auth_backend: &'static auth::Backend<'static, ()>,
    ctx: &RequestContext,
@@ -341,11 +329,11 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(

    let user_info = match result {
        Ok(user_info) => user_info,
-        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
+        Err(e) => stream.throw_error(e, Some(ctx)).await?,
    };

    let user = user_info.get_user().to_owned();
-    let user_info = match user_info
+    let (user_info, _ip_allowlist) = match user_info
        .authenticate(
            ctx,
            &mut stream,
@@ -361,10 +349,10 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
            let app = params.get("application_name");
            let params_span = tracing::info_span!("", ?user, ?db, ?app);

-            return Err(stream
+            return stream
                .throw_error(e, Some(ctx))
                .instrument(params_span)
-                .await)?;
+                .await?;
        }
    };

@@ -377,7 +365,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        .get(NeonOptions::PARAMS_COMPAT)
        .is_some();

-    let res = connect_to_compute(
+    let mut node = connect_to_compute(
        ctx,
        &TcpMechanism {
            user_info: compute_user_info.clone(),
@@ -389,19 +377,22 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
-    .await;
-
-    let node = match res {
-        Ok(node) => node,
-        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
-    };
+    .or_else(|e| stream.throw_error(e, Some(ctx)))
+    .await?;

    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

    session.write_cancel_key(node.cancel_closure.clone())?;
-    prepare_client_connection(&node, *session.key(), &mut stream);
-    let stream = stream.flush_and_into_inner().await?;
+
+    prepare_client_connection(&node, *session.key(), &mut stream).await?;
+
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;

    let private_link_id = match ctx.extra() {
        Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()),
@@ -422,28 +413,31 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
-pub(crate) fn prepare_client_connection(
+#[tracing::instrument(skip_all)]
+pub(crate) async fn prepare_client_connection(
    node: &compute::PostgresConnection,
    cancel_key_data: CancelKeyData,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) {
+) -> Result<(), std::io::Error> {
    // Forward all deferred notices to the client.
    for notice in &node.delayed_notice {
-        stream.write_raw(notice.as_bytes().len(), b'N', |buf| {
-            buf.extend_from_slice(notice.as_bytes());
-        });
+        stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?;
    }

    // Forward all postgres connection params to the client.
    for (name, value) in &node.params {
-        stream.write_message(BeMessage::ParameterStatus {
+        stream.write_message_noflush(&Be::ParameterStatus {
            name: name.as_bytes(),
            value: value.as_bytes(),
-        });
+        })?;
    }

-    stream.write_message(BeMessage::BackendKeyData(cancel_key_data));
-    stream.write_message(BeMessage::ReadyForQuery);
+    stream
+        .write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
+        .write_message(&Be::ReadyForQuery)
+        .await?;
+
+    Ok(())
 }

 #[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,4 +1,3 @@
-use futures::FutureExt;
 use smol_str::SmolStr;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::debug;
@@ -90,7 +89,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
            .compute
            .cancel_closure
            .try_cancel_query(compute_config)
-            .boxed()
            .await
        {
            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -125,9 +125,8 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati

 #[cfg(test)]
 mod tests {
-    use postgres_client::error::{DbError, SqlState};
-
    use super::ShouldRetryWakeCompute;
+    use postgres_client::error::{DbError, SqlState};

    #[test]
    fn should_retry_wake_compute_for_db_error() {
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -10,7 +10,7 @@ use bytes::{Bytes, BytesMut};
 use futures::{SinkExt, StreamExt};
 use postgres_client::tls::TlsConnect;
 use postgres_protocol::message::frontend;
-use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream};
+use tokio::io::{AsyncReadExt, DuplexStream};
 use tokio_util::codec::{Decoder, Encoder};

 use super::*;
@@ -49,14 +49,15 @@ async fn proxy_mitm(
        };

        let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame);
-        let end_client = end_client.flush_and_into_inner().await.unwrap();
+        let (end_client, buf) = end_client.framed.into_inner();
+        assert!(buf.is_empty());
        let mut end_client = tokio_util::codec::Framed::new(end_client, PgFrame);

        // give the end_server the startup parameters
        let mut buf = BytesMut::new();
        frontend::startup_message(
            &postgres_protocol::message::frontend::StartupMessageParams {
-                params: startup.params.as_bytes().into(),
+                params: startup.params.into(),
            },
            &mut buf,
        )
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -26,7 +26,9 @@ use crate::auth::backend::{
 use crate::config::{ComputeConfig, RetryConfig};
 use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
 use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
-use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache};
+use crate::control_plane::{
+    self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache,
+};
 use crate::error::ErrorKind;
 use crate::tls::client_config::compute_client_config_with_certs;
 use crate::tls::postgres_rustls::MakeRustlsConnect;
@@ -126,7 +128,7 @@ trait TestAuth: Sized {
        self,
        stream: &mut PqStream<Stream<S>>,
    ) -> anyhow::Result<()> {
-        stream.write_message(BeMessage::AuthenticationOk);
+        stream.write_message_noflush(&Be::AuthenticationOk)?;
        Ok(())
    }
 }
@@ -155,7 +157,9 @@ impl TestAuth for Scram {
        self,
        stream: &mut PqStream<Stream<S>>,
    ) -> anyhow::Result<()> {
-        let outcome = auth::AuthFlow::new(stream, auth::Scram(&self.0, &RequestContext::test()))
+        let outcome = auth::AuthFlow::new(stream)
+            .begin(auth::Scram(&self.0, &RequestContext::test()))
+            .await?
            .authenticate()
            .await?;

@@ -181,12 +185,10 @@ async fn dummy_proxy(

    auth.authenticate(&mut stream).await?;

-    stream.write_message(BeMessage::ParameterStatus {
-        name: b"client_encoding",
-        value: b"UTF8",
-    });
-    stream.write_message(BeMessage::ReadyForQuery);
-    stream.flush().await?;
+    stream
+        .write_message_noflush(&Be::CLIENT_ENCODING)?
+        .write_message(&Be::ReadyForQuery)
+        .await?;

    Ok(())
 }
@@ -545,9 +547,20 @@ impl TestControlPlaneClient for TestConnectMechanism {
        }
    }

-    fn get_access_control(
+    fn get_allowed_ips(&self) -> Result<CachedAllowedIps, control_plane::errors::GetAuthInfoError> {
+        unimplemented!("not used in tests")
+    }
+
+    fn get_allowed_vpc_endpoint_ids(
        &self,
-    ) -> Result<control_plane::EndpointAccessControl, control_plane::errors::GetAuthInfoError> {
+    ) -> Result<CachedAllowedVpcEndpointIds, control_plane::errors::GetAuthInfoError> {
+        unimplemented!("not used in tests")
+    }
+
+    fn get_block_public_or_vpc_access(
+        &self,
+    ) -> Result<control_plane::CachedAccessBlockerFlags, control_plane::errors::GetAuthInfoError>
+    {
        unimplemented!("not used in tests")
    }

--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -15,7 +15,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;

 pub struct LeakyBucketRateLimiter<Key> {
    map: ClashMap<Key, LeakyBucketState, RandomState>,
-    default_config: utils::leaky_bucket::LeakyBucketConfig,
+    config: utils::leaky_bucket::LeakyBucketConfig,
    access_count: AtomicUsize,
 }

@@ -28,17 +28,15 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
    pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
        Self {
            map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
-            default_config: config.into(),
+            config: config.into(),
            access_count: AtomicUsize::new(0),
        }
    }

    /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub(crate) fn check(&self, key: K, config: Option<LeakyBucketConfig>, n: u32) -> bool {
+    pub(crate) fn check(&self, key: K, n: u32) -> bool {
        let now = Instant::now();

-        let config = config.map_or(self.default_config, Into::into);
-
        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
            self.do_gc(now);
        }
@@ -48,7 +46,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
            .entry(key)
            .or_insert_with(|| LeakyBucketState { empty_at: now });

-        entry.add_tokens(&config, now, n as f64).is_ok()
+        entry.add_tokens(&self.config, now, n as f64).is_ok()
    }

    fn do_gc(&self, now: Instant) {
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -15,8 +15,6 @@ use tracing::info;
 use crate::ext::LockExt;
 use crate::intern::EndpointIdInt;

-use super::LeakyBucketConfig;
-
 pub struct GlobalRateLimiter {
    data: Vec<RateBucket>,
    info: Vec<RateBucketInfo>,
@@ -146,6 +144,19 @@ impl RateBucketInfo {
        Self::new(50_000, Duration::from_secs(10)),
    ];

+    /// All of these are per endpoint-maskedip pair.
+    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
+    ///
+    /// First bucket: 1000mcpus total per endpoint-ip pair
+    /// * 4096000 requests per second with 1 hash round.
+    /// * 1000 requests per second with 4096 hash rounds.
+    /// * 6.8 requests per second with 600000 hash rounds.
+    pub const DEFAULT_AUTH_SET: [Self; 3] = [
+        Self::new(1000 * 4096, Duration::from_secs(1)),
+        Self::new(600 * 4096, Duration::from_secs(60)),
+        Self::new(300 * 4096, Duration::from_secs(600)),
+    ];
+
    pub fn rps(&self) -> f64 {
        (self.max_rpi as f64) / self.interval.as_secs_f64()
    }
@@ -173,21 +184,6 @@ impl RateBucketInfo {
            max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32,
        }
    }
-
-    pub fn to_leaky_bucket(this: &[Self]) -> Option<LeakyBucketConfig> {
-        // bit of a hack - find the min rps and max rps supported and turn it into
-        // leaky bucket config instead
-
-        let mut iter = this.iter().map(|info| info.rps());
-        let first = iter.next()?;
-
-        let (min, max) = (first, first);
-        let (min, max) = iter.fold((min, max), |(min, max), rps| {
-            (f64::min(min, rps), f64::max(max, rps))
-        });
-
-        Some(LeakyBucketConfig { rps: min, max })
-    }
 }

 impl<K: Hash + Eq> BucketRateLimiter<K> {
--- a/proxy/src/rate_limiter/mod.rs
+++ b/proxy/src/rate_limiter/mod.rs
@@ -8,4 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd;
 pub(crate) use limit_algorithm::{
    DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
 };
-pub use limiter::{GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
+pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -1,11 +1,10 @@
 use core::net::IpAddr;
 use std::sync::Arc;

+use pq_proto::CancelKeyData;
 use tokio::sync::Mutex;
 use uuid::Uuid;

-use crate::pqproto::CancelKeyData;
-
 pub trait CancellationPublisherMut: Send + Sync + 'static {
    #[allow(async_fn_in_trait)]
    async fn try_publish(
--- a/proxy/src/redis/keys.rs
+++ b/proxy/src/redis/keys.rs
@@ -1,15 +1,16 @@
 use std::io::ErrorKind;

 use anyhow::Ok;
-
-use crate::pqproto::{CancelKeyData, id_to_cancel_key};
+use pq_proto::{CancelKeyData, id_to_cancel_key};
+use serde::{Deserialize, Serialize};

 pub mod keyspace {
    pub const CANCEL_PREFIX: &str = "cancel";
 }

-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub(crate) enum KeyPrefix {
+    #[serde(untagged)]
    Cancel(CancelKeyData),
 }

@@ -17,7 +18,9 @@ impl KeyPrefix {
    pub(crate) fn build_redis_key(&self) -> String {
        match self {
            KeyPrefix::Cancel(key) => {
-                let id = key.0.get();
+                let hi = (key.backend_pid as u64) << 32;
+                let lo = (key.cancel_key as u64) & 0xffff_ffff;
+                let id = hi | lo;
                let keyspace = keyspace::CANCEL_PREFIX;
                format!("{keyspace}:{id:x}")
            }
@@ -60,7 +63,10 @@ mod tests {

    #[test]
    fn test_build_redis_key() {
-        let cancel_key: KeyPrefix = KeyPrefix::Cancel(id_to_cancel_key(12345 << 32 | 54321));
+        let cancel_key: KeyPrefix = KeyPrefix::Cancel(CancelKeyData {
+            backend_pid: 12345,
+            cancel_key: 54321,
+        });

        let redis_key = cancel_key.build_redis_key();
        assert_eq!(redis_key, "cancel:30390000d431");
@@ -71,7 +77,10 @@ mod tests {
        let redis_key = "cancel:30390000d431";
        let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key");

-        let ref_key = id_to_cancel_key(12345 << 32 | 54321);
+        let ref_key = CancelKeyData {
+            backend_pid: 12345,
+            cancel_key: 54321,
+        };

        assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str());
        let KeyPrefix::Cancel(cancel_key) = key;
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -2,9 +2,11 @@ use std::convert::Infallible;
 use std::sync::Arc;

 use futures::StreamExt;
+use pq_proto::CancelKeyData;
 use redis::aio::PubSub;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
+use uuid::Uuid;

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::cache::project_info::ProjectInfoCache;
@@ -98,6 +100,14 @@ pub(crate) struct PasswordUpdate {
    role_name: RoleNameInt,
 }

+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct CancelSession {
+    pub(crate) region_id: Option<String>,
+    pub(crate) cancel_key_data: CancelKeyData,
+    pub(crate) session_id: Uuid,
+    pub(crate) peer_addr: Option<std::net::IpAddr>,
+}
+
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
    T: for<'de2> serde::Deserialize<'de2>,
@@ -233,30 +243,29 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {

 fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
    match msg {
-        Notification::AllowedIpsUpdate {
-            allowed_ips_update: AllowedIpsUpdate { project_id },
+        Notification::AllowedIpsUpdate { allowed_ips_update } => {
+            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
        }
-        | Notification::BlockPublicOrVpcAccessUpdated {
-            block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated { project_id },
-        } => cache.invalidate_endpoint_access_for_project(project_id),
+        Notification::BlockPublicOrVpcAccessUpdated {
+            block_public_or_vpc_access_updated,
+        } => cache.invalidate_block_public_or_vpc_access_for_project(
+            block_public_or_vpc_access_updated.project_id,
+        ),
        Notification::AllowedVpcEndpointsUpdatedForOrg {
-            allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg { account_id },
-        } => cache.invalidate_endpoint_access_for_org(account_id),
+            allowed_vpc_endpoints_updated_for_org,
+        } => cache.invalidate_allowed_vpc_endpoint_ids_for_org(
+            allowed_vpc_endpoints_updated_for_org.account_id,
+        ),
        Notification::AllowedVpcEndpointsUpdatedForProjects {
-            allowed_vpc_endpoints_updated_for_projects:
-                AllowedVpcEndpointsUpdatedForProjects { project_ids },
-        } => {
-            for project in project_ids {
-                cache.invalidate_endpoint_access_for_project(project);
-            }
-        }
-        Notification::PasswordUpdate {
-            password_update:
-                PasswordUpdate {
-                    project_id,
-                    role_name,
-                },
-        } => cache.invalidate_role_secret_for_project(project_id, role_name),
+            allowed_vpc_endpoints_updated_for_projects,
+        } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects(
+            allowed_vpc_endpoints_updated_for_projects.project_ids,
+        ),
+        Notification::PasswordUpdate { password_update } => cache
+            .invalidate_role_secret_for_project(
+                password_update.project_id,
+                password_update.role_name,
+            ),
        Notification::UnknownTopic => unreachable!(),
    }
 }
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -1,5 +1,7 @@
 //! Definitions for SASL messages.

+use pq_proto::{BeAuthenticationSaslMessage, BeMessage};
+
 use crate::parse::split_cstr;

 /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage).
@@ -28,6 +30,26 @@ impl<'a> FirstMessage<'a> {
    }
 }

+/// A single SASL message.
+/// This type is deliberately decoupled from the lower-level
+/// [`BeAuthenticationSaslMessage`].
+#[derive(Debug)]
+pub(super) enum ServerMessage<T> {
+    /// We expect to see more steps.
+    Continue(T),
+    /// This is the final step.
+    Final(T),
+}
+
+impl<'a> ServerMessage<&'a str> {
+    pub(super) fn to_reply(&self) -> BeMessage<'a> {
+        BeMessage::AuthenticationSasl(match self {
+            ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()),
+            ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()),
+        })
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/sasl/mod.rs
+++ b/proxy/src/sasl/mod.rs
@@ -14,7 +14,7 @@ use std::io;

 pub(crate) use channel_binding::ChannelBinding;
 pub(crate) use messages::FirstMessage;
-pub(crate) use stream::{Outcome, authenticate};
+pub(crate) use stream::{Outcome, SaslStream};
 use thiserror::Error;

 use crate::error::{ReportableError, UserFacingError};
@@ -22,9 +22,6 @@ use crate::error::{ReportableError, UserFacingError};
 /// Fine-grained auth errors help in writing tests.
 #[derive(Error, Debug)]
 pub(crate) enum Error {
-    #[error("Unsupported authentication method: {0}")]
-    BadAuthMethod(Box<str>),
-
    #[error("Channel binding failed: {0}")]
    ChannelBindingFailed(&'static str),

@@ -57,7 +54,6 @@ impl UserFacingError for Error {
 impl ReportableError for Error {
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
-            Error::BadAuthMethod(_) => crate::error::ErrorKind::User,
            Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -3,12 +3,61 @@
 use std::io;

 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;

-use super::{Mechanism, Step};
-use crate::context::RequestContext;
-use crate::pqproto::{BeAuthenticationSaslMessage, BeMessage};
+use super::Mechanism;
+use super::messages::ServerMessage;
 use crate::stream::PqStream;

+/// Abstracts away all peculiarities of libpq's protocol.
+pub(crate) struct SaslStream<'a, S> {
+    /// The underlying stream.
+    stream: &'a mut PqStream<S>,
+    /// Current password message we received from client.
+    current: bytes::Bytes,
+    /// First SASL message produced by client.
+    first: Option<&'a str>,
+}
+
+impl<'a, S> SaslStream<'a, S> {
+    pub(crate) fn new(stream: &'a mut PqStream<S>, first: &'a str) -> Self {
+        Self {
+            stream,
+            current: bytes::Bytes::new(),
+            first: Some(first),
+        }
+    }
+}
+
+impl<S: AsyncRead + Unpin> SaslStream<'_, S> {
+    // Receive a new SASL message from the client.
+    async fn recv(&mut self) -> io::Result<&str> {
+        if let Some(first) = self.first.take() {
+            return Ok(first);
+        }
+
+        self.current = self.stream.read_password_message().await?;
+        let s = std::str::from_utf8(&self.current)
+            .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "bad encoding"))?;
+
+        Ok(s)
+    }
+}
+
+impl<S: AsyncWrite + Unpin> SaslStream<'_, S> {
+    // Send a SASL message to the client.
+    async fn send(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> {
+        self.stream.write_message(&msg.to_reply()).await?;
+        Ok(())
+    }
+
+    // Queue a SASL message for the client.
+    fn send_noflush(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> {
+        self.stream.write_message_noflush(&msg.to_reply())?;
+        Ok(())
+    }
+}
+
 /// SASL authentication outcome.
 /// It's much easier to match on those two variants
 /// than to peek into a noisy protocol error type.
@@ -20,63 +69,33 @@ pub(crate) enum Outcome<R> {
    Failure(&'static str),
 }

-pub async fn authenticate<S, F, M>(
-    ctx: &RequestContext,
-    stream: &mut PqStream<S>,
-    mechanism: F,
-) -> super::Result<Outcome<M::Output>>
-where
-    S: AsyncRead + AsyncWrite + Unpin,
-    F: FnOnce(&str) -> super::Result<M>,
-    M: Mechanism,
-{
-    let (mut mechanism, mut input) = {
-        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
+    /// Perform SASL message exchange according to the underlying algorithm
+    /// until user is either authenticated or denied access.
+    pub(crate) async fn authenticate<M: Mechanism>(
+        mut self,
+        mut mechanism: M,
+    ) -> super::Result<Outcome<M::Output>> {
+        loop {
+            let input = self.recv().await?;
+            let step = mechanism.exchange(input).map_err(|error| {
+                info!(?error, "error during SASL exchange");
+                error
+            })?;

-        // Initial client message contains the chosen auth method's name.
-        let msg = stream.read_password_message().await?;
-
-        let sasl = super::FirstMessage::parse(msg)
-            .ok_or(super::Error::BadClientMessage("bad sasl message"))?;
-
-        (mechanism(sasl.method)?, sasl.message)
-    };
-
-    loop {
-        match mechanism.exchange(input) {
-            Ok(Step::Continue(moved_mechanism, reply)) => {
-                mechanism = moved_mechanism;
-
-                // write reply
-                let sasl_msg = BeAuthenticationSaslMessage::Continue(reply.as_bytes());
-                stream.write_message(BeMessage::AuthenticationSasl(sasl_msg));
-                drop(reply);
-            }
-            Ok(Step::Success(result, reply)) => {
-                // write reply
-                let sasl_msg = BeAuthenticationSaslMessage::Final(reply.as_bytes());
-                stream.write_message(BeMessage::AuthenticationSasl(sasl_msg));
-                stream.write_message(BeMessage::AuthenticationOk);
-
-                // exit with success
-                break Ok(Outcome::Success(result));
-            }
-            // exit with failure
-            Ok(Step::Failure(reason)) => break Ok(Outcome::Failure(reason)),
-            Err(error) => {
-                tracing::info!(?error, "error during SASL exchange");
-                return Err(error);
-            }
+            use super::Step;
+            return Ok(match step {
+                Step::Continue(moved_mechanism, reply) => {
+                    self.send(&ServerMessage::Continue(&reply)).await?;
+                    mechanism = moved_mechanism;
+                    continue;
+                }
+                Step::Success(result, reply) => {
+                    self.send_noflush(&ServerMessage::Final(&reply))?;
+                    Outcome::Success(result)
+                }
+                Step::Failure(reason) => Outcome::Failure(reason),
+            });
        }
-
-        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
-
-        // get next input
-        stream.flush().await?;
-        let msg = stream.read_password_message().await?;
-        input = std::str::from_utf8(msg)
-            .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "bad encoding"))?;
    }
 }
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -22,7 +22,7 @@ use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client};
 use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool};
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
-use crate::auth::{self, AuthError};
+use crate::auth::{self, AuthError, check_peer_addr_is_in_list};
 use crate::compute;
 use crate::compute_ctl::{
    ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
@@ -35,6 +35,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
 use crate::control_plane::locks::ApiLocks;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::intern::EndpointIdInt;
+use crate::protocol2::ConnectionInfoExtra;
 use crate::proxy::connect_compute::ConnectMechanism;
 use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
 use crate::rate_limiter::EndpointRateLimiter;
@@ -62,24 +63,63 @@ impl PoolingBackend {

        let user_info = user_info.clone();
        let backend = self.auth_backend.as_ref().map(|()| user_info.clone());
-        let access_control = backend.get_endpoint_access_control(ctx).await?;
-        access_control.check(
-            ctx,
-            self.config.authentication_config.ip_allowlist_check_enabled,
-            self.config.authentication_config.is_vpc_acccess_proxy,
-        )?;
+        let allowed_ips = backend.get_allowed_ips(ctx).await?;

-        let ep = EndpointIdInt::from(&user_info.endpoint);
-        let rate_limit_config = None;
-        if !self.endpoint_rate_limiter.check(ep, rate_limit_config, 1) {
+        if self.config.authentication_config.ip_allowlist_check_enabled
+            && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
+        {
+            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
+        }
+
+        let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?;
+        if self.config.authentication_config.is_vpc_acccess_proxy {
+            if access_blocker_flags.vpc_access_blocked {
+                return Err(AuthError::NetworkNotAllowed);
+            }
+
+            let extra = ctx.extra();
+            let incoming_endpoint_id = match extra {
+                None => String::new(),
+                Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(),
+                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+            };
+
+            if incoming_endpoint_id.is_empty() {
+                return Err(AuthError::MissingVPCEndpointId);
+            }
+
+            let allowed_vpc_endpoint_ids = backend.get_allowed_vpc_endpoint_ids(ctx).await?;
+            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+            if !allowed_vpc_endpoint_ids.is_empty()
+                && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id)
+            {
+                return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id));
+            }
+        } else if access_blocker_flags.public_access_blocked {
+            return Err(AuthError::NetworkNotAllowed);
+        }
+
+        if !self
+            .endpoint_rate_limiter
+            .check(user_info.endpoint.clone().into(), 1)
+        {
            return Err(AuthError::too_many_connections());
        }
-        let role_access = backend.get_role_secret(ctx).await?;
-        let Some(secret) = role_access.secret else {
-            // If we don't have an authentication secret, for the http flow we can just return an error.
-            info!("authentication info not found");
-            return Err(AuthError::password_failed(&*user_info.user));
+        let cached_secret = backend.get_role_secret(ctx).await?;
+        let secret = match cached_secret.value.clone() {
+            Some(secret) => self.config.authentication_config.check_rate_limit(
+                ctx,
+                secret,
+                &user_info.endpoint,
+                true,
+            )?,
+            None => {
+                // If we don't have an authentication secret, for the http flow we can just return an error.
+                info!("authentication info not found");
+                return Err(AuthError::password_failed(&*user_info.user));
+            }
        };
+        let ep = EndpointIdInt::from(&user_info.endpoint);
        let auth_outcome = crate::auth::validate_password_and_exchange(
            &self.config.authentication_config.thread_pool,
            ep,
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -17,6 +17,7 @@ use postgres_client::error::{DbError, ErrorPosition, SqlState};
 use postgres_client::{
    GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
 };
+use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
 use serde_json::value::RawValue;
@@ -40,7 +41,6 @@ use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::http::{ReadBodyError, read_body_with_limit};
 use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind};
-use crate::pqproto::StartupMessageParams;
 use crate::proxy::{NeonOptions, run_until_cancelled};
 use crate::serverless::backend::HttpConnError;
 use crate::types::{DbName, RoleName};
@@ -219,7 +219,7 @@ fn get_conn_info(

    let mut options = Option::None;

-    let mut params = StartupMessageParams::default();
+    let mut params = StartupMessageParamsBuilder::default();
    params.insert("user", &username);
    params.insert("database", &dbname);
    for (key, value) in pairs {
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -2,17 +2,19 @@ use std::pin::Pin;
 use std::sync::Arc;
 use std::{io, task};

+use bytes::BytesMut;
+use pq_proto::framed::{ConnectionError, Framed};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
 use rustls::ServerConfig;
+use serde::{Deserialize, Serialize};
 use thiserror::Error;
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
+use tracing::debug;

+use crate::control_plane::messages::ColdStartInfo;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::Metrics;
-use crate::pqproto::{
-    BeMessage, FE_PASSWORD_MESSAGE, FeStartupPacket, SQLSTATE_INTERNAL_ERROR, WriteBuf,
-    read_message, read_startup,
-};
 use crate::tls::TlsServerEndPoint;

 /// Stream wrapper which implements libpq's protocol.
@@ -21,77 +23,58 @@ use crate::tls::TlsServerEndPoint;
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
 pub struct PqStream<S> {
-    stream: S,
-    read: Vec<u8>,
-    write: WriteBuf,
+    pub(crate) framed: Framed<S>,
 }

 impl<S> PqStream<S> {
-    pub fn get_ref(&self) -> &S {
-        &self.stream
+    /// Construct a new libpq protocol wrapper.
+    pub fn new(stream: S) -> Self {
+        Self {
+            framed: Framed::new(stream),
+        }
    }

-    /// Construct a new libpq protocol wrapper over a stream without the first startup message.
-    #[cfg(test)]
-    pub fn new_skip_handshake(stream: S) -> Self {
-        Self {
-            stream,
-            read: Vec::new(),
-            write: WriteBuf::new(),
-        }
+    /// Extract the underlying stream and read buffer.
+    pub fn into_inner(self) -> (S, BytesMut) {
+        self.framed.into_inner()
+    }
+
+    /// Get a shared reference to the underlying stream.
+    pub(crate) fn get_ref(&self) -> &S {
+        self.framed.get_ref()
    }
 }

-impl<S: AsyncRead + AsyncWrite + Unpin> PqStream<S> {
-    /// Construct a new libpq protocol wrapper and read the first startup message.
-    ///
-    /// This is not cancel safe.
-    pub async fn parse_startup(mut stream: S) -> io::Result<(Self, FeStartupPacket)> {
-        let startup = read_startup(&mut stream).await?;
-        Ok((
-            Self {
-                stream,
-                read: Vec::new(),
-                write: WriteBuf::new(),
-            },
-            startup,
-        ))
-    }
-
-    /// Tell the client that encryption is not supported.
-    ///
-    /// This is not cancel safe
-    pub async fn reject_encryption(&mut self) -> io::Result<FeStartupPacket> {
-        // N for No.
-        self.write.encryption(b'N');
-        self.flush().await?;
-        read_startup(&mut self.stream).await
-    }
+fn err_connection() -> io::Error {
+    io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
 }

 impl<S: AsyncRead + Unpin> PqStream<S> {
-    /// Read a raw postgres packet, which will respect the max length requested.
-    /// This is not cancel safe.
-    async fn read_raw_expect(&mut self, tag: u8, max: u32) -> io::Result<&mut [u8]> {
-        let (actual_tag, msg) = read_message(&mut self.stream, &mut self.read, max).await?;
-        if actual_tag != tag {
-            return Err(io::Error::other(format!(
-                "incorrect message tag, expected {:?}, got {:?}",
-                tag as char, actual_tag as char,
-            )));
-        }
-        Ok(msg)
+    /// Receive [`FeStartupPacket`], which is the first packet sent by a client.
+    pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
+        self.framed
+            .read_startup_message()
+            .await
+            .map_err(ConnectionError::into_io_error)?
+            .ok_or_else(err_connection)
    }

-    /// Read a postgres password message, which will respect the max length requested.
-    /// This is not cancel safe.
-    pub async fn read_password_message(&mut self) -> io::Result<&mut [u8]> {
-        // passwords are usually pretty short
-        // and SASL SCRAM messages are no longer than 256 bytes in my testing
-        // (a few hashes and random bytes, encoded into base64).
-        const MAX_PASSWORD_LENGTH: u32 = 512;
-        self.read_raw_expect(FE_PASSWORD_MESSAGE, MAX_PASSWORD_LENGTH)
+    async fn read_message(&mut self) -> io::Result<FeMessage> {
+        self.framed
+            .read_message()
            .await
+            .map_err(ConnectionError::into_io_error)?
+            .ok_or_else(err_connection)
+    }
+
+    pub(crate) async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
+        match self.read_message().await? {
+            FeMessage::PasswordMessage(msg) => Ok(msg),
+            bad => Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("unexpected message type: {bad:?}"),
+            )),
+        }
    }
 }

@@ -101,16 +84,6 @@ pub struct ReportedError {
    error_kind: ErrorKind,
 }

-impl ReportedError {
-    pub fn new(e: (impl UserFacingError + Into<anyhow::Error>)) -> Self {
-        let error_kind = e.get_error_kind();
-        Self {
-            source: e.into(),
-            error_kind,
-        }
-    }
-}
-
 impl std::fmt::Display for ReportedError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.source.fmt(f)
@@ -129,65 +102,109 @@ impl ReportableError for ReportedError {
    }
 }

+#[derive(Serialize, Deserialize, Debug)]
+enum ErrorTag {
+    #[serde(rename = "proxy")]
+    Proxy,
+    #[serde(rename = "compute")]
+    Compute,
+    #[serde(rename = "client")]
+    Client,
+    #[serde(rename = "controlplane")]
+    ControlPlane,
+    #[serde(rename = "other")]
+    Other,
+}
+
+impl From<ErrorKind> for ErrorTag {
+    fn from(error_kind: ErrorKind) -> Self {
+        match error_kind {
+            ErrorKind::User => Self::Client,
+            ErrorKind::ClientDisconnect => Self::Client,
+            ErrorKind::RateLimit => Self::Proxy,
+            ErrorKind::ServiceRateLimit => Self::Proxy, // considering rate limit as proxy error for SLI
+            ErrorKind::Quota => Self::Proxy,
+            ErrorKind::Service => Self::Proxy,
+            ErrorKind::ControlPlane => Self::ControlPlane,
+            ErrorKind::Postgres => Self::Other,
+            ErrorKind::Compute => Self::Compute,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(rename_all = "snake_case")]
+struct ProbeErrorData {
+    tag: ErrorTag,
+    msg: String,
+    cold_start_info: Option<ColdStartInfo>,
+}
+
 impl<S: AsyncWrite + Unpin> PqStream<S> {
-    /// Tell the client that we are willing to accept SSL.
-    /// This is not cancel safe
-    pub async fn accept_tls(mut self) -> io::Result<S> {
-        // S for SSL.
-        self.write.encryption(b'S');
-        self.flush().await?;
-        Ok(self.stream)
-    }
-
-    /// Assert that we are using direct TLS.
-    pub fn accept_direct_tls(self) -> S {
-        self.stream
-    }
-
-    /// Write a raw message to the internal buffer.
-    pub fn write_raw(&mut self, size_hint: usize, tag: u8, f: impl FnOnce(&mut Vec<u8>)) {
-        self.write.write_raw(size_hint, tag, f);
-    }
-
-    /// Write the message into an internal buffer
-    pub fn write_message(&mut self, message: BeMessage<'_>) {
-        message.write_message(&mut self.write);
-    }
-
-    /// Flush the output buffer into the underlying stream.
-    ///
-    /// This is cancel safe.
-    pub async fn flush(&mut self) -> io::Result<()> {
-        self.stream.write_all_buf(&mut self.write).await?;
-        self.write.reset();
-
-        self.stream.flush().await?;
-
-        Ok(())
-    }
-
-    /// Flush the output buffer into the underlying stream.
-    ///
-    /// This is cancel safe.
-    pub async fn flush_and_into_inner(mut self) -> io::Result<S> {
-        self.flush().await?;
-        Ok(self.stream)
-    }
-
-    /// Write the error message to the client, then re-throw it.
-    ///
-    /// Trait [`UserFacingError`] acts as an allowlist for error types.
-    /// If `ctx` is provided and has testodrome_id set, error messages will be prefixed according to error kind.
-    pub(crate) async fn throw_error<E>(
+    /// Write the message into an internal buffer, but don't flush the underlying stream.
+    pub(crate) fn write_message_noflush(
        &mut self,
-        error: E,
+        message: &BeMessage<'_>,
+    ) -> io::Result<&mut Self> {
+        self.framed
+            .write_message(message)
+            .map_err(ProtocolError::into_io_error)?;
+        Ok(self)
+    }
+
+    /// Write the message into an internal buffer and flush it.
+    pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
+        self.write_message_noflush(message)?;
+        self.flush().await?;
+        Ok(self)
+    }
+
+    /// Flush the output buffer into the underlying stream.
+    pub(crate) async fn flush(&mut self) -> io::Result<&mut Self> {
+        self.framed.flush().await?;
+        Ok(self)
+    }
+
+    /// Writes `msg` to the stream as an `ErrorResponse` and returns the formatted text.
+    /// If `ctx` has a testodrome id (probe queries), `msg` is wrapped in JSON with an error tag.
+    async fn write_format_message(
+        &mut self,
+        msg: &str,
+        error_kind: ErrorKind,
        ctx: Option<&crate::context::RequestContext>,
-    ) -> ReportedError
-    where
-        E: UserFacingError + Into<anyhow::Error>,
-    {
-        let error_kind = error.get_error_kind();
-        let msg = error.to_string_client();
+    ) -> String {
+        let formatted_msg = match ctx {
+            Some(ctx) if ctx.get_testodrome_id().is_some() => {
+                serde_json::to_string(&ProbeErrorData {
+                    tag: ErrorTag::from(error_kind),
+                    msg: msg.to_string(),
+                    cold_start_info: Some(ctx.cold_start_info()),
+                })
+                .unwrap_or_default()
+            }
+            _ => msg.to_string(),
+        };
+
+        // already error case, ignore client IO error
+        self.write_message(&BeMessage::ErrorResponse(&formatted_msg, None))
+            .await
+            .inspect_err(|e| debug!("write_message failed: {e}"))
+            .ok();
+
+        formatted_msg
+    }
+
+    /// Write the error message using [`Self::write_format_message`], then re-throw it.
+    /// Allowing string literals is safe under the assumption that they do not contain any runtime info.
+    /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
+    /// If `ctx` is provided and has testodrome_id set, the error message is sent as JSON tagged by error kind.
+    pub async fn throw_error_str<T>(
+        &mut self,
+        msg: &'static str,
+        error_kind: ErrorKind,
+        ctx: Option<&crate::context::RequestContext>,
+    ) -> Result<T, ReportedError> {
+        self.write_format_message(msg, error_kind, ctx).await;

        if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User {
            tracing::info!(
@@ -197,39 +214,39 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
            );
        }

-        let probe_msg;
-        let mut msg = &*msg;
-        if let Some(ctx) = ctx {
-            if ctx.get_testodrome_id().is_some() {
-                let tag = match error_kind {
-                    ErrorKind::User => "client",
-                    ErrorKind::ClientDisconnect => "client",
-                    ErrorKind::RateLimit => "proxy",
-                    ErrorKind::ServiceRateLimit => "proxy",
-                    ErrorKind::Quota => "proxy",
-                    ErrorKind::Service => "proxy",
-                    ErrorKind::ControlPlane => "controlplane",
-                    ErrorKind::Postgres => "other",
-                    ErrorKind::Compute => "compute",
-                };
-                probe_msg = typed_json::json!({
-                    "tag": tag,
-                    "msg": msg,
-                    "cold_start_info": ctx.cold_start_info(),
-                })
-                .to_string();
-                msg = &probe_msg;
-            }
+        Err(ReportedError {
+            source: anyhow::anyhow!(msg),
+            error_kind,
+        })
+    }
+
+    /// Write the error message using [`Self::write_format_message`], then re-throw it.
+    /// Trait [`UserFacingError`] acts as an allowlist for error types.
+    /// If `ctx` is provided and has testodrome_id set, the error message is sent as a JSON object tagged according to error kind.
+    pub(crate) async fn throw_error<T, E>(
+        &mut self,
+        error: E,
+        ctx: Option<&crate::context::RequestContext>,
+    ) -> Result<T, ReportedError>
+    where
+        E: UserFacingError + Into<anyhow::Error>,
+    {
+        let error_kind = error.get_error_kind();
+        let msg = error.to_string_client();
+        self.write_format_message(&msg, error_kind, ctx).await;
+        if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User {
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%error,
+                msg,
+                "forwarding error to user",
+            );
        }

-        // TODO: either preserve the error code from postgres, or assign error codes to proxy errors.
-        self.write.write_error(msg, SQLSTATE_INTERNAL_ERROR);
-
-        self.flush()
-            .await
-            .unwrap_or_else(|e| tracing::debug!("write_message failed: {e}"));
-
-        ReportedError::new(error)
+        Err(ReportedError {
+            source: anyhow::anyhow!(error),
+            error_kind,
+        })
    }
 }

--- a/proxy/src/tls/postgres_rustls.rs
+++ b/proxy/src/tls/postgres_rustls.rs
@@ -31,9 +31,7 @@ mod private {
        type Output = io::Result<RustlsStream<S>>;

        fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-            Pin::new(&mut self.inner)
-                .poll(cx)
-                .map_ok(|s| RustlsStream(Box::new(s)))
+            Pin::new(&mut self.inner).poll(cx).map_ok(RustlsStream)
        }
    }

@@ -59,7 +57,7 @@ mod private {
        }
    }

-    pub struct RustlsStream<S>(Box<TlsStream<S>>);
+    pub struct RustlsStream<S>(TlsStream<S>);

    impl<S> postgres_client::tls::TlsStream for RustlsStream<S>
    where
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -44,7 +44,6 @@ struct GlobalTimelinesState {
    // on-demand timeline creation from recreating deleted timelines.  This is only soft-enforced, as
    // this map is dropped on restart.
    tombstones: HashMap<TenantTimelineId, Instant>,
-    tenant_tombstones: HashMap<TenantId, Instant>,

    conf: Arc<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
@@ -82,25 +81,10 @@ impl GlobalTimelinesState {
        }
    }

-    fn has_tombstone(&self, ttid: &TenantTimelineId) -> bool {
-        self.tombstones.contains_key(ttid) || self.tenant_tombstones.contains_key(&ttid.tenant_id)
-    }
-
-    /// Removes all blocking tombstones for the given timeline ID.
-    /// Returns `true` if there have been actual changes.
-    fn remove_tombstone(&mut self, ttid: &TenantTimelineId) -> bool {
-        self.tombstones.remove(ttid).is_some()
-            || self.tenant_tombstones.remove(&ttid.tenant_id).is_some()
-    }
-
    fn delete(&mut self, ttid: TenantTimelineId) {
        self.timelines.remove(&ttid);
        self.tombstones.insert(ttid, Instant::now());
    }
-
-    fn add_tenant_tombstone(&mut self, tenant_id: TenantId) {
-        self.tenant_tombstones.insert(tenant_id, Instant::now());
-    }
 }

 /// A struct used to manage access to the global timelines map.
@@ -115,7 +99,6 @@ impl GlobalTimelines {
            state: Mutex::new(GlobalTimelinesState {
                timelines: HashMap::new(),
                tombstones: HashMap::new(),
-                tenant_tombstones: HashMap::new(),
                conf,
                broker_active_set: Arc::new(TimelinesSet::default()),
                global_rate_limiter: RateLimiter::new(1, 1),
@@ -262,7 +245,7 @@ impl GlobalTimelines {
                return Ok(timeline);
            }

-            if state.has_tombstone(&ttid) {
+            if state.tombstones.contains_key(&ttid) {
                anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate");
            }

@@ -312,14 +295,13 @@ impl GlobalTimelines {
                _ => {}
            }
            if check_tombstone {
-                if state.has_tombstone(&ttid) {
+                if state.tombstones.contains_key(&ttid) {
                    anyhow::bail!("timeline {ttid} is deleted, refusing to recreate");
                }
            } else {
                // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`).  We trust
                // that the human doing this manual intervention knows what they are doing, and remove its tombstone.
-                // It's also possible that we enter this when the tenant has been deleted, even if the timeline itself has never existed.
-                if state.remove_tombstone(&ttid) {
+                if state.tombstones.remove(&ttid).is_some() {
                    warn!("un-deleted timeline {ttid}");
                }
            }
@@ -500,7 +482,6 @@ impl GlobalTimelines {
        let tli_res = {
            let state = self.state.lock().unwrap();

-            // Do NOT check tenant tombstones here: those were set earlier
            if state.tombstones.contains_key(ttid) {
                // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
                info!("Timeline {ttid} was already deleted");
@@ -576,10 +557,6 @@ impl GlobalTimelines {
        action: DeleteOrExclude,
    ) -> Result<HashMap<TenantTimelineId, TimelineDeleteResult>> {
        info!("deleting all timelines for tenant {}", tenant_id);
-
-        // Adding a tombstone before getting the timelines to prevent new timeline additions
-        self.state.lock().unwrap().add_tenant_tombstone(*tenant_id);
-
        let to_delete = self.get_all_for_tenant(*tenant_id);

        let mut err = None;
@@ -623,9 +600,6 @@ impl GlobalTimelines {
        state
            .tombstones
            .retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
-        state
-            .tenant_tombstones
-            .retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
    }
 }

--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -482,10 +482,6 @@ async fn handle_tenant_timeline_delete(
        ForwardOutcome::NotForwarded(_req) => {}
    };

-    service
-        .maybe_delete_timeline_import(tenant_id, timeline_id)
-        .await?;
-
    // For timeline deletions, which both implement an "initially return 202, then 404 once
    // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
    async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -139,14 +139,6 @@ pub(crate) struct StorageControllerMetricGroup {
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_reconcile_long_running:
        measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
-
-    /// Indicator of safekeeper reconciler queue depth, broken down by safekeeper, excluding ongoing reconciles.
-    pub(crate) storage_controller_safkeeper_reconciles_queued:
-        measured::GaugeVec<SafekeeperReconcilerLabelGroupSet>,
-
-    /// Indicator of completed safekeeper reconciles, broken down by safekeeper.
-    pub(crate) storage_controller_safkeeper_reconciles_complete:
-        measured::CounterVec<SafekeeperReconcilerLabelGroupSet>,
 }

 impl StorageControllerMetrics {
@@ -265,17 +257,6 @@ pub(crate) enum Method {
    Other,
 }

-#[derive(measured::LabelGroup, Clone)]
-#[label(set = SafekeeperReconcilerLabelGroupSet)]
-pub(crate) struct SafekeeperReconcilerLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
-    pub(crate) sk_az: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
-    pub(crate) sk_node_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
-    pub(crate) sk_hostname: &'a str,
-}
-
 impl From<hyper::Method> for Method {
    fn from(value: hyper::Method) -> Self {
        if value == hyper::Method::GET {
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -99,8 +99,8 @@ use crate::tenant_shard::{
    ScheduleOptimization, ScheduleOptimizationAction, TenantShard,
 };
 use crate::timeline_import::{
-    FinalizingImport, ImportResult, ShardImportStatuses, TimelineImport,
-    TimelineImportFinalizeError, TimelineImportState, UpcallClient,
+    ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError,
+    TimelineImportState, UpcallClient,
 };

 const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
@@ -232,9 +232,6 @@ struct ServiceState {

    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
    delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
-
-    /// Tracks ongoing timeline import finalization tasks
-    imports_finalizing: BTreeMap<(TenantId, TimelineId), FinalizingImport>,
 }

 /// Transform an error from a pageserver into an error to return to callers of a storage
@@ -311,7 +308,6 @@ impl ServiceState {
            scheduler,
            ongoing_operation: None,
            delayed_reconcile_rx,
-            imports_finalizing: Default::default(),
        }
    }

@@ -4101,58 +4097,13 @@ impl Service {
    ///
    /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going
    /// imports are stored in the database).
-    ///
-    /// # Cancel-Safety
-    /// Not cancel safe.
-    /// If the caller stops polling, the import will not be removed from
-    /// [`ServiceState::imports_finalizing`].
    #[instrument(skip_all, fields(
        tenant_id=%import.tenant_id,
        timeline_id=%import.timeline_id,
    ))]
-
    async fn finalize_timeline_import(
        self: &Arc<Self>,
        import: TimelineImport,
-    ) -> Result<(), TimelineImportFinalizeError> {
-        let tenant_timeline = (import.tenant_id, import.timeline_id);
-
-        let (_finalize_import_guard, cancel) = {
-            let mut locked = self.inner.write().unwrap();
-            let gate = Gate::default();
-            let cancel = CancellationToken::default();
-
-            let guard = gate.enter().unwrap();
-
-            locked.imports_finalizing.insert(
-                tenant_timeline,
-                FinalizingImport {
-                    gate,
-                    cancel: cancel.clone(),
-                },
-            );
-
-            (guard, cancel)
-        };
-
-        let res = tokio::select! {
-            res = self.finalize_timeline_import_impl(import) => {
-                res
-            },
-            _ = cancel.cancelled() => {
-                Err(TimelineImportFinalizeError::Cancelled)
-            }
-        };
-
-        let mut locked = self.inner.write().unwrap();
-        locked.imports_finalizing.remove(&tenant_timeline);
-
-        res
-    }
-
-    async fn finalize_timeline_import_impl(
-        self: &Arc<Self>,
-        import: TimelineImport,
    ) -> Result<(), TimelineImportFinalizeError> {
        tracing::info!("Finalizing timeline import");

@@ -4352,46 +4303,6 @@ impl Service {
        .await;
    }

-    /// Delete a timeline import if it exists
-    ///
-    /// Firstly, delete the entry from the database. Any updates
-    /// from pageservers after the update will fail with a 404, so the
-    /// import cannot progress into finalizing state if it's not there already.
-    /// Secondly, cancel the finalization if one is in progress.
-    pub(crate) async fn maybe_delete_timeline_import(
-        self: &Arc<Self>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> Result<(), DatabaseError> {
-        let tenant_has_ongoing_import = {
-            let locked = self.inner.read().unwrap();
-            locked
-                .tenants
-                .range(TenantShardId::tenant_range(tenant_id))
-                .any(|(_tid, shard)| shard.importing == TimelineImportState::Importing)
-        };
-
-        if !tenant_has_ongoing_import {
-            return Ok(());
-        }
-
-        self.persistence
-            .delete_timeline_import(tenant_id, timeline_id)
-            .await?;
-
-        let maybe_finalizing = {
-            let mut locked = self.inner.write().unwrap();
-            locked.imports_finalizing.remove(&(tenant_id, timeline_id))
-        };
-
-        if let Some(finalizing) = maybe_finalizing {
-            finalizing.cancel.cancel();
-            finalizing.gate.close().await;
-        }
-
-        Ok(())
-    }
-
    pub(crate) async fn tenant_timeline_archival_config(
        &self,
        tenant_id: TenantId,
@@ -8627,9 +8538,8 @@ impl Service {
        Some(ShardCount(new_shard_count))
    }

-    /// Fetches the top tenant shards from every available node, in descending order of
-    /// max logical size. Offline nodes are skipped, and any errors from available nodes
-    /// will be logged and ignored.
+    /// Fetches the top tenant shards from every node, in descending order of
+    /// max logical size. Any node errors will be logged and ignored.
    async fn get_top_tenant_shards(
        &self,
        request: &TopTenantShardsRequest,
@@ -8640,7 +8550,6 @@ impl Service {
            .unwrap()
            .nodes
            .values()
-            .filter(|node| node.is_available())
            .cloned()
            .collect_vec();

--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -20,9 +20,7 @@ use utils::{
 };

 use crate::{
-    metrics::{METRICS_REGISTRY, SafekeeperReconcilerLabelGroup},
-    persistence::SafekeeperTimelineOpKind,
-    safekeeper::Safekeeper,
+    persistence::SafekeeperTimelineOpKind, safekeeper::Safekeeper,
    safekeeper_client::SafekeeperClient,
 };

@@ -220,26 +218,7 @@ impl ReconcilerHandle {
    fn schedule_reconcile(&self, req: ScheduleRequest) {
        let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
        let hostname = req.safekeeper.skp.host.clone();
-        let sk_az = req.safekeeper.skp.availability_zone_id.clone();
-        let sk_node_id = req.safekeeper.get_id().to_string();
-
-        // We don't have direct access to the queue depth here, so increase it blindly by 1.
-        // We know that putting into the queue increases the queue depth. The receiver will
-        // update with the correct value once it processes the next item. To avoid races where we
-        // reduce before we increase, leaving the gauge with a 1 value for a long time, we
-        // increase it before putting into the queue.
-        let queued_gauge = &METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_safkeeper_reconciles_queued;
-        let label_group = SafekeeperReconcilerLabelGroup {
-            sk_az: &sk_az,
-            sk_node_id: &sk_node_id,
-            sk_hostname: &hostname,
-        };
-        queued_gauge.inc(label_group.clone());
-
        if let Err(err) = self.tx.send((req, cancel, token_id)) {
-            queued_gauge.set(label_group, 0);
            tracing::info!("scheduling request onto {hostname} returned error: {err}");
        }
    }
@@ -304,18 +283,6 @@ impl SafekeeperReconciler {
                continue;
            }

-            let queued_gauge = &METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_safkeeper_reconciles_queued;
-            queued_gauge.set(
-                SafekeeperReconcilerLabelGroup {
-                    sk_az: &req.safekeeper.skp.availability_zone_id,
-                    sk_node_id: &req.safekeeper.get_id().to_string(),
-                    sk_hostname: &req.safekeeper.skp.host,
-                },
-                self.rx.len() as i64,
-            );
-
            tokio::task::spawn(async move {
                let kind = req.kind;
                let tenant_id = req.tenant_id;
@@ -544,16 +511,6 @@ impl SafekeeperReconcilerInner {
                            req.generation,
                        )
                        .await;
-
-                    let complete_counter = &METRICS_REGISTRY
-                        .metrics_group
-                        .storage_controller_safkeeper_reconciles_complete;
-                    complete_counter.inc(SafekeeperReconcilerLabelGroup {
-                        sk_az: &req.safekeeper.skp.availability_zone_id,
-                        sk_node_id: &req.safekeeper.get_id().to_string(),
-                        sk_hostname: &req.safekeeper.skp.host,
-                    });
-
                    if let Err(err) = res {
                        tracing::info!(
                            "couldn't remove reconciliation request onto {} from persistence: {err:?}",
--- a/storage_controller/src/timeline_import.rs
+++ b/storage_controller/src/timeline_import.rs
@@ -7,7 +7,6 @@ use serde::{Deserialize, Serialize};

 use pageserver_api::models::{ShardImportProgress, ShardImportStatus};
 use tokio_util::sync::CancellationToken;
-use utils::sync::gate::Gate;
 use utils::{
    id::{TenantId, TimelineId},
    shard::ShardIndex,
@@ -56,8 +55,6 @@ pub(crate) enum TimelineImportUpdateFollowUp {
 pub(crate) enum TimelineImportFinalizeError {
    #[error("Shut down interrupted import finalize")]
    ShuttingDown,
-    #[error("Import finalization was cancelled")]
-    Cancelled,
    #[error("Mismatched shard detected during import finalize: {0}")]
    MismatchedShards(ShardIndex),
 }
@@ -167,11 +164,6 @@ impl TimelineImport {
    }
 }

-pub(crate) struct FinalizingImport {
-    pub(crate) gate: Gate,
-    pub(crate) cancel: CancellationToken,
-}
-
 pub(crate) type ImportResult = Result<(), String>;

 pub(crate) struct UpcallClient {
--- a/test_runner/fixtures/fast_import.py
+++ b/test_runner/fixtures/fast_import.py
@@ -1,4 +1,3 @@
-import json
 import os
 import shutil
 import subprocess
@@ -12,7 +11,6 @@ from _pytest.config import Config

 from fixtures.log_helper import log
 from fixtures.neon_cli import AbstractNeonCli
-from fixtures.neon_fixtures import Endpoint, VanillaPostgres
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import MockS3Server

@@ -163,57 +161,3 @@ def fast_import(
            f.write(fi.cmd.stderr)

        log.info("Written logs to %s", test_output_dir)
-
-
-def mock_import_bucket(vanilla_pg: VanillaPostgres, path: Path):
-    """
-    Mock the import S3 bucket into a local directory for a provided vanilla PG instance.
-    """
-    assert not vanilla_pg.is_running()
-
-    path.mkdir()
-    # what cplane writes before scheduling fast_import
-    specpath = path / "spec.json"
-    specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"}))
-    # what fast_import writes
-    vanilla_pg.pgdatadir.rename(path / "pgdata")
-    statusdir = path / "status"
-    statusdir.mkdir()
-    (statusdir / "pgdata").write_text(json.dumps({"done": True}))
-    (statusdir / "fast_import").write_text(json.dumps({"command": "pgdata", "done": True}))
-
-
-def populate_vanilla_pg(vanilla_pg: VanillaPostgres, target_relblock_size: int) -> int:
-    assert vanilla_pg.is_running()
-
-    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
-    # fillfactor so we don't need to produce that much data
-    # 900 byte per row is > 10% => 1 row per page
-    vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""")
-
-    nrows = 0
-    while True:
-        relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')")
-        log.info(
-            f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages"
-        )
-        if relblock_size >= target_relblock_size:
-            break
-        addrows = int((target_relblock_size - relblock_size) // 8192)
-        assert addrows >= 1, "forward progress"
-        vanilla_pg.safe_psql(
-            f"insert into t select generate_series({nrows + 1}, {nrows + addrows})"
-        )
-        nrows += addrows
-
-    return nrows
-
-
-def validate_import_from_vanilla_pg(endpoint: Endpoint, nrows: int):
-    assert endpoint.safe_psql_many(
-        [
-            "set effective_io_concurrency=32;",
-            "SET statement_timeout='300s';",
-            "select count(*), sum(data::bigint)::bigint from t",
-        ]
-    ) == [[], [], [(nrows, nrows * (nrows + 1) // 2)]]
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2337,22 +2337,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

-    def import_status(
-        self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: int
-    ):
-        payload = {
-            "tenant_shard_id": str(tenant_shard_id),
-            "timeline_id": str(timeline_id),
-            "generation": generation,
-        }
-
-        self.request(
-            "GET",
-            f"{self.api}/upcall/v1/timeline_import_status",
-            headers=self.headers(TokenScope.GENERATIONS_API),
-            json=payload,
-        )
-
    def reconcile_all(self):
        r = self.request(
            "POST",
@@ -2829,11 +2813,6 @@ class NeonPageserver(PgProtocol, LogUtils):
        if self.running:
            self.http_client().configure_failpoints([(name, action)])

-    def clear_persistent_failpoint(self, name: str):
-        del self._persistent_failpoints[name]
-        if self.running:
-            self.http_client().configure_failpoints([(name, "off")])
-
    def timeline_dir(
        self,
        tenant_shard_id: TenantId | TenantShardId,
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -675,7 +675,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):

    def timeline_delete(
        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs
-    ) -> int:
+    ):
        """
        Note that deletion is not instant, it is scheduled and performed mostly in the background.
        So if you need to wait for it to complete use `timeline_delete_wait_completed`.
@@ -688,8 +688,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        res_json = res.json()
        assert res_json is None

-        return res.status_code
-
    def timeline_gc(
        self,
        tenant_id: TenantId | TenantShardId,
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -1,41 +1,31 @@
 from __future__ import annotations

 import enum
-import json
 import time
 from collections import Counter
 from dataclasses import dataclass
 from enum import StrEnum
-from threading import Event
 from typing import TYPE_CHECKING

 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
-from fixtures.fast_import import mock_import_bucket, populate_vanilla_pg
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    NeonPageserver,
    PgBin,
-    VanillaPostgres,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import (
-    ImportPgdataIdemptencyKey,
-)
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.utils import human_bytes, run_only_on_default_postgres, wait_until
-from werkzeug.wrappers.response import Response
+from fixtures.utils import human_bytes, wait_until

 if TYPE_CHECKING:
    from collections.abc import Iterable
    from typing import Any

    from fixtures.pageserver.http import PageserverHttpClient
-    from pytest_httpserver import HTTPServer
-    from werkzeug.wrappers.request import Request


 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -174,7 +164,6 @@ class EvictionEnv:
        min_avail_bytes,
        mock_behavior,
        eviction_order: EvictionOrder,
-        wait_logical_size: bool = True,
    ):
        """
        Starts pageserver up with mocked statvfs setup. The startup is
@@ -212,12 +201,11 @@ class EvictionEnv:
        pageserver.start()

        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
-        if wait_logical_size:
-            for tenant_id, timeline_id in self.timelines:
-                tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
-                # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
-                if tenant_ps is not None:
-                    tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
+        for tenant_id, timeline_id in self.timelines:
+            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
+            # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
+            if tenant_ps is not None:
+                tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)

        def statvfs_called():
            pageserver.assert_log_contains(".*running mocked statvfs.*")
@@ -894,121 +882,3 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
    assert total_size - post_eviction_total_size >= evict_bytes, (
        "we requested at least evict_bytes worth of free space"
    )
-
-
-@run_only_on_default_postgres(reason="PG version is irrelevant here")
-def test_import_timeline_disk_pressure_eviction(
-    neon_env_builder: NeonEnvBuilder,
-    vanilla_pg: VanillaPostgres,
-    make_httpserver: HTTPServer,
-    pg_bin: PgBin,
-):
-    """
-    TODO
-    """
-    # Set up mock control plane HTTP server to listen for import completions
-    import_completion_signaled = Event()
-
-    def handler(request: Request) -> Response:
-        log.info(f"control plane /import_complete request: {request.json}")
-        import_completion_signaled.set()
-        return Response(json.dumps({}), status=200)
-
-    cplane_mgmt_api_server = make_httpserver
-    cplane_mgmt_api_server.expect_request(
-        "/storage/api/v1/import_complete", method="PUT"
-    ).respond_with_handler(handler)
-
-    # Plug the cplane mock in
-    neon_env_builder.control_plane_hooks_api = (
-        f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
-    )
-
-    # The import will specifiy a local filesystem path mocking remote storage
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    vanilla_pg.start()
-    target_relblock_size = 1024 * 1024 * 128
-    populate_vanilla_pg(vanilla_pg, target_relblock_size)
-    vanilla_pg.stop()
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket"
-    mock_import_bucket(vanilla_pg, importbucket_path)
-
-    tenant_id = TenantId.generate()
-    timeline_id = TimelineId.generate()
-    idempotency = ImportPgdataIdemptencyKey.random()
-
-    eviction_env = EvictionEnv(
-        timelines=[(tenant_id, timeline_id)],
-        neon_env=env,
-        pageserver_http=env.pageserver.http_client(),
-        layer_size=5 * 1024 * 1024,  # Doesn't apply here
-        pg_bin=pg_bin,  # Not used here
-        pgbench_init_lsns={},  # Not used here
-    )
-
-    # Pause before delivering the final notification to storcon.
-    # This keeps the import in progress.
-    failpoint_name = "import-timeline-pre-success-notify-pausable"
-    env.pageserver.add_persistent_failpoint(failpoint_name, "pause")
-
-    env.storage_controller.tenant_create(tenant_id)
-    env.storage_controller.timeline_create(
-        tenant_id,
-        {
-            "new_timeline_id": str(timeline_id),
-            "import_pgdata": {
-                "idempotency_key": str(idempotency),
-                "location": {"LocalFs": {"path": str(importbucket_path.absolute())}},
-            },
-        },
-    )
-
-    def hit_failpoint():
-        log.info("Checking log for pattern...")
-        try:
-            assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*")
-        except Exception:
-            log.exception("Failed to find pattern in log")
-            raise
-
-    wait_until(hit_failpoint)
-    assert not import_completion_signaled.is_set()
-
-    env.pageserver.stop()
-
-    total_size, _, _ = eviction_env.timelines_du(env.pageserver)
-    blocksize = 512
-    total_blocks = (total_size + (blocksize - 1)) // blocksize
-
-    eviction_env.pageserver_start_with_disk_usage_eviction(
-        env.pageserver,
-        period="1s",
-        max_usage_pct=33,
-        min_avail_bytes=0,
-        mock_behavior={
-            "type": "Success",
-            "blocksize": blocksize,
-            "total_blocks": total_blocks,
-            # Only count layer files towards used bytes in the mock_statvfs.
-            # This avoids accounting for metadata files & tenant conf in the tests.
-            "name_filter": ".*__.*",
-        },
-        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
-        wait_logical_size=False,
-    )
-
-    wait_until(lambda: env.pageserver.assert_log_contains(".*disk usage pressure relieved"))
-
-    env.pageserver.clear_persistent_failpoint(failpoint_name)
-
-    def cplane_notified():
-        assert import_completion_signaled.is_set()
-
-    wait_until(cplane_notified)
-
-    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -12,19 +12,13 @@ import psycopg2
 import psycopg2.errors
 import pytest
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
-from fixtures.fast_import import (
-    FastImport,
-    mock_import_bucket,
-    populate_vanilla_pg,
-    validate_import_from_vanilla_pg,
-)
+from fixtures.fast_import import FastImport
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PageserverImportConfig,
    PgBin,
    PgProtocol,
-    StorageControllerApiException,
    StorageControllerMigrationConfig,
    VanillaPostgres,
 )
@@ -65,6 +59,24 @@ smoke_params = [
 ]


+def mock_import_bucket(vanilla_pg: VanillaPostgres, path: Path):
+    """
+    Mock the import S3 bucket into a local directory for a provided vanilla PG instance.
+    """
+    assert not vanilla_pg.is_running()
+
+    path.mkdir()
+    # what cplane writes before scheduling fast_import
+    specpath = path / "spec.json"
+    specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"}))
+    # what fast_import writes
+    vanilla_pg.pgdatadir.rename(path / "pgdata")
+    statusdir = path / "status"
+    statusdir.mkdir()
+    (statusdir / "pgdata").write_text(json.dumps({"done": True}))
+    (statusdir / "fast_import").write_text(json.dumps({"command": "pgdata", "done": True}))
+
+
@skip_in_debug_build("MULTIPLE_RELATION_SEGMENTS has non trivial amount of data")
@pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params)
 def test_pgdata_import_smoke(
@@ -119,6 +131,10 @@ def test_pgdata_import_smoke(
    # Put data in vanilla pg
    #

+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+
+    log.info("create relblock data")
    if rel_block_size == RelBlockSize.ONE_STRIPE_SIZE:
        target_relblock_size = stripe_size * 8192
    elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD:
@@ -129,8 +145,45 @@ def test_pgdata_import_smoke(
    else:
        raise ValueError

-    vanilla_pg.start()
-    rows_inserted = populate_vanilla_pg(vanilla_pg, target_relblock_size)
+    # fillfactor so we don't need to produce that much data
+    # 900 byte per row is > 10% => 1 row per page
+    vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""")
+
+    nrows = 0
+    while True:
+        relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')")
+        log.info(
+            f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages"
+        )
+        if relblock_size >= target_relblock_size:
+            break
+        addrows = int((target_relblock_size - relblock_size) // 8192)
+        assert addrows >= 1, "forward progress"
+        vanilla_pg.safe_psql(
+            f"insert into t select generate_series({nrows + 1}, {nrows + addrows})"
+        )
+        nrows += addrows
+    expect_nrows = nrows
+    expect_sum = (
+        (nrows) * (nrows + 1) // 2
+    )  # https://stackoverflow.com/questions/43901484/sum-of-the-integers-from-1-to-n
+
+    def validate_vanilla_equivalence(ep):
+        # TODO: would be nicer to just compare pgdump
+
+        # Enable IO concurrency for batching on large sequential scan, to avoid making
+        # this test unnecessarily onerous on CPU. Especially on debug mode, it's still
+        # pretty onerous though, so increase statement_timeout to avoid timeouts.
+        assert ep.safe_psql_many(
+            [
+                "set effective_io_concurrency=32;",
+                "SET statement_timeout='300s';",
+                "select count(*), sum(data::bigint)::bigint from t",
+            ]
+        ) == [[], [], [(expect_nrows, expect_sum)]]
+
+    validate_vanilla_equivalence(vanilla_pg)
+
    vanilla_pg.stop()

    #
@@ -221,14 +274,14 @@ def test_pgdata_import_smoke(
        config_lines=ep_config,
    )

-    validate_import_from_vanilla_pg(ro_endpoint, rows_inserted)
+    validate_vanilla_equivalence(ro_endpoint)

    # ensure the import survives restarts
    ro_endpoint.stop()
    env.pageserver.stop(immediate=True)
    env.pageserver.start()
    ro_endpoint.start()
-    validate_import_from_vanilla_pg(ro_endpoint, rows_inserted)
+    validate_vanilla_equivalence(ro_endpoint)

    #
    # validate the layer files in each shard only have the shard-specific data
@@ -268,7 +321,7 @@ def test_pgdata_import_smoke(
    child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
    child_workload.validate()

-    validate_import_from_vanilla_pg(child_workload.endpoint(), rows_inserted)
+    validate_vanilla_equivalence(child_workload.endpoint())

    # ... at the initdb lsn
    _ = env.create_branch(
@@ -283,21 +336,10 @@ def test_pgdata_import_smoke(
        tenant_id=tenant_id,
        config_lines=ep_config,
    )
-    validate_import_from_vanilla_pg(br_initdb_endpoint, rows_inserted)
+    validate_vanilla_equivalence(br_initdb_endpoint)
    with pytest.raises(psycopg2.errors.UndefinedTable):
        br_initdb_endpoint.safe_psql(f"select * from {workload.table}")

-    # The storage controller might be overly eager and attempt to finalize
-    # the import before the task got a chance to exit.
-    env.storage_controller.allowed_errors.extend(
-        [
-            ".*Call to node.*management API.*failed.*Import task still running.*",
-        ]
-    )
-
-    for ps in env.pageservers:
-        ps.allowed_errors.extend([".*Error processing HTTP request.*Import task not done yet.*"])
-

@run_only_on_default_postgres(reason="PG version is irrelevant here")
 def test_import_completion_on_restart(
@@ -381,12 +423,8 @@ def test_import_completion_on_restart(


@run_only_on_default_postgres(reason="PG version is irrelevant here")
-@pytest.mark.parametrize("action", ["restart", "delete"])
-def test_import_respects_timeline_lifecycle(
-    neon_env_builder: NeonEnvBuilder,
-    vanilla_pg: VanillaPostgres,
-    make_httpserver: HTTPServer,
-    action: str,
+def test_import_respects_tenant_shutdown(
+    neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer
 ):
    """
    Validate that importing timelines respect the usual timeline life cycle:
@@ -454,44 +492,16 @@ def test_import_respects_timeline_lifecycle(
    wait_until(hit_failpoint)
    assert not import_completion_signaled.is_set()

-    if action == "restart":
-        # Restart the pageserver while an import job is in progress.
-        # This clears the failpoint and we expect that the import starts up afresh
-        # after the restart and eventually completes.
-        env.pageserver.stop()
-        env.pageserver.start()
+    # Restart the pageserver while an import job is in progress.
+    # This clears the failpoint and we expect that the import starts up afresh
+    # after the restart and eventually completes.
+    env.pageserver.stop()
+    env.pageserver.start()

-        def cplane_notified():
-            assert import_completion_signaled.is_set()
+    def cplane_notified():
+        assert import_completion_signaled.is_set()

-        wait_until(cplane_notified)
-    elif action == "delete":
-        status = env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)
-        assert status == 200
-
-        timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-        assert not timeline_path.exists(), "Timeline dir exists after deletion"
-
-        shard_zero = TenantShardId(tenant_id, 0, 0)
-        location = env.storage_controller.inspect(shard_zero)
-        assert location is not None
-        generation = location[0]
-
-        with pytest.raises(StorageControllerApiException, match="not found"):
-            env.storage_controller.import_status(shard_zero, timeline_id, generation)
-    else:
-        raise RuntimeError(f"{action} param not recognized")
-
-    # The storage controller might be overly eager and attempt to finalize
-    # the import before the task got a chance to exit.
-    env.storage_controller.allowed_errors.extend(
-        [
-            ".*Call to node.*management API.*failed.*Import task still running.*",
-        ]
-    )
-
-    for ps in env.pageservers:
-        ps.allowed_errors.extend([".*Error processing HTTP request.*Import task not done yet.*"])
+    wait_until(cplane_notified)


@skip_in_debug_build("Validation query takes too long in debug builds")
@@ -546,8 +556,23 @@ def test_import_chaos(
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""")

-    inserted_rows = populate_vanilla_pg(vanilla_pg, TARGET_RELBOCK_SIZE)
+    nrows = 0
+    while True:
+        relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')")
+        log.info(
+            f"relblock size: {relblock_size / 8192} pages (target: {TARGET_RELBOCK_SIZE // 8192}) pages"
+        )
+        if relblock_size >= TARGET_RELBOCK_SIZE:
+            break
+        addrows = int((TARGET_RELBOCK_SIZE - relblock_size) // 8192)
+        assert addrows >= 1, "forward progress"
+        vanilla_pg.safe_psql(
+            f"insert into t select generate_series({nrows + 1}, {nrows + addrows})"
+        )
+        nrows += addrows

    vanilla_pg.stop()

@@ -715,7 +740,13 @@ def test_import_chaos(
    endpoint = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id)

    # Validate the imported data is legit
-    validate_import_from_vanilla_pg(endpoint, inserted_rows)
+    assert endpoint.safe_psql_many(
+        [
+            "set effective_io_concurrency=32;",
+            "SET statement_timeout='300s';",
+            "select count(*), sum(data::bigint)::bigint from t",
+        ]
+    ) == [[], [], [(nrows, nrows * (nrows + 1) // 2)]]

    endpoint.stop()

--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -124,9 +124,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
                ".*downloading failed, possibly for shutdown",
                # {tenant_id=... timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active.  Current state: Stopping\n'
                ".*page_service.*will not become active.*",
-                # the following errors are possible when pageserver tries to ingest wal records despite being in unreadable state
-                ".*wal_connection_manager.*layer file download failed: No file found.*",
-                ".*wal_connection_manager.*could not ingest record.*",
            ]
        )

@@ -159,45 +156,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
        env.pageservers[2].id: ("Detached", None),
    }

-    # Track all the attached locations with mode and generation
-    history: list[tuple[int, str, int | None]] = []
-
-    def may_read(pageserver: NeonPageserver, mode: str, generation: int | None) -> bool:
-        # Rules for when a pageserver may read:
-        # - our generation is higher than any previous
-        # - our generation is equal to previous, but no other pageserver
-        #   in that generation has been AttachedSingle (i.e. allowed to compact/GC)
-        # - our generation is equal to previous, and the previous holder of this
-        #   generation was the same node as we're attaching now.
-        #
-        # If these conditions are not met, then a read _might_ work, but the pageserver might
-        # also hit errors trying to download layers.
-        highest_historic_generation = max([i[2] for i in history if i[2] is not None], default=None)
-
-        if generation is None:
-            # We're not in an attached state, we may not read
-            return False
-        elif highest_historic_generation is not None and generation < highest_historic_generation:
-            # We are in an outdated generation, we may not read
-            return False
-        elif highest_historic_generation is not None and generation == highest_historic_generation:
-            # We are re-using a generation: if any pageserver other than this one
-            # has held AttachedSingle mode, this node may not read (because some other
-            # node may be doing GC/compaction).
-            if any(
-                i[1] == "AttachedSingle"
-                and i[2] == highest_historic_generation
-                and i[0] != pageserver.id
-                for i in history
-            ):
-                log.info(
-                    f"Skipping read on {pageserver.id} because other pageserver has been in AttachedSingle mode in generation {highest_historic_generation}"
-                )
-                return False
-
-        # Fall through: we have passed conditions for readability
-        return True
-
    latest_attached = env.pageservers[0].id

    for _i in range(0, 64):
@@ -241,10 +199,9 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
                assert len(tenants) == 1
                assert tenants[0]["generation"] == new_generation

-                if may_read(pageserver, last_state_ps[0], last_state_ps[1]):
-                    log.info("Entering postgres...")
-                    workload.churn_rows(rng.randint(128, 256), pageserver.id)
-                    workload.validate(pageserver.id)
+                log.info("Entering postgres...")
+                workload.churn_rows(rng.randint(128, 256), pageserver.id)
+                workload.validate(pageserver.id)
            elif last_state_ps[0].startswith("Attached"):
                # The `storage_controller` will only re-attach on startup when a pageserver was the
                # holder of the latest generation: otherwise the pageserver will revert to detached
@@ -284,16 +241,18 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
            location_conf["generation"] = generation

            pageserver.tenant_location_configure(tenant_id, location_conf)
-
            last_state[pageserver.id] = (mode, generation)

-            may_read_this_generation = may_read(pageserver, mode, generation)
-            history.append((pageserver.id, mode, generation))
+            # It's only valid to connect to the last generation. Newer generations may yank layer
+            # files used in older generations.
+            last_generation = max(
+                [s[1] for s in last_state.values() if s[1] is not None], default=None
+            )

-            # This is a basic test: we are validating that he endpoint works properly _between_
-            # configuration changes.  A stronger test would be to validate that clients see
-            # no errors while we are making the changes.
-            if may_read_this_generation:
+            if mode.startswith("Attached") and generation == last_generation:
+                # This is a basic test: we are validating that the endpoint works properly _between_
+                # configuration changes.  A stronger test would be to validate that clients see
+                # no errors while we are making the changes.
                workload.churn_rows(
                    rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
                )
@@ -306,16 +265,9 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] > 0

-    # Attach all pageservers, in a higher generation than any previous.  We will use the same
-    # gen for all, and AttachedMulti mode so that they do not interfere with one another.
-    generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageservers[0].id)
+    # Attach all pageservers
    for ps in env.pageservers:
-        location_conf = {
-            "mode": "AttachedMulti",
-            "secondary_conf": None,
-            "tenant_conf": {},
-            "generation": generation,
-        }
+        location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
        ps.tenant_location_configure(tenant_id, location_conf)

    # Confirm that all are readable
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1388,10 +1388,12 @@ def test_sharding_split_failures(
    with pytest.raises(failure.expect_exception()):
        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)

-    def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
+    # We expect that the overall operation will fail, but some split requests
+    # will have succeeded: the net result should be to return to a clean state, including
+    # detaching any child shards.
+    def assert_rolled_back(exclude_ps_id=None) -> None:
        secondary_count = 0
        attached_count = 0
-        log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue
@@ -1402,23 +1404,35 @@ def test_sharding_split_failures(
                if tenant_shard_id.tenant_id != tenant_id:
                    continue  # skip bystanders
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
-                assert tenant_shard_id.shard_count == shard_count
+                assert tenant_shard_id.shard_count == initial_shard_count
                if loc[1]["mode"] == "Secondary":
                    secondary_count += 1
                else:
                    attached_count += 1

-        assert secondary_count == shard_count
-        assert attached_count == shard_count
-
-    # We expect that the overall operation will fail, but some split requests
-    # will have succeeded: the net result should be to return to a clean state, including
-    # detaching any child shards.
-    def assert_rolled_back(exclude_ps_id: int | None = None) -> None:
-        assert_shard_count(initial_shard_count, exclude_ps_id)
+        assert secondary_count == initial_shard_count
+        assert attached_count == initial_shard_count

    def assert_split_done(exclude_ps_id: int | None = None) -> None:
-        assert_shard_count(split_shard_count, exclude_ps_id)
+        secondary_count = 0
+        attached_count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                if tenant_shard_id.tenant_id != tenant_id:
+                    continue  # skip bystanders
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
+                assert tenant_shard_id.shard_count == split_shard_count
+                if loc[1]["mode"] == "Secondary":
+                    secondary_count += 1
+                else:
+                    attached_count += 1
+        assert attached_count == split_shard_count
+        assert secondary_count == split_shard_count

    def finish_split():
        # Having failed+rolled back, we should be able to split again
@@ -1454,7 +1468,6 @@ def test_sharding_split_failures(

        # The split should appear to be rolled back from the point of view of all pageservers
        # apart from the one that is offline
-        env.storage_controller.reconcile_until_idle()
        wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))

        finish_split()
@@ -1469,7 +1482,6 @@ def test_sharding_split_failures(
        log.info("Clearing failure...")
        failure.clear(env)

-        env.storage_controller.reconcile_until_idle()
        wait_until(assert_rolled_back)

        # Having rolled back, the tenant should be working
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4192,10 +4192,10 @@ def test_storcon_create_delete_sk_down(
    # ensure the safekeeper deleted the timeline
    def timeline_deleted_on_active_sks():
        env.safekeepers[0].assert_log_contains(
-            f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
+            f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
        )
        env.safekeepers[2].assert_log_contains(
-            f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
+            f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
        )

    wait_until(timeline_deleted_on_active_sks)
@@ -4210,7 +4210,7 @@ def test_storcon_create_delete_sk_down(
    # ensure that there is log msgs for the third safekeeper too
    def timeline_deleted_on_sk():
        env.safekeepers[1].assert_log_contains(
-            f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
+            f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
        )

    wait_until(timeline_deleted_on_sk)
Author	SHA1	Message	Date
Erik Grinaker	90e3313dbb	wip	2025-06-01 13:33:58 +02:00
Erik Grinaker	232591e457	Fix test build	2025-05-29 12:00:40 +02:00
Erik Grinaker	8daf272561	pageserver: initial gRPC page service implementation	2025-05-28 18:10:29 +02:00