Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517

pageserver: use bounded sender for basebackup cache (#12342 )
## Problem Basebackup cache now uses an unbounded channel for prepare requests. In theory it can grow large if the cache is hung and does not process the requests. - Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Replace the unbounded channel with a bounded one; the size is configurable. - Add `pageserver_basebackup_cache_prepare_queue_size` to observe the size of the queue. - Refactor a bit to move all metrics logic to `basebackup_cache.rs`
2026-05-27 10:00:38 +00:00 · 2025-06-26 07:32:08 -07:00 · 2025-06-26 13:26:24 +00:00 · 2025-06-26 11:25:41 +00:00 · 2025-06-26 07:06:27 +00:00 · 2025-06-25 22:15:03 +00:00
123 changed files with 2157 additions and 981 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1318,6 +1318,7 @@ dependencies = [
 "p256 0.13.2",
 "postgres",
 "postgres_initdb",
+ "postgres_versioninfo",
 "regex",
 "remote_storage",
 "reqwest",
@@ -4406,6 +4407,7 @@ dependencies = [
 "once_cell",
 "postgres_backend",
 "postgres_ffi_types",
+ "postgres_versioninfo",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -4429,6 +4431,7 @@ dependencies = [
 "futures",
 "http-utils",
 "pageserver_api",
+ "postgres_versioninfo",
 "reqwest",
 "serde",
 "thiserror 1.0.69",
@@ -4897,6 +4900,7 @@ dependencies = [
 "once_cell",
 "postgres",
 "postgres_ffi_types",
+ "postgres_versioninfo",
 "pprof",
 "regex",
 "serde",
@@ -4919,11 +4923,23 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "camino",
+ "postgres_versioninfo",
 "thiserror 1.0.69",
 "tokio",
 "workspace_hack",
 ]

+[[package]]
+name = "postgres_versioninfo"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "serde",
+ "serde_repr",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "posthog_client_lite"
 version = "0.1.0"
@@ -6115,6 +6131,7 @@ dependencies = [
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
+ "postgres_versioninfo",
 "pprof",
 "pq_proto",
 "rand 0.8.5",
@@ -6159,6 +6176,7 @@ dependencies = [
 "const_format",
 "pageserver_api",
 "postgres_ffi",
+ "postgres_versioninfo",
 "pq_proto",
 "serde",
 "serde_json",
@@ -6481,6 +6499,17 @@ dependencies = [
 "thiserror 1.0.69",
 ]

+[[package]]
+name = "serde_repr"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "serde_spanned"
 version = "0.6.6"
@@ -6786,6 +6815,7 @@ dependencies = [
 "hex",
 "http-utils",
 "humantime",
+ "humantime-serde",
 "hyper 0.14.30",
 "itertools 0.10.5",
 "json-structural-diff",
@@ -6796,6 +6826,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
+ "posthog_client_lite",
 "rand 0.8.5",
 "regex",
 "reqwest",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
    "libs/pageserver_api",
    "libs/postgres_ffi",
    "libs/postgres_ffi_types",
+    "libs/postgres_versioninfo",
    "libs/safekeeper_api",
    "libs/desim",
    "libs/neon-shmem",
@@ -174,6 +175,7 @@ serde_json = "1"
 serde_path_to_error = "0.1"
 serde_with = { version = "3", features = [ "base64" ] }
 serde_assert = "0.5.0"
+serde_repr = "0.1.20"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -261,6 +263,7 @@ postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" }
+postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" }
 postgres_initdb = { path = "./libs/postgres_initdb" }
 posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
--- a/2
+++ b/2
@@ -159,8 +159,6 @@ postgres-%: postgres-configure-% \
 		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
 	+@echo "Compiling PostgreSQL $*"
 	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
-	+@echo "Compiling libpq $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install
 	+@echo "Compiling pg_prewarm $*"
 	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache $*"
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -22,7 +22,7 @@ sql_exporter.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector \
 		--tla-str collector_file=neon_collector.yml \
-		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter&pgaudit.log=none' \
 		etc/sql_exporter.jsonnet

 sql_exporter_autoscaling.yml: $(jsonnet_files)
@@ -30,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector_autoscaling \
 		--tla-str collector_file=neon_collector_autoscaling.yml \
-		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling&pgaudit.log=none' \
 		etc/sql_exporter.jsonnet

 .PHONY: clean
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -171,9 +171,6 @@ RUN cd postgres && \
    eval $CONFIGURE_CMD && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
    # Enable some of contrib extensions
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
@@ -1568,20 +1565,20 @@ ARG PG_VERSION
 WORKDIR /ext-src
 RUN case "${PG_VERSION}" in \
    "v14") \
-    export PGAUDIT_VERSION=1.6.2 \
-    export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
+    export PGAUDIT_VERSION=1.6.3 \
+    export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \
    ;; \
    "v15") \
-    export PGAUDIT_VERSION=1.7.0 \
-    export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
+    export PGAUDIT_VERSION=1.7.1 \
+    export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \
    ;; \
    "v16") \
-    export PGAUDIT_VERSION=16.0 \
-    export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
+    export PGAUDIT_VERSION=16.1 \
+    export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \
    ;; \
    "v17") \
-    export PGAUDIT_VERSION=17.0 \
-    export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
+    export PGAUDIT_VERSION=17.1 \
+    export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \
    ;; \
    *) \
    echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: pgbouncer-exporter
    user: postgres
    sysvInitAction: respawn
@@ -59,7 +59,7 @@ files:
      # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
      # resolve host" log messages that they generate.
      Defaults !fqdn
-      
+
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: pgbouncer-exporter
    user: postgres
    sysvInitAction: respawn
@@ -59,7 +59,7 @@ files:
      # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
      # resolve host" log messages that they generate.
      Defaults !fqdn
-      
+
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -64,6 +64,7 @@ uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true

+postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -29,7 +29,7 @@ use anyhow::{Context, bail};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
-use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version};
+use compute_tools::extension_server::get_pg_version;
 use nix::unistd::Pid;
 use std::ops::Not;
 use tracing::{Instrument, error, info, info_span, warn};
@@ -179,12 +179,8 @@ impl PostgresProcess {
            .await
            .context("create pgdata directory")?;

-        let pg_version = match get_pg_version(self.pgbin.as_ref()) {
-            PostgresMajorVersion::V14 => 14,
-            PostgresMajorVersion::V15 => 15,
-            PostgresMajorVersion::V16 => 16,
-            PostgresMajorVersion::V17 => 17,
-        };
+        let pg_version = get_pg_version(self.pgbin.as_ref());
+
        postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
            superuser: initdb_user,
            locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -405,7 +405,7 @@ impl ComputeNode {
        // that can affect `compute_ctl` and prevent it from properly configuring the database schema.
        // Unset them via connection string options before connecting to the database.
        // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
-        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
+        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
        let options = match conn_conf.get_options() {
            // Allow the control plane to override any options set by the
            // compute
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,9 +74,11 @@ More specifically, here is an example ext_index.json
 use std::path::Path;
 use std::str;

+use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
 use anyhow::{Context, Result, bail};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
+use postgres_versioninfo::PgMajorVersion;
 use regex::Regex;
 use remote_storage::*;
 use reqwest::StatusCode;
@@ -86,8 +88,6 @@ use tracing::log::warn;
 use url::Url;
 use zstd::stream::read::Decoder;

-use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
-
 fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // gives the result of `pg_config [argument]`
    // where argument is a flag like `--version` or `--sharedir`
@@ -106,7 +106,7 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
        .to_string()
 }

-pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
+pub fn get_pg_version(pgbin: &str) -> PgMajorVersion {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
@@ -114,25 +114,11 @@ pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
 }

 pub fn get_pg_version_string(pgbin: &str) -> String {
-    match get_pg_version(pgbin) {
-        PostgresMajorVersion::V14 => "v14",
-        PostgresMajorVersion::V15 => "v15",
-        PostgresMajorVersion::V16 => "v16",
-        PostgresMajorVersion::V17 => "v17",
-    }
-    .to_owned()
+    get_pg_version(pgbin).v_str()
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum PostgresMajorVersion {
-    V14,
-    V15,
-    V16,
-    V17,
-}
-
-fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
-    use PostgresMajorVersion::*;
+fn parse_pg_version(human_version: &str) -> PgMajorVersion {
+    use PgMajorVersion::*;
    // Normal releases have version strings like "PostgreSQL 15.4". But there
    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
@@ -143,10 +129,10 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
        .captures(human_version)
    {
        Some(captures) if captures.len() == 2 => match &captures["major"] {
-            "14" => return V14,
-            "15" => return V15,
-            "16" => return V16,
-            "17" => return V17,
+            "14" => return PG14,
+            "15" => return PG15,
+            "16" => return PG16,
+            "17" => return PG17,
            _ => {}
        },
        _ => {}
@@ -343,25 +329,25 @@ mod tests {

    #[test]
    fn test_parse_pg_version() {
-        use super::PostgresMajorVersion::*;
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15);
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15);
+        use postgres_versioninfo::PgMajorVersion::*;
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15);
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15);
        assert_eq!(
            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            V15
+            PG15
        );

-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14);
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14);
        assert_eq!(
            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            V14
+            PG14
        );

-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16);
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16);
    }

    #[test]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -48,7 +48,7 @@ use postgres_connection::parse_host_port;
 use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
-    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId,
 };
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use tokio::task::JoinSet;
@@ -64,7 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: u32 = 17;
+const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -169,7 +169,7 @@ struct TenantCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version to use for the initial timeline")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,

    #[clap(
        long,
@@ -292,7 +292,7 @@ struct TimelineCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 }

 #[derive(clap::Args)]
@@ -324,7 +324,7 @@ struct TimelineImportCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version of the backup being imported")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 }

 #[derive(clap::Subcommand)]
@@ -603,7 +603,7 @@ struct EndpointCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,

    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
    ///
@@ -1295,7 +1295,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
                    },
                    new_members: None,
                };
-                let pg_version = args.pg_version * 10000;
+                let pg_version = PgVersionId::from(args.pg_version);
                let req = safekeeper_api::models::TimelineCreateRequest {
                    tenant_id,
                    timeline_id,
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -67,6 +67,7 @@ use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use pem::Pem;
 use reqwest::header::CONTENT_TYPE;
+use safekeeper_api::PgMajorVersion;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
@@ -89,7 +90,7 @@ pub struct EndpointConf {
    pg_port: u16,
    external_http_port: u16,
    internal_http_port: u16,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
    grpc: bool,
    skip_pg_catalog_updates: bool,
    reconfigure_concurrency: usize,
@@ -192,7 +193,7 @@ impl ComputeControlPlane {
        pg_port: Option<u16>,
        external_http_port: Option<u16>,
        internal_http_port: Option<u16>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        mode: ComputeMode,
        grpc: bool,
        skip_pg_catalog_updates: bool,
@@ -312,7 +313,7 @@ pub struct Endpoint {
    pub internal_http_address: SocketAddr,

    // postgres major version in the format: 14, 15, etc.
-    pg_version: u32,
+    pg_version: PgMajorVersion,

    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
@@ -557,7 +558,7 @@ impl Endpoint {
                conf.append("hot_standby", "on");
                // prefetching of blocks referenced in WAL doesn't make sense for us
                // Neon hot standby ignores pages that are not in the shared_buffers
-                if self.pg_version >= 15 {
+                if self.pg_version >= PgMajorVersion::PG15 {
                    conf.append("recovery_prefetch", "off");
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -12,9 +12,11 @@ use std::{env, fs};

 use anyhow::{Context, bail};
 use clap::ValueEnum;
+use pageserver_api::config::PostHogConfig;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Url};
+use safekeeper_api::PgMajorVersion;
 use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
@@ -212,6 +214,8 @@ pub struct NeonStorageControllerConf {

    pub timeline_safekeeper_count: Option<i64>,

+    pub posthog_config: Option<PostHogConfig>,
+
    pub kick_secondary_downloads: Option<bool>,
 }

@@ -244,6 +248,7 @@ impl Default for NeonStorageControllerConf {
            use_https_safekeeper_api: false,
            use_local_compute_notifications: true,
            timeline_safekeeper_count: None,
+            posthog_config: None,
            kick_secondary_downloads: None,
        }
    }
@@ -424,25 +429,21 @@ impl LocalEnv {
        self.pg_distrib_dir.clone()
    }

-    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
-        match pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(path.join(pg_version.v_str()))
    }

-    pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
+    pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result<PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "bin")
    }

-    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "lib")
    }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -22,6 +22,7 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{PgConnectionConfig, parse_host_port};
+use safekeeper_api::PgMajorVersion;
 use utils::auth::{Claims, Scope};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -142,6 +143,8 @@ impl PageServerNode {
            overrides.push(format!("ssl_ca_file='{}'", ssl_ca_file.to_str().unwrap()));
        }

+        overrides.push("dev_mode=true".to_owned());
+
        // Apply the user-provided overrides
        overrides.push({
            let mut doc =
@@ -607,7 +610,7 @@ impl PageServerNode {
        timeline_id: TimelineId,
        base: (Lsn, PathBuf),
        pg_wal: Option<(Lsn, PathBuf)>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<()> {
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -161,6 +161,7 @@ impl SafekeeperNode {
            listen_http,
            "--availability-zone".to_owned(),
            availability_zone,
+            "--dev".to_owned(),
        ];
        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -6,6 +6,8 @@ use std::str::FromStr;
 use std::sync::OnceLock;
 use std::time::{Duration, Instant};

+use crate::background_process;
+use crate::local_env::{LocalEnv, NeonStorageControllerConf};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper0::Uri;
 use nix::unistd::Pid;
@@ -22,6 +24,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Method, Response};
+use safekeeper_api::PgMajorVersion;
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use tokio::process::Command;
@@ -31,9 +34,6 @@ use utils::auth::{Claims, Scope, encode_from_key_file};
 use utils::id::{NodeId, TenantId};
 use whoami::username;

-use crate::background_process;
-use crate::local_env::{LocalEnv, NeonStorageControllerConf};
-
 pub struct StorageController {
    env: LocalEnv,
    private_key: Option<Pem>,
@@ -48,7 +48,7 @@ pub struct StorageController {

 const COMMAND: &str = "storage_controller";

-const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: PgMajorVersion = PgMajorVersion::PG16;

 const DB_NAME: &str = "storage_controller";

@@ -184,9 +184,15 @@ impl StorageController {
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
+        const PREFER_VERSIONS: [PgMajorVersion; 5] = [
+            STORAGE_CONTROLLER_POSTGRES_VERSION,
+            PgMajorVersion::PG16,
+            PgMajorVersion::PG15,
+            PgMajorVersion::PG14,
+            PgMajorVersion::PG17,
+        ];

-        for v in prefer_versions {
+        for v in PREFER_VERSIONS {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
            if tokio::fs::try_exists(&path).await? {
                return Ok(path);
@@ -636,6 +642,18 @@ impl StorageController {
            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
        }

+        let mut envs = vec![
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+
+        if let Some(posthog_config) = &self.config.posthog_config {
+            envs.push((
+                "POSTHOG_CONFIG".to_string(),
+                serde_json::to_string(posthog_config)?,
+            ));
+        }
+
        println!("Starting storage controller");

        background_process::start_process(
@@ -643,10 +661,7 @@ impl StorageController {
            &instance_dir,
            &self.env.storage_controller_bin(),
            args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
+            envs,
            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
            &start_args.start_timeout,
            || async {
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -65,6 +65,7 @@ services:
                    --id=$$SAFEKEEPER_ID
                    --broker-endpoint=$$BROKER_ENDPOINT
                    -D /data
+                    --dev
                    --remote-storage=\"{endpoint='http://minio:9000',
                                        bucket_name='neon',
                                        bucket_region='eu-north-1',
@@ -95,6 +96,7 @@ services:
                    --id=$$SAFEKEEPER_ID
                    --broker-endpoint=$$BROKER_ENDPOINT
                    -D /data
+                    --dev
                    --remote-storage=\"{endpoint='http://minio:9000',
                                        bucket_name='neon',
                                        bucket_region='eu-north-1',
@@ -125,6 +127,7 @@ services:
                    --id=$$SAFEKEEPER_ID
                    --broker-endpoint=$$BROKER_ENDPOINT
                    -D /data
+                    --dev
                    --remote-storage=\"{endpoint='http://minio:9000',
                                        bucket_name='neon',
                                        bucket_region='eu-north-1',
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -6,3 +6,4 @@ remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region
 control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
 control_plane_emergency_mode=true
 virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks
+dev_mode=true
--- a/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
+++ b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
@@ -0,0 +1,396 @@
+# Memo: Endpoint Persistent Unlogged Files Storage
+Created on 2024-11-05
+Implemented on N/A
+
+## Summary
+A design for a storage system that allows storage of files required to make
+Neon's Endpoints have a better experience at or after a reboot.
+
+## Motivation
+Several systems inside PostgreSQL (and Neon) need some persistent storage for
+optimal workings across reboots and restarts, but still work without.
+Examples are the query-level statistics files of `pg_stat_statements` in
+`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`.
+We need a storage system that can store and manage these files for each
+Endpoint, without necessarily granting users access to an unlimited storage
+device.
+
+## Goals
+- Store known files for Endpoints with reasonable persistence.  
+  _Data loss in this service, while annoying and bad for UX, won't lose any
+  customer's data._
+
+## Non Goals (if relevant)
+- This storage system does not need branching, file versioning, or other such
+  features. The files are as ephemeral to the timeline of the data as the
+  Endpoints that host the data.
+- This storage system does not need to store _all_ user files, only 'known'
+  user files.
+- This storage system does not need to be hosted fully inside Computes.  
+  _Instead, this will be a separate component similar to Pageserver,
+  SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._
+
+## Impacted components
+- Compute needs new code to load and store these files in its lifetime.
+- Control Plane needs to consider this new storage system when signalling
+  the deletion of an Endpoint, Timeline, or Tenant.
+- Control Plane needs to consider this new storage system when it resets
+  or re-assigns an endpoint's timeline/branch state.
+
+A new service is created: the Endpoint Persistent Unlogged Files Storage
+service.  This could be integrated in e.g. Pageserver or Control Plane, or be
+hosted as a separate service.
+
+## Proposed implementation
+Endpoint-related data files are managed by a newly designed service (which
+optionally is integrated in an existing service like Pageserver or Control
+Plane), which stores data directly into S3 or any blob storage of choice.
+
+Upon deletion of the Endpoint, or reassignment of the endpoint to a different
+branch, this ephemeral data is dropped: the data stored may not match the
+state of the branch's data after reassignment, and on endpoint deletion the
+data won't have any use to the user.
+
+Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims)
+which it can use to authenticate to this new service and retrieve and store
+data associated with this endpoint.  This limited scope reduces leaks of data
+across endpoints and timeline resets, and limits the ability of endpoints to
+mess with other endpoints' data.
+
+The path of this endpoint data in S3 is initially as follows:
+
+    s3://<regional-epufs-bucket>/
+      tenants/
+        <hex-tenant-id>/
+          timelines/
+            <hex-timeline-id>/
+              endpoints/
+                <endpoint-id>/
+                  pgdata/
+                    <file_path_in_pgdatadir>
+
+For other blob storages an equivalent or similar path can be constructed.
+
+### Reliability, failure modes and corner cases (if relevant)
+Reliability is important, but not critical to the workings of Neon.  The data
+stored in this service will, when lost, reduce performance, but won't be a
+cause of permanent data loss - only operational metadata is stored.
+
+Most, if not all, blob storage services have sufficiently high persistence
+guarantees to cater to our need for persistence and uptime. The only concern with
+blob storages is that the access latency is generally higher than local disk,
+but for the object types stored (cache state, ...) I don't think this will be
+much of an issue.
+
+### Interaction/Sequence diagram (if relevant)
+
+In these diagrams you can replace S3 with any persistent storage device of
+choice, but S3 is chosen as representative name: The well-known and short name
+of AWS' blob storage. Azure Blob Storage should work too, but it has a much
+longer name making it less practical for the diagrams.
+
+Write data:
+
+```http
+POST /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "version": "<opaque>", # opaque file version token, changes when the file contents change
+  "size": <bytes>,
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co-->ep: Connect with credentials
+    co->>+ep: Store Unlogged Persistent File
+    opt is authenticated
+        ep->>s3: Write UPF to S3
+    end
+    ep->>-co: OK / Failure / Auth Failure
+    co-->ep: Cancel connection
+```
+
+Read data: (optionally with cache-relevant request parameters, e.g. If-Modified-Since)
+```http
+GET /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+
+<file data>
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co->>+ep: Read Unlogged Persistent File
+    opt is authenticated
+        ep->>+s3: Request UPF from storage
+        s3->>-ep: Receive UPF from storage
+    end
+    ep->>-co: OK(response) / Failure(storage, auth, ...)
+```
+
+Compute Startup:
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ps as Pageserver
+    participant ep as EPUFS
+    participant es as Extension server
+
+    note over co: Bind endpoint ep-xxx
+    par Get basebackup
+        co->>+ps: Request basebackup @ LSN
+        ps-)ps: Construct basebackup
+        ps->>-co: Receive basebackup TAR @ LSN
+    and Get startup-critical Unlogged Persistent Files
+        co->>+ep: Get all UPFs of endpoint ep-xxx
+        ep-)ep: Retrieve and gather all UPFs
+        ep->>-co: TAR of UPFs
+    and Get startup-critical extensions
+        loop For every startup-critical extension
+            co->>es: Get critical extension
+            es->>co: Receive critical extension
+        end
+    end
+    note over co: Start compute
+```
+
+CPlane ops:
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>/endpoints/<endpoint-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "endpoint": "<endpoint-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant cp as Control Plane
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    alt Tenant deleted
+        cp-)ep: Tenant deleted
+        loop For every object associated with removed tenant
+            ep->>s3: Remove data of deleted tenant from Storage
+        end
+        opt
+            ep-)cp: Tenant cleanup complete
+        end
+    else Timeline deleted
+        cp-)ep: Timeline deleted
+        loop For every object associated with removed timeline
+            ep->>s3: Remove data of deleted timeline from Storage
+        end
+        opt
+            ep-)cp: Timeline cleanup complete
+        end
+    else Endpoint reassigned or removed
+        cp->>+ep: Endpoint reassigned
+        loop For every object associated with reassigned/removed endpoint
+            ep->>s3: Remove data from Storage
+        end
+        ep->>-cp: Cleanup complete
+    end
+```
+
+### Scalability (if relevant)
+
+Provisionally:  As this service is going to be part of compute startup, this
+service should be able to quickly respond to all requests.  Therefore this
+service is deployed to every AZ we host Computes in, and Computes communicate
+(generally) only to the EPUFS endpoint of the AZ they're hosted in.
+
+Local caching of frequently restarted endpoints' data or metadata may be
+needed for best performance.  However, due to the regional nature of stored
+data but zonal nature of the service deployment, we should be careful when we
+implement any local caching, as it is possible that computes in AZ 1 will
+update data originally written and thus cached by AZ 2.  Cache version tests
+and invalidation are therefore required if we want to roll out caching to this
+service, which is too broad a scope for an MVP.  This is why caching is left
+out of scope for this RFC, and should be considered separately after this RFC
+is implemented.
+
+### Security implications (if relevant)
+This service must be able to authenticate users at least by Tenant ID,
+Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of
+Compute, which will be upgraded to the extent needed to support Timeline- and
+Endpoint-based claims.
+
+The service requires unlimited access to (a prefix of) a blob storage bucket,
+and thus must be hosted outside the Compute VM sandbox.
+
+A service that generates pre-signed request URLs for Compute to download the
+data from that URL is likely problematic, too:  Compute would be able to write
+unlimited data to the bucket, or exfiltrate this signed URL to get read/write
+access to specific objects in this bucket, which would still effectively give
+users access to the S3 bucket (but with improved access logging).
+
+There may be a use case for transferring data associated with one endpoint to
+another endpoint (e.g. to make one endpoint warm its caches with the state of
+another endpoint), but that's not currently in scope, and specific needs may
+be solved through out-of-line communication of data or pre-signed URLs.
+
+### Unresolved questions (if relevant)
+Caching of files is not in the implementation scope of the document, but
+should at some future point be considered to maximize performance.
+
+## Alternative implementation (if relevant)
+Several ideas have come up to solve this issue:
+
+### Use AUXfile
+One prevalent idea was to WAL-log the files using our AUXfile mechanism.
+
+Benefits:
+
++ We already have this storage mechanism
+
+Demerits:
+
+- It isn't available on read replicas
+- Additional WAL will be consumed during shutdown and after the shutdown
+  checkpoint, which needs PG modifications to work without panics.
+- It increases the data we need to manage in our versioned storage, thus
+  causing higher storage costs with higher retention due to duplication at
+  the storage layer.
+
+### Sign URLs for read/write operations, instead of proxying them
+
+Benefits:
+
++ The service can be implemented with a much reduced IO budget
+
+Demerits:
+
+- Users could get access to these signed credentials
+- Not all blob storage services may implement URL signing
+
+### Give endpoints each their own directly accessed block volume
+
+Benefits:
+
++ Easier to integrate for PostgreSQL
+
+Demerits:
+
+- Little control on data size and contents
+- Potentially problematic as we'd need to store data all across the pgdata
+  directory.
+- EBS is not a good candidate
+   - Attaches in 10s of seconds, if not more; i.e. too cold to start
+   - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint
+     with users of the same EBS volumes, which can't work with VM migration
+   - EBS storage costs are very high (>80$/kilotenant when using a
+     volume/tenant)
+   - EBS volumes can't be mounted across AZ boundaries
+- Bucket per endpoint is unfeasible
+   - S3 buckets are priced at $20/month per 1k, which we could better spend
+     on developers.
+   - Allocating service accounts takes time (100s of ms), and service accounts
+     are a limited resource, too; so they're not a good candidate to allocate
+     on a per-endpoint basis.
+   - Giving credentials limited to prefix has similar issues as the pre-signed
+     URL approach.
+   - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup
+     much more than our current systems would.
+- Volumes bound by hypervisor are unlikely
+   - This requires significant investment and increased software on the
+     hypervisor.
+   - It is unclear if we can attach volumes after boot, i.e. for pooled
+     instances.
+
+### Put the files into a table
+
+Benefits:
+
+ + Mostly already available in PostgreSQL
+
+Demerits:
+
+ - Uses WAL
+   - Can't be used after shutdown checkpoint
+   - Needs a RW endpoint, and table & catalog access to write to this data
+ - Gets hit with DB size limitations
+ - Depending on user access:
+   - Inaccessible:  
+     The user doesn't have control over database size caused by
+     these systems.
+   - Accessible:  
+     The user can corrupt these files and cause the system to crash while
+     user-corrupted files are present, thus increasing on-call overhead.
+
+## Definition of Done (if relevant)
+
+This project is done if we have:
+
+- One S3 bucket equivalent per region, which stores this per-endpoint data.
+- A new service endpoint in at least every AZ, which indirectly grants
+  endpoints access to the data stored for these endpoints in these buckets.
+- Compute writes & reads temp-data at shutdown and startup, respectively, for
+  at least the pg_prewarm or lfc_prewarm state files.
+- Cleanup of endpoint data is triggered when the endpoint is deleted or is
+  detached from its current timeline.
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -18,6 +18,7 @@ bytes.workspace = true
 byteorder.workspace = true
 utils.workspace = true
 postgres_ffi_types.workspace = true
+postgres_versioninfo.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -63,7 +63,8 @@ impl Display for NodeMetadata {
    }
 }

-/// PostHog integration config.
+/// PostHog integration config. This is used in pageserver, storcon, and neon_local.
+/// Ensure backward compatibility when adding new fields.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PostHogConfig {
    /// PostHog project ID
@@ -76,7 +77,10 @@ pub struct PostHogConfig {
    pub private_api_url: String,
    /// Public API URL
    pub public_api_url: String,
-    /// Refresh interval for the feature flag spec
+    /// Refresh interval for the feature flag spec.
+    /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive
+    /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API.
+    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub refresh_interval: Option<Duration>,
@@ -367,6 +371,9 @@ pub struct BasebackupCacheConfig {
    // TODO(diko): support max_entry_size_bytes.
    // pub max_entry_size_bytes: u64,
    pub max_size_entries: usize,
+    /// Size of the channel used to send prepare requests to the basebackup cache worker.
+    /// If exceeded, new prepare requests will be dropped.
+    pub prepare_channel_size: usize,
 }

 impl Default for BasebackupCacheConfig {
@@ -376,6 +383,7 @@ impl Default for BasebackupCacheConfig {
            max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
            // max_entry_size_bytes: 16 * 1024 * 1024,   // 16 MiB
            max_size_entries: 1000,
+            prepare_channel_size: 100,
        }
    }
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -11,6 +11,7 @@ use std::time::{Duration, SystemTime};

 #[cfg(feature = "testing")]
 use camino::Utf8PathBuf;
+use postgres_versioninfo::PgMajorVersion;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 pub use utilization::PageserverUtilization;
@@ -398,7 +399,7 @@ pub enum TimelineCreateRequestMode {
        // inherits the ancestor's pg_version. Earlier code wasn't
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
-        pg_version: Option<u32>,
+        pg_version: Option<PgMajorVersion>,
        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
        read_only: bool,
    },
@@ -410,7 +411,7 @@ pub enum TimelineCreateRequestMode {
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
+        pg_version: Option<PgMajorVersion>,
    },
 }

@@ -1573,7 +1574,7 @@ pub struct TimelineInfo {
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
-    pub pg_version: u32,
+    pub pg_version: PgMajorVersion,

    pub state: TimelineState,

--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,6 +19,7 @@ serde.workspace = true
 postgres_ffi_types.workspace = true
 utils.workspace = true
 tracing.workspace = true
+postgres_versioninfo.workspace = true

 [dev-dependencies]
 env_logger.workspace = true
--- a/libs/postgres_ffi/benches/waldecoder.rs
+++ b/libs/postgres_ffi/benches/waldecoder.rs
@@ -4,6 +4,7 @@ use criterion::{Bencher, Criterion, criterion_group, criterion_main};
 use postgres_ffi::v17::wal_generator::LogicalMessageGenerator;
 use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler;
 use postgres_ffi::waldecoder::WalStreamDecoder;
+use postgres_versioninfo::PgMajorVersion;
 use pprof::criterion::{Output, PProfProfiler};
 use utils::lsn::Lsn;

@@ -32,7 +33,7 @@ fn bench_complete_record(c: &mut Criterion) {
        let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX);
        let value = vec![1; value_size];

-        let mut decoder = WalStreamDecoder::new(Lsn(0), 170000);
+        let mut decoder = WalStreamDecoder::new(Lsn(0), PgMajorVersion::PG17);
        let msg = LogicalMessageGenerator::new(PREFIX, &value)
            .next()
            .unwrap()
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -14,6 +14,8 @@ use bytes::Bytes;
 use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

+pub use postgres_versioninfo::PgMajorVersion;
+
 macro_rules! postgres_ffi {
    ($version:ident) => {
        #[path = "."]
@@ -91,21 +93,22 @@ macro_rules! dispatch_pgversion {
            $version => $code,
            default = $invalid_pgver_handling,
            pgversions = [
-                14 : v14,
-                15 : v15,
-                16 : v16,
-                17 : v17,
+                $crate::PgMajorVersion::PG14 => v14,
+                $crate::PgMajorVersion::PG15 => v15,
+                $crate::PgMajorVersion::PG16 => v16,
+                $crate::PgMajorVersion::PG17 => v17,
            ]
        )
    };
    ($pgversion:expr => $code:expr,
     default = $default:expr,
-     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
-        match ($pgversion) {
+     pgversions = [$($sv:pat => $vsv:ident),+ $(,)?]) => {
+        match ($pgversion.clone().into()) {
            $($sv => {
                use $crate::$vsv as pgv;
                $code
            },)+
+            #[allow(unreachable_patterns)]
            _ => {
                $default
            }
@@ -179,9 +182,9 @@ macro_rules! enum_pgversion {
            $($variant ( $crate::$md::$t )),+
        }
        impl self::$name {
-            pub fn pg_version(&self) -> u32 {
+            pub fn pg_version(&self) -> PgMajorVersion {
                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
+                    pgv::bindings::MY_PGVERSION
                })
            }
        }
@@ -195,15 +198,15 @@ macro_rules! enum_pgversion {
    };
    {name = $name:ident,
     path = $p:ident,
-     typ = $t:ident,
+     $(typ = $t:ident,)?
     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
        pub enum $name {
-            $($variant ($crate::$md::$p::$t)),+
+            $($variant $(($crate::$md::$p::$t))?),+
        }
        impl $name {
-            pub fn pg_version(&self) -> u32 {
+            pub fn pg_version(&self) -> PgMajorVersion {
                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
+                    pgv::bindings::MY_PGVERSION
                })
            }
        }
@@ -249,22 +252,21 @@ pub use v14::xlog_utils::{
    try_from_pg_timestamp,
 };

-pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
+pub fn bkpimage_is_compressed(bimg_info: u8, version: PgMajorVersion) -> bool {
    dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
 }

 pub fn generate_wal_segment(
    segno: u64,
    system_id: u64,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

    dispatch_pgversion!(
        pg_version,
-        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        Err(SerializeError::BadInput)
+        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn)
    )
 }

@@ -272,7 +274,7 @@ pub fn generate_pg_control(
    pg_control_bytes: &[u8],
    checkpoint_bytes: &[u8],
    lsn: Lsn,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 ) -> anyhow::Result<(Bytes, u64, bool)> {
    dispatch_pgversion!(
        pg_version,
@@ -352,6 +354,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 pub mod waldecoder {
    use std::num::NonZeroU32;

+    use crate::PgMajorVersion;
    use bytes::{Buf, Bytes, BytesMut};
    use thiserror::Error;
    use utils::lsn::Lsn;
@@ -369,7 +372,7 @@ pub mod waldecoder {

    pub struct WalStreamDecoder {
        pub lsn: Lsn,
-        pub pg_version: u32,
+        pub pg_version: PgMajorVersion,
        pub inputbuf: BytesMut,
        pub state: State,
    }
@@ -382,7 +385,7 @@ pub mod waldecoder {
    }

    impl WalStreamDecoder {
-        pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder {
+        pub fn new(lsn: Lsn, pg_version: PgMajorVersion) -> WalStreamDecoder {
            WalStreamDecoder {
                lsn,
                pg_version,
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -1,3 +1,7 @@
+use crate::PgMajorVersion;
+
+pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG14;
+
 pub const XLOG_DBASE_CREATE: u8 = 0x00;
 pub const XLOG_DBASE_DROP: u8 = 0x10;

--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,3 +1,7 @@
+use crate::PgMajorVersion;
+
+pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG15;
+
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -1,3 +1,7 @@
+use crate::PgMajorVersion;
+
+pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG16;
+
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/pg_constants_v17.rs
+++ b/libs/postgres_ffi/src/pg_constants_v17.rs
@@ -1,3 +1,7 @@
+use crate::PgMajorVersion;
+
+pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG17;
+
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -9,8 +9,8 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 use crate::{
-    BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId,
-    TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
+    BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
+    RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
 };

 #[repr(C)]
@@ -199,20 +199,17 @@ impl DecodedWALRecord {
    /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
    /// by reading other existing relations' data blocks.  This is more complex to apply than new-style database
    /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
-    pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
+    pub fn is_dbase_create_copy(&self, pg_version: PgMajorVersion) -> bool {
        if self.xl_rmid == pg_constants::RM_DBASE_ID {
            let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
            match pg_version {
-                14 => {
+                PgMajorVersion::PG14 => {
                    // Postgres 14 database creations are always the legacy kind
                    info == crate::v14::bindings::XLOG_DBASE_CREATE
                }
-                15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                _ => {
-                    panic!("Unsupported postgres version {pg_version}")
-                }
+                PgMajorVersion::PG15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                PgMajorVersion::PG16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                PgMajorVersion::PG17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
            }
        } else {
            false
@@ -248,7 +245,7 @@ impl DecodedWALRecord {
 pub fn decode_wal_record(
    record: Bytes,
    decoded: &mut DecodedWALRecord,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 ) -> anyhow::Result<()> {
    let mut rnode_spcnode: u32 = 0;
    let mut rnode_dbnode: u32 = 0;
@@ -1106,9 +1103,9 @@ pub struct XlClogTruncate {
 }

 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes, pg_version: PgMajorVersion) -> XlClogTruncate {
        XlClogTruncate {
-            pageno: if pg_version < 17 {
+            pageno: if pg_version < PgMajorVersion::PG17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -11,9 +11,9 @@ use super::super::waldecoder::WalStreamDecoder;
 use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
+    MY_PGVERSION
 };
 use super::wal_generator::LogicalMessageGenerator;
-use super::PG_MAJORVERSION;
 use crate::pg_constants;
 use crate::PG_TLI;
 use crate::{uint32, uint64, Oid};
@@ -233,7 +233,7 @@ pub fn find_end_of_wal(
    let mut result = start_lsn;
    let mut curr_lsn = start_lsn;
    let mut buf = [0u8; XLOG_BLCKSZ];
-    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
+    let pg_version = MY_PGVERSION;
    debug!("find_end_of_wal PG_VERSION: {}", pg_version);

    let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -4,6 +4,7 @@ use std::str::FromStr;
 use anyhow::*;
 use clap::{Arg, ArgMatches, Command, value_parser};
 use postgres::Client;
+use postgres_ffi::PgMajorVersion;
 use wal_craft::*;

 fn main() -> Result<()> {
@@ -48,7 +49,7 @@ fn main() -> Result<()> {
        Some(("with-initdb", arg_matches)) => {
            let cfg = Conf {
                pg_version: *arg_matches
-                    .get_one::<u32>("pg-version")
+                    .get_one::<PgMajorVersion>("pg-version")
                    .context("'pg-version' is required")?,
                pg_distrib_dir: arg_matches
                    .get_one::<PathBuf>("pg-distrib-dir")
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -9,8 +9,8 @@ use log::*;
 use postgres::Client;
 use postgres::types::PgLsn;
 use postgres_ffi::{
-    WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD,
-    XLOG_SIZE_OF_XLOG_SHORT_PHD,
+    PgMajorVersion, WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD,
+    XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };

 macro_rules! xlog_utils_test {
@@ -29,7 +29,7 @@ macro_rules! xlog_utils_test {
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }

 pub struct Conf {
-    pub pg_version: u32,
+    pub pg_version: PgMajorVersion,
    pub pg_distrib_dir: PathBuf,
    pub datadir: PathBuf,
 }
@@ -52,11 +52,7 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
-        match self.pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
-            _ => bail!("Unsupported postgres version: {}", self.pg_version),
-        }
+        Ok(path.join(self.pg_version.v_str()))
    }

    fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -24,7 +24,7 @@ fn init_logging() {
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;

-    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
+    let pg_version = MY_PGVERSION;

    // Craft some WAL
    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
--- a/libs/postgres_initdb/Cargo.toml
+++ b/libs/postgres_initdb/Cargo.toml
@@ -9,4 +9,5 @@ anyhow.workspace = true
 tokio.workspace = true
 camino.workspace = true
 thiserror.workspace = true
+postgres_versioninfo.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/postgres_initdb/src/lib.rs
+++ b/libs/postgres_initdb/src/lib.rs
@@ -7,12 +7,13 @@
 use std::fmt;

 use camino::Utf8Path;
+use postgres_versioninfo::PgMajorVersion;

 pub struct RunInitdbArgs<'a> {
    pub superuser: &'a str,
    pub locale: &'a str,
    pub initdb_bin: &'a Utf8Path,
-    pub pg_version: u32,
+    pub pg_version: PgMajorVersion,
    pub library_search_path: &'a Utf8Path,
    pub pgdata: &'a Utf8Path,
 }
@@ -79,12 +80,16 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
        .stderr(std::process::Stdio::piped());

    // Before version 14, only the libc provide was available.
-    if pg_version > 14 {
+    if pg_version > PgMajorVersion::PG14 {
        // Version 17 brought with it a builtin locale provider which only provides
        // C and C.UTF-8. While being safer for collation purposes since it is
        // guaranteed to be consistent throughout a major release, it is also more
        // performant.
-        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
+        let locale_provider = if pg_version >= PgMajorVersion::PG17 {
+            "builtin"
+        } else {
+            "libc"
+        };

        initdb_command.args(["--locale-provider", locale_provider]);
    }
--- a/libs/postgres_versioninfo/Cargo.toml
+++ b/libs/postgres_versioninfo/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "postgres_versioninfo"
+version = "0.1.0"
+edition = "2024"
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+thiserror.workspace = true
+serde.workspace = true
+serde_repr.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/postgres_versioninfo/src/lib.rs
+++ b/libs/postgres_versioninfo/src/lib.rs
@@ -0,0 +1,175 @@
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use serde_repr::{Deserialize_repr, Serialize_repr};
+use std::fmt::{Display, Formatter};
+use std::str::FromStr;
+
+/// An enum with one variant for each major version of PostgreSQL that we support.
+/// Variants are declared in ascending version order, so the derived `Ord` compares by version.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Deserialize_repr, Serialize_repr)]
+#[repr(u32)]
+pub enum PgMajorVersion {
+    PG14 = 14,
+    PG15 = 15,
+    PG16 = 16,
+    PG17 = 17,
+    // !!! When you add a new PgMajorVersion, don't forget to update PgMajorVersion::ALL
+}
+
+/// A full PostgreSQL version ID, in MMmmbb numerical format (Major/minor/bugfix)
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+#[repr(transparent)]
+pub struct PgVersionId(u32);
+
+impl PgVersionId {
+    /// Sentinel with numeric value 0, used when the version is not known.
+    pub const UNKNOWN: PgVersionId = PgVersionId(0);
+
+    /// Wraps `version`; panics unless it is 0 (unknown) or within `140000..180000`.
+    pub fn from_full_pg_version(version: u32) -> PgVersionId {
+        let accepted = version == 0 || (140000..180000).contains(&version);
+        assert!(accepted, "Invalid full PostgreSQL version ID {version}");
+        PgVersionId(version)
+    }
+}
+
+impl Display for PgVersionId {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&self.0, f)
+    }
+}
+
+/// Serialized form is the bare inner `u32`: the call is forwarded to `u32`'s own
+/// `Serialize` implementation, so no wrapper structure is produced by this impl.
+impl Serialize for PgVersionId {
+    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        let PgVersionId(raw) = self;
+        raw.serialize(serializer)
+    }
+}
+
+/// Deserializes from a bare `u32`. The value is wrapped as-is: the range check of
+/// `PgVersionId::from_full_pg_version` is not applied on this path.
+impl<'de> Deserialize<'de> for PgVersionId {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        Ok(PgVersionId(u32::deserialize(deserializer)?))
+    }
+
+    // In-place variant: overwrites the wrapped integer of an existing value.
+    fn deserialize_in_place<D: Deserializer<'de>>(
+        deserializer: D,
+        place: &mut Self,
+    ) -> Result<(), D::Error> {
+        u32::deserialize_in_place(deserializer, &mut place.0)
+    }
+}
+
+impl PgMajorVersion {
+    /// Get the numerical representation of the represented Major Version
+    ///
+    /// This is the variant's `#[repr(u32)]` discriminant, e.g. 14 for `PG14`.
+    pub const fn major_version_num(&self) -> u32 {
+        *self as u32
+    }
+
+    /// Get the contents of this version's PG_VERSION file.
+    ///
+    /// The PG_VERSION file is used to determine the PostgreSQL version that currently
+    /// owns the data in a PostgreSQL data directory.
+    ///
+    /// The returned text is not uniform across versions: the strings for PG16 and PG17
+    /// end with a line feed (`\x0A`), while those for PG14 and PG15 do not.
+    /// NOTE(review): the reason for this difference is not visible here - confirm.
+    pub fn versionfile_string(&self) -> &'static str {
+        match self {
+            PgMajorVersion::PG14 => "14",
+            PgMajorVersion::PG15 => "15",
+            PgMajorVersion::PG16 => "16\x0A",
+            PgMajorVersion::PG17 => "17\x0A",
+        }
+    }
+
+    /// Get the v{version} string of this major PostgreSQL version.
+    ///
+    /// Because this was hand-coded in various places, this was moved into a shared
+    /// implementation.
+    ///
+    /// The result is the letter `v` followed by the major version number, e.g. `v14`.
+    pub fn v_str(&self) -> String {
+        format!("v{}", self.major_version_num())
+    }
+
+    /// All currently supported major versions of PostgreSQL.
+    ///
+    /// Listed in ascending version order. This array is maintained by hand, so it has
+    /// to be extended whenever a variant is added to the enum.
+    pub const ALL: &'static [PgMajorVersion] = &[
+        PgMajorVersion::PG14,
+        PgMajorVersion::PG15,
+        PgMajorVersion::PG16,
+        PgMajorVersion::PG17,
+    ];
+}
+
+/// Renders as the qualified variant name, e.g. `PgMajorVersion::PG14` (not as the
+/// bare number; use `major_version_num()` or `v_str()` for numeric forms).
+impl Display for PgMajorVersion {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        // The derived `Debug` output is the bare variant name ("PG14", ...), so
+        // prefixing it with the type path reproduces the full qualified name.
+        // Formatter flags such as width are not applied to the output.
+        write!(f, "PgMajorVersion::{:?}", self)
+    }
+}
+
+/// Error produced when a `PgVersionId` does not belong to a supported major version.
+///
+/// Wraps the rejected full version number. The `Display` text, generated from the
+/// `#[error]` attribute below, is `InvalidPgVersion(<number>)`.
+#[derive(Debug, thiserror::Error)]
+#[error("InvalidPgVersion({0})")]
+// NOTE(review): the `allow` is kept from the original; it may no longer be needed.
+#[allow(dead_code)]
+pub struct InvalidPgVersion(u32);
+
+impl TryFrom<PgVersionId> for PgMajorVersion {
+    type Error = InvalidPgVersion;
+
+    fn try_from(value: PgVersionId) -> Result<Self, Self::Error> {
+        // The major version is everything above the four low decimal digits.
+        match value.0 / 10000 {
+            14 => Ok(PgMajorVersion::PG14),
+            15 => Ok(PgMajorVersion::PG15),
+            16 => Ok(PgMajorVersion::PG16),
+            17 => Ok(PgMajorVersion::PG17),
+            _ => Err(InvalidPgVersion(value.0)),
+        }
+    }
+}
+
+impl From<PgMajorVersion> for PgVersionId {
+    fn from(value: PgMajorVersion) -> Self {
+        PgVersionId(value.major_version_num() * 10000)
+    }
+}
+
+/// Error produced when a string is not one of the supported major version numbers.
+///
+/// Wraps a copy of the rejected input. The `Display` text, generated from the
+/// `#[error]` attribute below, is `PgMajorVersionParseError(<input>)`; equality
+/// compares the wrapped input.
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
+#[error("PgMajorVersionParseError({0})")]
+pub struct PgMajorVersionParseError(String);
+
+impl FromStr for PgMajorVersion {
+    type Err = PgMajorVersionParseError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Exact match only: the input is not trimmed, so e.g. "16\n" is rejected.
+        match s {
+            "14" => Ok(PgMajorVersion::PG14),
+            "15" => Ok(PgMajorVersion::PG15),
+            "16" => Ok(PgMajorVersion::PG16),
+            "17" => Ok(PgMajorVersion::PG17),
+            _ => Err(PgMajorVersionParseError(s.to_string())),
+        }
+    }
+}
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -1,17 +1,22 @@
 //! A background loop that fetches feature flags from PostHog and updates the feature store.

-use std::{sync::Arc, time::Duration};
+use std::{
+    sync::Arc,
+    time::{Duration, SystemTime},
+};

 use arc_swap::ArcSwap;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, info_span};

-use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
+use crate::{
+    CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig,
+};

 /// A background loop that fetches feature flags from PostHog and updates the feature store.
 pub struct FeatureResolverBackgroundLoop {
    posthog_client: PostHogClient,
-    feature_store: ArcSwap<FeatureStore>,
+    feature_store: ArcSwap<(SystemTime, Arc<FeatureStore>)>,
    cancel: CancellationToken,
 }

@@ -19,11 +24,35 @@ impl FeatureResolverBackgroundLoop {
    pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self {
        Self {
            posthog_client: PostHogClient::new(config),
-            feature_store: ArcSwap::new(Arc::new(FeatureStore::new())),
+            feature_store: ArcSwap::new(Arc::new((
+                SystemTime::UNIX_EPOCH,
+                Arc::new(FeatureStore::new()),
+            ))),
            cancel: shutdown_pageserver,
        }
    }

+    /// Parse `spec` as a flag spec and install it directly, bypassing the refresh loop.
+    /// Only a JSON parse failure is returned; a spec that fails to load is just logged.
+    pub fn update(&self, spec: String) -> anyhow::Result<()> {
+        self.update_feature_store_nofail(serde_json::from_str(&spec)?, "http_propagate");
+        Ok(())
+    }
+
+    /// Build a store from `resp` and publish it with the current time; failures are only logged.
+    fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) {
+        let project_id = self.posthog_client.config.project_id.parse::<u64>().ok();
+        let store = match FeatureStore::new_with_flags(resp.flags, project_id) {
+            Ok(store) => store,
+            Err(e) => {
+                tracing::warn!("Cannot process feature flag spec from {}: {}", source, e);
+                return;
+            }
+        };
+        self.feature_store.store(Arc::new((SystemTime::now(), Arc::new(store))));
+        tracing::info!("Feature flag updated from {}", source);
+    }
+
    pub fn spawn(
        self: Arc<Self>,
        handle: &tokio::runtime::Handle,
@@ -47,6 +76,17 @@ impl FeatureResolverBackgroundLoop {
                        _ = ticker.tick() => {}
                        _ = cancel.cancelled() => break
                    }
+                    {
+                        let last_update = this.feature_store.load().0;
+                        if let Ok(elapsed) = last_update.elapsed() {
+                            if elapsed < refresh_period {
+                                tracing::debug!(
+                                    "Skipping feature flag refresh because it's too soon"
+                                );
+                                continue;
+                            }
+                        }
+                    }
                    let resp = match this
                        .posthog_client
                        .get_feature_flags_local_evaluation()
@@ -58,16 +98,7 @@ impl FeatureResolverBackgroundLoop {
                            continue;
                        }
                    };
-                    let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
-                    match FeatureStore::new_with_flags(resp.flags, project_id) {
-                        Ok(feature_store) => {
-                            this.feature_store.store(Arc::new(feature_store));
-                            tracing::info!("Feature flag updated");
-                        }
-                        Err(e) => {
-                            tracing::warn!("Cannot process feature flag spec: {}", e);
-                        }
-                    }
+                    this.update_feature_store_nofail(resp, "refresh_loop");
                }
                tracing::info!("PostHog feature resolver stopped");
            }
@@ -92,6 +123,6 @@ impl FeatureResolverBackgroundLoop {
    }

    pub fn feature_store(&self) -> Arc<FeatureStore> {
-        self.feature_store.load_full()
+        self.feature_store.load().1.clone()
    }
 }
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -544,17 +544,8 @@ impl PostHogClient {
        self.config.server_api_key.starts_with("phs_")
    }

-    /// Fetch the feature flag specs from the server.
-    ///
-    /// This is unfortunately an undocumented API at:
-    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
-    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
-    ///
-    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
-    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
-    pub async fn get_feature_flags_local_evaluation(
-        &self,
-    ) -> anyhow::Result<LocalEvaluationResponse> {
+    /// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing.
+    pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result<String> {
        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
        // with bearer token of self.server_api_key
        // OR
@@ -588,7 +579,22 @@ impl PostHogClient {
                body
            ));
        }
-        Ok(serde_json::from_str(&body)?)
+        Ok(body)
+    }
+
+    /// Fetch the feature flag specs from the server and parse them from JSON.
+    ///
+    /// This is unfortunately an undocumented API at:
+    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
+    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
+    ///
+    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
+    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
+    pub async fn get_feature_flags_local_evaluation(
+        &self,
+    ) -> anyhow::Result<LocalEvaluationResponse> {
+        let body = self.get_feature_flags_local_evaluation_raw().await?;
+        serde_json::from_str(&body).map_err(anyhow::Error::from)
+    }
     }

    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -12,7 +12,9 @@ use tokio::net::TcpStream;

 use crate::connect::connect;
 use crate::connect_raw::{RawConnection, connect_raw};
-use crate::tls::{MakeTlsConnect, TlsConnect};
+use crate::connect_tls::connect_tls;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream};
 use crate::{Client, Connection, Error};

 /// TLS configuration.
@@ -238,7 +240,7 @@ impl Config {
        connect(tls, self).await
    }

-    pub async fn connect_raw<S, T>(
+    pub async fn tls_and_authenticate<S, T>(
        &self,
        stream: S,
        tls: T,
@@ -247,7 +249,19 @@ impl Config {
        S: AsyncRead + AsyncWrite + Unpin,
        T: TlsConnect<S>,
    {
-        connect_raw(stream, tls, self).await
+        let stream = connect_tls(stream, self.ssl_mode, tls).await?;
+        connect_raw(stream, self).await
+    }
+
+    pub async fn authenticate<S, T>(
+        &self,
+        stream: MaybeTlsStream<S, T>,
+    ) -> Result<RawConnection<S, T>, Error>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+        T: TlsStream + Unpin,
+    {
+        connect_raw(stream, self).await
    }
 }

--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -9,6 +9,7 @@ use crate::codec::BackendMessage;
 use crate::config::Host;
 use crate::connect_raw::connect_raw;
 use crate::connect_socket::connect_socket;
+use crate::connect_tls::connect_tls;
 use crate::tls::{MakeTlsConnect, TlsConnect};
 use crate::{Client, Config, Connection, Error, RawConnection};

@@ -44,13 +45,14 @@ where
    T: TlsConnect<TcpStream>,
 {
    let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
+    let stream = connect_tls(socket, config.ssl_mode, tls).await?;
    let RawConnection {
        stream,
        parameters,
        delayed_notice,
        process_id,
        secret_key,
-    } = connect_raw(socket, tls, config).await?;
+    } = connect_raw(stream, config).await?;

    let socket_config = SocketConfig {
        host_addr,
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -16,9 +16,8 @@ use tokio_util::codec::Framed;
 use crate::Error;
 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
 use crate::config::{self, AuthKeys, Config};
-use crate::connect_tls::connect_tls;
 use crate::maybe_tls_stream::MaybeTlsStream;
-use crate::tls::{TlsConnect, TlsStream};
+use crate::tls::TlsStream;

 pub struct StartupStream<S, T> {
    inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
@@ -87,16 +86,13 @@ pub struct RawConnection<S, T> {
 }

 pub async fn connect_raw<S, T>(
-    stream: S,
-    tls: T,
+    stream: MaybeTlsStream<S, T>,
    config: &Config,
-) -> Result<RawConnection<S, T::Stream>, Error>
+) -> Result<RawConnection<S, T>, Error>
 where
    S: AsyncRead + AsyncWrite + Unpin,
-    T: TlsConnect<S>,
+    T: TlsStream + Unpin,
 {
-    let stream = connect_tls(stream, config.ssl_mode, tls).await?;
-
    let mut stream = StartupStream {
        inner: Framed::new(stream, PostgresCodec),
        buf: BackendMessages::empty(),
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -10,6 +10,7 @@ const_format.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 postgres_ffi.workspace = true
+postgres_versioninfo.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
 utils.workspace = true
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -8,6 +8,8 @@ pub mod membership;
 /// Public API types
 pub mod models;

+pub use postgres_versioninfo::{PgMajorVersion, PgVersionId};
+
 /// Consensus logical timestamp. Note: it is a part of sk control file.
 pub type Term = u64;
 /// With this term timeline is created initially. It
@@ -20,7 +22,7 @@ pub const INITIAL_TERM: Term = 0;
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct ServerInfo {
    /// Postgres server version
-    pub pg_version: u32,
+    pub pg_version: PgVersionId,
    pub system_id: SystemId,
    pub wal_seg_size: u32,
 }
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -4,6 +4,7 @@ use std::net::SocketAddr;

 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::TimestampTz;
+use postgres_versioninfo::PgVersionId;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
@@ -23,8 +24,7 @@ pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mconf: Configuration,
-    /// In the PG_VERSION_NUM macro format, like 140017.
-    pub pg_version: u32,
+    pub pg_version: PgVersionId,
    pub system_id: Option<u64>,
    // By default WAL_SEGMENT_SIZE
    pub wal_seg_size: Option<u32>,
--- a/libs/wal_decoder/benches/bench_interpret_wal.rs
+++ b/libs/wal_decoder/benches/bench_interpret_wal.rs
@@ -10,7 +10,7 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, WAL_SEGMENT_SIZE};
 use pprof::criterion::{Output, PProfProfiler};
 use remote_storage::{
    DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
@@ -115,7 +115,7 @@ struct BenchmarkData {

 #[derive(Deserialize)]
 struct BenchmarkMetadata {
-    pg_version: u32,
+    pg_version: PgMajorVersion,
    start_lsn: Lsn,
 }

--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -7,8 +7,8 @@ use bytes::{Buf, Bytes};
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::pg_constants;
 use postgres_ffi::walrecord::*;
+use postgres_ffi::{PgMajorVersion, pg_constants};
 use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
 use utils::lsn::Lsn;

@@ -24,7 +24,7 @@ impl InterpretedWalRecord {
        buf: Bytes,
        shards: &[ShardIdentity],
        next_record_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
        let mut decoded = DecodedWALRecord::default();
        decode_wal_record(buf, &mut decoded, pg_version)?;
@@ -78,7 +78,7 @@ impl MetadataRecord {
        decoded: &DecodedWALRecord,
        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<()> {
        // Note: this doesn't actually copy the bytes since
        // the [`Bytes`] type implements it via a level of indirection.
@@ -193,7 +193,7 @@ impl MetadataRecord {
    fn decode_heapam_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // Handle VM bit updates that are implicitly part of heap records.

@@ -205,7 +205,7 @@ impl MetadataRecord {
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

        match pg_version {
-            14 => {
+            PgMajorVersion::PG14 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -272,7 +272,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            15 => {
+            PgMajorVersion::PG15 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -339,7 +339,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            16 => {
+            PgMajorVersion::PG16 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -406,7 +406,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            17 => {
+            PgMajorVersion::PG17 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -473,7 +473,6 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            _ => {}
        }

        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -500,7 +499,7 @@ impl MetadataRecord {
    fn decode_neonmgr_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // Handle VM bit updates that are implicitly part of heap records.

@@ -514,7 +513,7 @@ impl MetadataRecord {
        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

        match pg_version {
-            16 | 17 => {
+            PgMajorVersion::PG16 | PgMajorVersion::PG17 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

                match info {
@@ -574,7 +573,7 @@ impl MetadataRecord {
                    info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info),
                }
            }
-            _ => anyhow::bail!(
+            PgMajorVersion::PG15 | PgMajorVersion::PG14 => anyhow::bail!(
                "Neon RMGR has no known compatibility with PostgreSQL version {}",
                pg_version
            ),
@@ -629,116 +628,121 @@ impl MetadataRecord {
    fn decode_dbase_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // TODO: Refactor this to avoid the duplication between postgres versions.

        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID");

-        if pg_version == 14 {
-            if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
-                let createdb = XlCreateDatabase::decode(buf);
-                tracing::debug!("XLOG_DBASE_CREATE v14");
+        match pg_version {
+            PgMajorVersion::PG14 => {
+                if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
+                    let createdb = XlCreateDatabase::decode(buf);
+                    tracing::debug!("XLOG_DBASE_CREATE v14");

-                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                    db_id: createdb.db_id,
-                    tablespace_id: createdb.tablespace_id,
-                    src_db_id: createdb.src_db_id,
-                    src_tablespace_id: createdb.src_tablespace_id,
-                }));
+                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                        db_id: createdb.db_id,
+                        tablespace_id: createdb.tablespace_id,
+                        src_db_id: createdb.src_db_id,
+                        src_tablespace_id: createdb.src_tablespace_id,
+                    }));

-                return Ok(Some(record));
-            } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
-                let dropdb = XlDropDatabase::decode(buf);
+                    return Ok(Some(record));
+                } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
+                    let dropdb = XlDropDatabase::decode(buf);

-                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                    db_id: dropdb.db_id,
-                    tablespace_ids: dropdb.tablespace_ids,
-                }));
+                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                        db_id: dropdb.db_id,
+                        tablespace_ids: dropdb.tablespace_ids,
+                    }));

-                return Ok(Some(record));
+                    return Ok(Some(record));
+                }
            }
-        } else if pg_version == 15 {
-            if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-            } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                // The XLOG record was renamed between v14 and v15,
-                // but the record format is the same.
-                // So we can reuse XlCreateDatabase here.
-                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+            PgMajorVersion::PG15 => {
+                if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                    // The XLOG record was renamed between v14 and v15,
+                    // but the record format is the same.
+                    // So we can reuse XlCreateDatabase here.
+                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                let createdb = XlCreateDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                    db_id: createdb.db_id,
-                    tablespace_id: createdb.tablespace_id,
-                    src_db_id: createdb.src_db_id,
-                    src_tablespace_id: createdb.src_tablespace_id,
-                }));
+                    let createdb = XlCreateDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                        db_id: createdb.db_id,
+                        tablespace_id: createdb.tablespace_id,
+                        src_db_id: createdb.src_db_id,
+                        src_tablespace_id: createdb.src_tablespace_id,
+                    }));

-                return Ok(Some(record));
-            } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
-                let dropdb = XlDropDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                    db_id: dropdb.db_id,
-                    tablespace_ids: dropdb.tablespace_ids,
-                }));
+                    return Ok(Some(record));
+                } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
+                    let dropdb = XlDropDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                        db_id: dropdb.db_id,
+                        tablespace_ids: dropdb.tablespace_ids,
+                    }));

-                return Ok(Some(record));
+                    return Ok(Some(record));
+                }
            }
-        } else if pg_version == 16 {
-            if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-            } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                // The XLOG record was renamed between v14 and v15,
-                // but the record format is the same.
-                // So we can reuse XlCreateDatabase here.
-                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+            PgMajorVersion::PG16 => {
+                if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                    // The XLOG record was renamed between v14 and v15,
+                    // but the record format is the same.
+                    // So we can reuse XlCreateDatabase here.
+                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                let createdb = XlCreateDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                    db_id: createdb.db_id,
-                    tablespace_id: createdb.tablespace_id,
-                    src_db_id: createdb.src_db_id,
-                    src_tablespace_id: createdb.src_tablespace_id,
-                }));
+                    let createdb = XlCreateDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                        db_id: createdb.db_id,
+                        tablespace_id: createdb.tablespace_id,
+                        src_db_id: createdb.src_db_id,
+                        src_tablespace_id: createdb.src_tablespace_id,
+                    }));

-                return Ok(Some(record));
-            } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
-                let dropdb = XlDropDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                    db_id: dropdb.db_id,
-                    tablespace_ids: dropdb.tablespace_ids,
-                }));
+                    return Ok(Some(record));
+                } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
+                    let dropdb = XlDropDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                        db_id: dropdb.db_id,
+                        tablespace_ids: dropdb.tablespace_ids,
+                    }));

-                return Ok(Some(record));
+                    return Ok(Some(record));
+                }
            }
-        } else if pg_version == 17 {
-            if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-            } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                // The XLOG record was renamed between v14 and v15,
-                // but the record format is the same.
-                // So we can reuse XlCreateDatabase here.
-                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+            PgMajorVersion::PG17 => {
+                if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                    // The XLOG record was renamed between v14 and v15,
+                    // but the record format is the same.
+                    // So we can reuse XlCreateDatabase here.
+                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                let createdb = XlCreateDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                    db_id: createdb.db_id,
-                    tablespace_id: createdb.tablespace_id,
-                    src_db_id: createdb.src_db_id,
-                    src_tablespace_id: createdb.src_tablespace_id,
-                }));
+                    let createdb = XlCreateDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                        db_id: createdb.db_id,
+                        tablespace_id: createdb.tablespace_id,
+                        src_db_id: createdb.src_db_id,
+                        src_tablespace_id: createdb.src_tablespace_id,
+                    }));

-                return Ok(Some(record));
-            } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
-                let dropdb = XlDropDatabase::decode(buf);
-                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                    db_id: dropdb.db_id,
-                    tablespace_ids: dropdb.tablespace_ids,
-                }));
+                    return Ok(Some(record));
+                } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
+                    let dropdb = XlDropDatabase::decode(buf);
+                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                        db_id: dropdb.db_id,
+                        tablespace_ids: dropdb.tablespace_ids,
+                    }));

-                return Ok(Some(record));
+                    return Ok(Some(record));
+                }
            }
        }

@@ -748,12 +752,12 @@ impl MetadataRecord {
    fn decode_clog_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;

        if info == pg_constants::CLOG_ZEROPAGE {
-            let pageno = if pg_version < 17 {
+            let pageno = if pg_version < PgMajorVersion::PG17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
@@ -765,7 +769,7 @@ impl MetadataRecord {
                ClogZeroPage { segno, rpageno },
            ))))
        } else {
-            assert!(info == pg_constants::CLOG_TRUNCATE);
+            assert_eq!(info, pg_constants::CLOG_TRUNCATE);
            let xlrec = XlClogTruncate::decode(buf, pg_version);

            Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate(
@@ -838,14 +842,14 @@ impl MetadataRecord {
    fn decode_multixact_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

        if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
            || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
        {
-            let pageno = if pg_version < 17 {
+            let pageno = if pg_version < PgMajorVersion::PG17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -13,7 +13,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
-use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants};
+use postgres_ffi::{BLCKSZ, PgMajorVersion, page_is_new, page_set_lsn, pg_constants};
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
@@ -139,7 +139,7 @@ impl SerializedValueBatch {
        decoded: DecodedWALRecord,
        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<()> {
        // First determine how big the buffers need to be and allocate it up-front.
        // This duplicates some of the work below, but it's empirically much faster.
@@ -267,7 +267,7 @@ impl SerializedValueBatch {
    fn estimate_buffer_size(
        decoded: &DecodedWALRecord,
        shard: &ShardIdentity,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> usize {
        let mut estimate: usize = 0;

@@ -303,7 +303,11 @@ impl SerializedValueBatch {
        estimate
    }

-    fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
+    fn block_is_image(
+        decoded: &DecodedWALRecord,
+        blk: &DecodedBkpBlock,
+        pg_version: PgMajorVersion,
+    ) -> bool {
        blk.apply_image
            && blk.has_image
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -18,6 +18,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio.workspace = true
+postgres_versioninfo.workspace = true
 futures.workspace = true
 tokio-util.workspace = true
 anyhow.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -7,6 +7,7 @@ use detach_ancestor::AncestorDetached;
 use http_utils::error::HttpErrorBody;
 use pageserver_api::models::*;
 use pageserver_api::shard::TenantShardId;
+use postgres_versioninfo::PgMajorVersion;
 pub use reqwest::Body as ReqwestBody;
 use reqwest::{IntoUrl, Method, StatusCode, Url};
 use utils::id::{TenantId, TimelineId};
@@ -745,9 +746,11 @@ impl Client {
        timeline_id: TimelineId,
        base_lsn: Lsn,
        end_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        basebackup_tarball: ReqwestBody,
    ) -> Result<()> {
+        let pg_version = pg_version.major_version_num();
+
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
            self.mgmt_api_endpoint,
@@ -841,4 +844,13 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
+
+    /// POSTs `spec` to the `/v1/feature_flag_spec` endpoint.
+    /// The response body is decoded as JSON; a decode failure is reported
+    /// as [`Error::ReceiveBody`].
+    pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
+        let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
+        let resp = self.request(Method::POST, uri, spec).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
 }
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -110,6 +110,19 @@ message GetBaseBackupRequest {
  bool replica = 2;
  // If true, include relation files in the base backup. Mainly for debugging and tests.
  bool full = 3;
+  // Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
+  // compression, so that we can cache compressed backups on the server.
+  BaseBackupCompression compression = 4;
+}
+
+// Base backup compression algorithms.
+enum BaseBackupCompression {
+  // Unknown algorithm. Used when clients send an unsupported algorithm.
+  BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
+  // No compression.
+  BASE_BACKUP_COMPRESSION_NONE = 1;
+  // GZIP compression.
+  BASE_BACKUP_COMPRESSION_GZIP = 2;
 }

 // Base backup response chunk, returned as an ordered stream.
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -95,7 +95,6 @@ impl Client {

        if let Some(compression) = compression {
            // TODO: benchmark this (including network latency).
-            // TODO: consider enabling compression by default.
            client = client
                .accept_compressed(compression)
                .send_compressed(compression);
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -191,15 +191,21 @@ pub struct GetBaseBackupRequest {
    pub replica: bool,
    /// If true, include relation files in the base backup. Mainly for debugging and tests.
    pub full: bool,
+    /// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
+    /// compression, so that we can cache compressed backups on the server.
+    pub compression: BaseBackupCompression,
 }

-impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
-    fn from(pb: proto::GetBaseBackupRequest) -> Self {
-        Self {
+impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
            lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
            replica: pb.replica,
            full: pb.full,
-        }
+            compression: pb.compression.try_into()?,
+        })
    }
 }

@@ -209,10 +215,55 @@ impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
            lsn: request.lsn.unwrap_or_default().0,
            replica: request.replica,
            full: request.full,
+            compression: request.compression.into(),
        }
    }
 }

+/// Base backup compression algorithm.
+#[derive(Clone, Copy, Debug)]
+pub enum BaseBackupCompression {
+    None,
+    Gzip,
+}
+
+impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
+        match pb {
+            proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
+            proto::BaseBackupCompression::None => Ok(Self::None),
+            proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
+        }
+    }
+}
+
+impl TryFrom<i32> for BaseBackupCompression {
+    type Error = ProtocolError;
+
+    fn try_from(compression: i32) -> Result<Self, Self::Error> {
+        let pb = proto::BaseBackupCompression::try_from(compression)
+            .map_err(|_| ProtocolError::invalid("compression", compression))?;
+        Self::try_from(pb)
+    }
+}
+
+impl From<BaseBackupCompression> for proto::BaseBackupCompression {
+    fn from(compression: BaseBackupCompression) -> Self {
+        match compression {
+            BaseBackupCompression::None => Self::None,
+            BaseBackupCompression::Gzip => Self::Gzip,
+        }
+    }
+}
+
+impl From<BaseBackupCompression> for i32 {
+    fn from(compression: BaseBackupCompression) -> Self {
+        proto::BaseBackupCompression::from(compression).into()
+    }
+}
+
 pub type GetBaseBackupResponseChunk = Bytes;

 impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -317,6 +317,7 @@ impl Client for LibpqClient {
 /// A gRPC Pageserver client.
 struct GrpcClient {
    inner: page_api::Client,
+    compression: page_api::BaseBackupCompression,
 }

 impl GrpcClient {
@@ -331,10 +332,14 @@ impl GrpcClient {
            ttid.timeline_id,
            ShardIndex::unsharded(),
            None,
-            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+            None, // NB: uses payload compression
        )
        .await?;
-        Ok(Self { inner })
+        let compression = match compression {
+            true => page_api::BaseBackupCompression::Gzip,
+            false => page_api::BaseBackupCompression::None,
+        };
+        Ok(Self { inner, compression })
    }
 }

@@ -348,6 +353,7 @@ impl Client for GrpcClient {
            lsn,
            replica: false,
            full: false,
+            compression: self.compression,
        };
        let stream = self.inner.get_base_backup(req).await?;
        Ok(Box::pin(StreamReader::new(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -14,18 +14,19 @@ use std::fmt::Write as FmtWrite;
 use std::time::{Instant, SystemTime};

 use anyhow::{Context, anyhow};
+use async_compression::tokio::write::GzipEncoder;
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
 use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES};
 use postgres_ffi::{
-    BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants,
+    BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName,
+    dispatch_pgversion, pg_constants,
 };
 use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
-use tokio::io;
-use tokio::io::AsyncWrite;
+use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
 use tokio_tar::{Builder, EntryType, Header};
 use tracing::*;
 use utils::lsn::Lsn;
@@ -96,6 +97,7 @@ impl From<BasebackupError> for tonic::Status {
 ///  * When working without safekeepers. In this situation it is important to match the lsn
 ///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 ///    to start the replication.
+#[allow(clippy::too_many_arguments)]
 pub async fn send_basebackup_tarball<'a, W>(
    write: &'a mut W,
    timeline: &'a Timeline,
@@ -103,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    replica: bool,
+    gzip_level: Option<async_compression::Level>,
    ctx: &'a RequestContext,
 ) -> Result<(), BasebackupError>
 where
@@ -121,7 +124,7 @@ where
    // prev_lsn value; that happens if the timeline was just branched from
    // an old LSN and it doesn't have any WAL of its own yet. We will set
    // prev_lsn to Lsn(0) if we cannot provide the correct value.
-    let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+    let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
        // Backup was requested at a particular LSN. The caller should've
        // already checked that it's a valid LSN.

@@ -142,7 +145,7 @@ where
    };

    // Consolidate the derived and the provided prev_lsn values
-    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+    let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
            return Err(BasebackupError::Server(anyhow!(
                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
@@ -154,30 +157,55 @@ where
    };

    info!(
-        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
-        backup_lsn, prev_lsn, full_backup, replica
+        "taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
+        (full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
+    );
+    let span = info_span!("send_tarball", backup_lsn=%lsn);
+
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        timeline.conf.get_vectored_concurrent_io,
+        timeline
+            .gate
+            .enter()
+            .map_err(|_| BasebackupError::Shutdown)?,
    );

-    let basebackup = Basebackup {
-        ar: Builder::new_non_terminated(write),
-        timeline,
-        lsn: backup_lsn,
-        prev_record_lsn: prev_lsn,
-        full_backup,
-        replica,
-        ctx,
-        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf.get_vectored_concurrent_io,
-            timeline
-                .gate
-                .enter()
-                .map_err(|_| BasebackupError::Shutdown)?,
-        ),
-    };
-    basebackup
+    if let Some(gzip_level) = gzip_level {
+        let mut encoder = GzipEncoder::with_quality(write, gzip_level);
+        Basebackup {
+            ar: Builder::new_non_terminated(&mut encoder),
+            timeline,
+            lsn,
+            prev_record_lsn,
+            full_backup,
+            replica,
+            ctx,
+            io_concurrency,
+        }
        .send_tarball()
-        .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
-        .await
+        .instrument(span)
+        .await?;
+        encoder
+            .shutdown()
+            .await
+            .map_err(|err| BasebackupError::Client(err, "gzip"))?;
+    } else {
+        Basebackup {
+            ar: Builder::new_non_terminated(write),
+            timeline,
+            lsn,
+            prev_record_lsn,
+            full_backup,
+            replica,
+            ctx,
+            io_concurrency,
+        }
+        .send_tarball()
+        .instrument(span)
+        .await?;
+    }
+
+    Ok(())
 }

 /// This is short-living object only for the time of tarball creation,
@@ -619,10 +647,7 @@ where
        };

        if spcnode == GLOBALTABLESPACE_OID {
-            let pg_version_str = match self.timeline.pg_version {
-                14 | 15 => self.timeline.pg_version.to_string(),
-                ver => format!("{ver}\x0A"),
-            };
+            let pg_version_str = self.timeline.pg_version.versionfile_string();
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar
                .append(&header, pg_version_str.as_bytes())
@@ -679,10 +704,7 @@ where
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{dbnode}/PG_VERSION");

-                let pg_version_str = match self.timeline.pg_version {
-                    14 | 15 => self.timeline.pg_version.to_string(),
-                    ver => format!("{ver}\x0A"),
-                };
+                let pg_version_str = self.timeline.pg_version.versionfile_string();
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar
                    .append(&header, pg_version_str.as_bytes())
@@ -713,7 +735,7 @@ where
        buf.extend_from_slice(&img[..]);
        let crc = crc32c::crc32c(&img[..]);
        buf.put_u32_le(crc);
-        let path = if self.timeline.pg_version < 17 {
+        let path = if self.timeline.pg_version < PgMajorVersion::PG17 {
            format!("pg_twophase/{xid:>08X}")
        } else {
            format!("pg_twophase/{xid:>016X}")
--- a/pageserver/src/basebackup_cache.rs
+++ b/pageserver/src/basebackup_cache.rs
@@ -1,13 +1,12 @@
 use std::{collections::HashMap, sync::Arc};

 use anyhow::Context;
-use async_compression::tokio::write::GzipEncoder;
 use camino::{Utf8Path, Utf8PathBuf};
 use metrics::core::{AtomicU64, GenericCounter};
 use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
 use tokio::{
    io::{AsyncWriteExt, BufWriter},
-    sync::mpsc::{UnboundedReceiver, UnboundedSender},
+    sync::mpsc::{Receiver, Sender, error::TrySendError},
 };
 use tokio_util::sync::CancellationToken;
 use utils::{
@@ -20,8 +19,8 @@ use crate::{
    basebackup::send_basebackup_tarball,
    context::{DownloadBehavior, RequestContext},
    metrics::{
-        BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
-        BASEBACKUP_CACHE_SIZE,
+        BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
+        BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
    },
    task_mgr::TaskKind,
    tenant::{
@@ -36,8 +35,8 @@ pub struct BasebackupPrepareRequest {
    pub lsn: Lsn,
 }

-pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
-pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
+pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
+pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;

 #[derive(Clone)]
 struct CacheEntry {
@@ -61,40 +60,65 @@ struct CacheEntry {
 /// and ~1 RPS for get requests.
 pub struct BasebackupCache {
    data_dir: Utf8PathBuf,
+    config: Option<BasebackupCacheConfig>,

    entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,

+    prepare_sender: BasebackupPrepareSender,
+
    read_hit_count: GenericCounter<AtomicU64>,
    read_miss_count: GenericCounter<AtomicU64>,
    read_err_count: GenericCounter<AtomicU64>,
+
+    prepare_skip_count: GenericCounter<AtomicU64>,
 }

 impl BasebackupCache {
-    /// Creates a BasebackupCache and spawns the background task.
-    /// The initialization of the cache is performed in the background and does not
-    /// block the caller. The cache will return `None` for any get requests until
-    /// initialization is complete.
-    pub fn spawn(
-        runtime_handle: &tokio::runtime::Handle,
+    /// Create a new BasebackupCache instance and the BasebackupPrepareReceiver
+    /// needed to start the background task.
+    /// The prepare queue holds up to `max_size_entries` requests, clamped to at least 1
+    /// because `tokio::sync::mpsc::channel` panics on a zero capacity.
+    /// The cache returns `None` for any get requests until the background task has
+    /// initialized it from the data_dir. That task is spawned separately using
+    /// [`Self::spawn_background_task`] to avoid a circular dependency with the tenant manager.
+    pub fn new(
        data_dir: Utf8PathBuf,
        config: Option<BasebackupCacheConfig>,
-        prepare_receiver: BasebackupPrepareReceiver,
-        tenant_manager: Arc<TenantManager>,
-        cancel: CancellationToken,
-    ) -> Arc<Self> {
+    ) -> (Arc<Self>, BasebackupPrepareReceiver) {
+        let chan_size = config.as_ref().map_or(1, |c| c.max_size_entries.max(1));
+
+        let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
+
        let cache = Arc::new(BasebackupCache {
            data_dir,
-
+            config,
            entries: std::sync::Mutex::new(HashMap::new()),
+            prepare_sender,

            read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
            read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
            read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
+
+            prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
        });

-        if let Some(config) = config {
+        (cache, prepare_receiver)
+    }
+
+    /// Spawns the background task.
+    /// The background task initializes the cache from the disk,
+    /// processes prepare requests, and cleans up outdated cache entries.
+    /// Noop if the cache is disabled (config is None).
+    pub fn spawn_background_task(
+        self: Arc<Self>,
+        runtime_handle: &tokio::runtime::Handle,
+        prepare_receiver: BasebackupPrepareReceiver,
+        tenant_manager: Arc<TenantManager>,
+        cancel: CancellationToken,
+    ) {
+        if let Some(config) = self.config.clone() {
            let background = BackgroundTask {
-                c: cache.clone(),
+                c: self,

                config,
                tenant_manager,
@@ -109,8 +133,45 @@ impl BasebackupCache {
            };
            runtime_handle.spawn(background.run(prepare_receiver));
        }
+    }

-        cache
+    /// Queues a basebackup prepare request for the background task without blocking.
+    /// The basebackup itself is prepared asynchronously by that task.
+    /// If the queue is full or closed, the request is dropped and counted as skipped.
+    pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
+        let request = BasebackupPrepareRequest {
+            tenant_shard_id,
+            timeline_id,
+            lsn,
+        };
+
+        BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
+        let Err(err) = self.prepare_sender.try_send(request) else {
+            return;
+        };
+        // The request was not enqueued: undo the increment and record the skip.
+        BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
+        self.prepare_skip_count.inc();
+        match err {
+            TrySendError::Full(_) => {
+                // Basebackup prepares are pretty rare, normally we should not hit this.
+                tracing::info!(
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    %timeline_id,
+                    %lsn,
+                    "Basebackup prepare channel is full, skipping the request"
+                );
+            }
+            TrySendError::Closed(_) => {
+                // Normal during shutdown, not critical.
+                tracing::info!(
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    %timeline_id,
+                    %lsn,
+                    "Basebackup prepare channel is closed, skipping the request"
+                );
+            }
+        }
    }

    /// Gets a basebackup entry from the cache.
@@ -123,6 +184,10 @@ impl BasebackupCache {
        timeline_id: TimelineId,
        lsn: Lsn,
    ) -> Option<tokio::fs::File> {
+        if !self.is_enabled() {
+            return None;
+        }
+
        // Fast path. Check if the entry exists using the in-memory state.
        let tti = TenantTimelineId::new(tenant_id, timeline_id);
        if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
@@ -150,6 +215,10 @@ impl BasebackupCache {
        }
    }

+    pub fn is_enabled(&self) -> bool {
+        self.config.is_some()
+    }
+
    // Private methods.

    fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
@@ -367,6 +436,7 @@ impl BackgroundTask {
        loop {
            tokio::select! {
                Some(req) = prepare_receiver.recv() => {
+                    BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
                    if let Err(err) = self.prepare_basebackup(
                        req.tenant_shard_id,
                        req.timeline_id,
@@ -594,13 +664,6 @@ impl BackgroundTask {
        let file = tokio::fs::File::create(entry_tmp_path).await?;
        let mut writer = BufWriter::new(file);

-        let mut encoder = GzipEncoder::with_quality(
-            &mut writer,
-            // Level::Best because compression is not on the hot path of basebackup requests.
-            // The decompression is almost not affected by the compression level.
-            async_compression::Level::Best,
-        );
-
        // We may receive a request before the WAL record is applied to the timeline.
        // Wait for the requested LSN to be applied.
        timeline
@@ -613,17 +676,19 @@ impl BackgroundTask {
            .await?;

        send_basebackup_tarball(
-            &mut encoder,
+            &mut writer,
            timeline,
            Some(req_lsn),
            None,
            false,
            false,
+            // Level::Best because compression is not on the hot path of basebackup requests.
+            // The decompression is almost not affected by the compression level.
+            Some(async_compression::Level::Best),
            &ctx,
        )
        .await?;

-        encoder.shutdown().await?;
        writer.flush().await?;
        writer.into_inner().sync_all().await?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,7 +9,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, anyhow};
+use anyhow::{Context, anyhow, bail};
 use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};
 use http_utils::tls_certs::ReloadingCertificateResolver;
@@ -102,6 +102,19 @@ fn main() -> anyhow::Result<()> {

    let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;

+    if conf.dev_mode {
+        // Logging is initialized only further below, so a tracing `warn!` would be dropped here.
+        eprintln!("Starting in dev mode: this may be an insecure configuration.");
+    } else if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type]
+        .contains(&AuthType::Trust)
+    {
+        bail!(
+            "Pageserver refuses to start with HTTP, PostgreSQL or GRPC API authentication disabled.\n\
+              Set dev_mode = true in pageserver.toml to allow running without authentication.\n\
+              This is insecure and should only be used in development environments."
+        );
+    }
+
    // Initialize logging.
    //
    // It must be initialized before the custom panic hook is installed below.
@@ -569,8 +582,10 @@ fn start_pageserver(
        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

    // Scan the local 'tenants/' directory and start loading the tenants
-    let (basebackup_prepare_sender, basebackup_prepare_receiver) =
-        tokio::sync::mpsc::unbounded_channel();
+    let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
+        conf.basebackup_cache_dir(),
+        conf.basebackup_cache_config.clone(),
+    );
    let deletion_queue_client = deletion_queue.new_client();
    let background_purges = mgr::BackgroundPurges::default();

@@ -582,7 +597,7 @@ fn start_pageserver(
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache: Arc::clone(&basebackup_cache),
            feature_resolver: feature_resolver.clone(),
        },
        shutdown_pageserver.clone(),
@@ -590,10 +605,8 @@ fn start_pageserver(
    let tenant_manager = Arc::new(tenant_manager);
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;

-    let basebackup_cache = BasebackupCache::spawn(
+    basebackup_cache.spawn_background_task(
        BACKGROUND_RUNTIME.handle(),
-        conf.basebackup_cache_dir(),
-        conf.basebackup_cache_config.clone(),
        basebackup_prepare_receiver,
        Arc::clone(&tenant_manager),
        shutdown_pageserver.child_token(),
@@ -806,7 +819,6 @@ fn start_pageserver(
        } else {
            None
        },
-        basebackup_cache,
    );

    // Spawn a Pageserver gRPC server task. It will spawn separate tasks for
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,7 +11,7 @@ use std::num::NonZeroUsize;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, bail, ensure};
+use anyhow::{Context, ensure};
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
@@ -22,6 +22,7 @@ use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use pem::Pem;
 use postgres_backend::AuthType;
+use postgres_ffi::PgMajorVersion;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use reqwest::Url;
 use storage_broker::Uri;
@@ -338,20 +339,16 @@ impl PageServerConf {
    //
    // Postgres distribution paths
    //
-    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
-        match pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(path.join(pg_version.v_str()))
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
-    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

@@ -765,4 +762,23 @@ mod tests {
        let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
        assert_eq!(result.is_ok(), is_valid);
    }
+
+    #[test]
+    fn test_config_posthog_config_is_valid() {
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+
+            [posthog_config]
+            server_api_key = "phs_AAA"
+            client_api_key = "phc_BBB"
+            project_id = "000"
+            private_api_url = "https://us.posthog.com"
+            public_api_url = "https://us.i.posthog.com"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("posthogconfig is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect("parse_and_validate");
+    }
 }
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -31,6 +31,13 @@ impl FeatureResolver {
        }
    }

+    pub fn update(&self, spec: String) -> anyhow::Result<()> {
+        if let Some(inner) = &self.inner {
+            inner.update(spec)?;
+        }
+        Ok(())
+    }
+
    pub fn spawn(
        conf: &PageServerConf,
        shutdown_pageserver: CancellationToken,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -41,6 +41,7 @@ use pageserver_api::models::{
    TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
+use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
 use serde_json::json;
@@ -3385,7 +3386,7 @@ async fn put_tenant_timeline_import_basebackup(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
+    let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?;

    check_permission(&request, Some(tenant_id))?;

@@ -3742,6 +3743,20 @@ async fn force_override_feature_flag_for_testing_delete(
    json_response(StatusCode::OK, ())
 }

+async fn update_feature_flag_spec(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let body = json_request(&mut request).await?;
+    let state = get_state(&request);
+    state
+        .feature_resolver
+        .update(body)
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, ())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -4127,5 +4142,8 @@ pub fn make_router(
        .delete("/v1/feature_flag/:flag_key", |r| {
            testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
        })
+        .post("/v1/feature_flag_spec", |r| {
+            api_handler(r, update_feature_flag_spec)
+        })
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -38,6 +38,7 @@ pub mod walredo;

 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
+use postgres_ffi::PgMajorVersion;
 use tenant::mgr::{BackgroundPurges, TenantManager};
 use tenant::secondary;
 use tracing::{info, info_span};
@@ -51,7 +52,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 17;
+pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -4439,6 +4439,14 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_basebackup_cache_prepare_queue_size",
+        "Number of requests in the basebackup prepare channel"
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_config_ignored_items",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -13,7 +13,6 @@ use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

 use anyhow::{Context as _, anyhow, bail};
-use async_compression::tokio::write::GzipEncoder;
 use bytes::{Buf as _, BufMut as _, BytesMut};
 use futures::future::BoxFuture;
 use futures::{FutureExt, Stream};
@@ -63,7 +62,6 @@ use utils::{failpoint_support, span_record};

 use crate::auth::check_permission;
 use crate::basebackup::{self, BasebackupError};
-use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -138,7 +136,6 @@ pub fn spawn(
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
    tls_config: Option<Arc<rustls::ServerConfig>>,
-    basebackup_cache: Arc<BasebackupCache>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -160,7 +157,6 @@ pub fn spawn(
            conf.pg_auth_type,
            tls_config,
            conf.page_service_pipelining.clone(),
-            basebackup_cache,
            libpq_ctx,
            cancel.clone(),
        )
@@ -219,7 +215,6 @@ pub async fn libpq_listener_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
-    basebackup_cache: Arc<BasebackupCache>,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
 ) -> Connections {
@@ -263,7 +258,6 @@ pub async fn libpq_listener_main(
                    auth_type,
                    tls_config.clone(),
                    pipelining_config.clone(),
-                    Arc::clone(&basebackup_cache),
                    connection_ctx,
                    connections_cancel.child_token(),
                    gate_guard,
@@ -306,7 +300,6 @@ async fn page_service_conn_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
-    basebackup_cache: Arc<BasebackupCache>,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
    gate_guard: GateGuard,
@@ -372,7 +365,6 @@ async fn page_service_conn_main(
        pipelining_config,
        conf.get_vectored_concurrent_io,
        perf_span_fields,
-        basebackup_cache,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -426,8 +418,6 @@ struct PageServerHandler {
    pipelining_config: PageServicePipeliningConfig,
    get_vectored_concurrent_io: GetVectoredConcurrentIo,

-    basebackup_cache: Arc<BasebackupCache>,
-
    gate_guard: GateGuard,
 }

@@ -913,7 +903,6 @@ impl PageServerHandler {
        pipelining_config: PageServicePipeliningConfig,
        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
-        basebackup_cache: Arc<BasebackupCache>,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
@@ -927,7 +916,6 @@ impl PageServerHandler {
            cancel,
            pipelining_config,
            get_vectored_concurrent_io,
-            basebackup_cache,
            gate_guard,
        }
    }
@@ -2613,6 +2601,7 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
+                None,
                &ctx,
            )
            .await?;
@@ -2626,9 +2615,7 @@ impl PageServerHandler {
                    && lsn.is_some()
                    && prev_lsn.is_none()
                {
-                    self.basebackup_cache
-                        .get(tenant_id, timeline_id, lsn.unwrap())
-                        .await
+                    timeline.get_cached_basebackup(lsn.unwrap()).await
                } else {
                    None
                }
@@ -2641,31 +2628,6 @@ impl PageServerHandler {
                    .map_err(|err| {
                        BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
                    })?;
-            } else if gzip {
-                let mut encoder = GzipEncoder::with_quality(
-                    &mut writer,
-                    // NOTE using fast compression because it's on the critical path
-                    //      for compute startup. For an empty database, we get
-                    //      <100KB with this method. The Level::Best compression method
-                    //      gives us <20KB, but maybe we should add basebackup caching
-                    //      on compute shutdown first.
-                    async_compression::Level::Fastest,
-                );
-                basebackup::send_basebackup_tarball(
-                    &mut encoder,
-                    &timeline,
-                    lsn,
-                    prev_lsn,
-                    full_backup,
-                    replica,
-                    &ctx,
-                )
-                .await?;
-                // shutdown the encoder to ensure the gzip footer is written
-                encoder
-                    .shutdown()
-                    .await
-                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -2674,6 +2636,11 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    // NB: using fast compression because it's on the critical path for compute
+                    // startup. For an empty database, we get <100KB with this method. The
+                    // Level::Best compression method gives us <20KB, but maybe we should add
+                    // basebackup caching on compute shutdown first.
+                    gzip.then_some(async_compression::Level::Fastest),
                    &ctx,
                )
                .await?;
@@ -3553,7 +3520,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        if timeline.is_archived() == Some(true) {
            return Err(tonic::Status::failed_precondition("timeline is archived"));
        }
-        let req: page_api::GetBaseBackupRequest = req.into_inner().into();
+        let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;

        span_record!(lsn=?req.lsn);

@@ -3579,6 +3546,15 @@ impl proto::PageService for GrpcPageServiceHandler {
        let span = Span::current();
        let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
        let jh = tokio::spawn(async move {
+            let gzip_level = match req.compression {
+                page_api::BaseBackupCompression::None => None,
+                // NB: using fast compression because it's on the critical path for compute
+                // startup. For an empty database, we get <100KB with this method. The
+                // Level::Best compression method gives us <20KB, but maybe we should add
+                // basebackup caching on compute shutdown first.
+                page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
+            };
+
            let result = basebackup::send_basebackup_tarball(
                &mut simplex_write,
                &timeline,
@@ -3586,6 +3562,7 @@ impl proto::PageService for GrpcPageServiceHandler {
                None,
                req.full,
                req.replica,
+                gzip_level,
                &ctx,
            )
            .instrument(span) // propagate request span
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -25,7 +25,7 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::{BLCKSZ, TimestampTz, TransactionId};
+use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
 use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi_types::{Oid, RepOriginId};
 use serde::{Deserialize, Serialize};
@@ -1081,7 +1081,7 @@ impl Timeline {
        // fetch directory entry
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;

-        if self.pg_version >= 17 {
+        if self.pg_version >= PgMajorVersion::PG17 {
            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
        } else {
            Ok(TwoPhaseDirectory::des(&buf)?
@@ -1613,7 +1613,7 @@ impl DatadirModification<'_> {
            .push((DirectoryKind::Db, MetricsUpdate::Set(0)));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

-        let buf = if self.tline.pg_version >= 17 {
+        let buf = if self.tline.pg_version >= PgMajorVersion::PG17 {
            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
                xids: HashSet::new(),
            })
@@ -1967,7 +1967,7 @@ impl DatadirModification<'_> {
    ) -> Result<(), WalIngestError> {
        // Add it to the directory entry
        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= 17 {
+        let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
@@ -2383,7 +2383,7 @@ impl DatadirModification<'_> {
    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= 17 {
+        let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;

            if !dir.xids.remove(&xid) {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -38,6 +38,7 @@ use pageserver_api::models::{
    WalRedoManagerStatus,
 };
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
+use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel};
 use remote_timeline_client::index::GcCompactionState;
 use remote_timeline_client::manifest::{
@@ -79,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
 use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
-use crate::basebackup_cache::BasebackupPrepareSender;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context;
 use crate::context::RequestContextBuilder;
@@ -161,7 +162,7 @@ pub struct TenantSharedResources {
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
-    pub basebackup_prepare_sender: BasebackupPrepareSender,
+    pub basebackup_cache: Arc<BasebackupCache>,
    pub feature_resolver: FeatureResolver,
 }

@@ -330,7 +331,7 @@ pub struct TenantShard {
    deletion_queue_client: DeletionQueueClient,

-    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
+    /// Shared handle to the basebackup cache.
-    basebackup_prepare_sender: BasebackupPrepareSender,
+    basebackup_cache: Arc<BasebackupCache>,

-    /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
+    /// Cached logical sizes updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -497,7 +498,7 @@ impl WalRedoManager {
        lsn: Lsn,
        base_img: Option<(Lsn, bytes::Bytes)>,
        records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        redo_attempt_type: RedoAttemptType,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
@@ -933,7 +934,7 @@ pub(crate) enum CreateTimelineParams {
 pub(crate) struct CreateTimelineParamsBootstrap {
    pub(crate) new_timeline_id: TimelineId,
    pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
-    pub(crate) pg_version: u32,
+    pub(crate) pg_version: PgMajorVersion,
 }

 /// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
@@ -971,7 +972,7 @@ pub(crate) enum CreateTimelineIdempotency {
    /// NB: special treatment, see comment in [`Self`].
    FailWithConflict,
    Bootstrap {
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    },
    /// NB: branches always have the same `pg_version` as their ancestor.
    /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
@@ -1362,7 +1363,7 @@ impl TenantShard {
            remote_storage,
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        } = resources;

@@ -1379,7 +1380,7 @@ impl TenantShard {
            remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        ));

@@ -2541,7 +2542,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        ctx: &RequestContext,
    ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
        anyhow::ensure!(
@@ -2593,7 +2594,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let (uninit_tl, ctx) = self
@@ -2632,7 +2633,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        ctx: &RequestContext,
        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
@@ -2898,7 +2899,7 @@ impl TenantShard {
                    Lsn(0),
                    initdb_lsn,
                    initdb_lsn,
-                    15,
+                    PgMajorVersion::PG15,
                );
                this.prepare_new_timeline(
                    new_timeline_id,
@@ -4379,7 +4380,7 @@ impl TenantShard {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
-        basebackup_prepare_sender: BasebackupPrepareSender,
+        basebackup_cache: Arc<BasebackupCache>,
        feature_resolver: FeatureResolver,
    ) -> TenantShard {
        assert!(!attached_conf.location.generation.is_none());
@@ -4484,7 +4485,7 @@ impl TenantShard {
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        }
    }
@@ -5090,7 +5091,7 @@ impl TenantShard {
    pub(crate) async fn bootstrap_timeline_test(
        self: &Arc<Self>,
        timeline_id: TimelineId,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -5232,7 +5233,7 @@ impl TenantShard {
    async fn bootstrap_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> Result<CreateTimelineResult, CreateTimelineError> {
@@ -5413,7 +5414,7 @@ impl TenantShard {
            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
-            basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
+            basebackup_cache: self.basebackup_cache.clone(),
            feature_resolver: self.feature_resolver.clone(),
        }
    }
@@ -5770,7 +5771,7 @@ impl TenantShard {
 async fn run_initdb(
    conf: &'static PageServerConf,
    initdb_target_dir: &Utf8Path,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
    cancel: &CancellationToken,
 ) -> Result<(), InitdbError> {
    let initdb_bin_path = conf
@@ -5999,7 +6000,7 @@ pub(crate) mod harness {
        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

-            let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
+            let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
@@ -6017,7 +6018,7 @@ pub(crate) mod harness {
                self.deletion_queue.new_client(),
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
-                basebackup_requst_sender,
+                basebackup_cache,
                FeatureResolver::new_disabled(),
            ));

@@ -6051,7 +6052,7 @@ pub(crate) mod harness {
            lsn: Lsn,
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
-            _pg_version: u32,
+            _pg_version: PgMajorVersion,
            _redo_attempt_type: RedoAttemptType,
        ) -> Result<Bytes, walredo::Error> {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
@@ -6223,7 +6224,7 @@ mod tests {
    async fn randomize_timeline(
        tenant: &Arc<TenantShard>,
        new_timeline_id: TimelineId,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        spec: TestTimelineSpecification,
        random: &mut rand::rngs::StdRng,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -18,6 +18,7 @@
 //! [`IndexPart`]: super::remote_timeline_client::index::IndexPart

 use anyhow::ensure;
+use postgres_ffi::PgMajorVersion;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::{BeSer, SerializeError};
 use utils::id::TimelineId;
@@ -136,7 +137,7 @@ struct TimelineMetadataBodyV2 {
    latest_gc_cutoff_lsn: Lsn,

    initdb_lsn: Lsn,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 }

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -167,7 +168,7 @@ impl TimelineMetadata {
        ancestor_lsn: Lsn,
        latest_gc_cutoff_lsn: Lsn,
        initdb_lsn: Lsn,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> Self {
        Self {
            hdr: TimelineMetadataHeader {
@@ -215,7 +216,7 @@ impl TimelineMetadata {
            ancestor_lsn: body.ancestor_lsn,
            latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
            initdb_lsn: body.initdb_lsn,
-            pg_version: 14, // All timelines created before this version had pg_version 14
+            pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
        };

        hdr.format_version = METADATA_FORMAT_VERSION;
@@ -317,7 +318,7 @@ impl TimelineMetadata {
        self.body.initdb_lsn
    }

-    pub fn pg_version(&self) -> u32 {
+    pub fn pg_version(&self) -> PgMajorVersion {
        self.body.pg_version
    }

@@ -331,7 +332,7 @@ impl TimelineMetadata {
            Lsn::from_hex("00000000").unwrap(),
            Lsn::from_hex("00000000").unwrap(),
            Lsn::from_hex("00000000").unwrap(),
-            0,
+            PgMajorVersion::PG14,
        );
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
@@ -545,7 +546,7 @@ mod tests {
            Lsn(0),
            Lsn(0),
            Lsn(0),
-            14, // All timelines created before this version had pg_version 14
+            PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
        );

        assert_eq!(
@@ -565,7 +566,7 @@ mod tests {
            Lsn(0),
            // Updating this version to 17 will cause the test to fail at the
            // next assert_eq!().
-            16,
+            PgMajorVersion::PG16,
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2891,14 +2891,18 @@ mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;

+    use camino::Utf8PathBuf;
    use storage_broker::BrokerClientChannel;
    use tracing::Instrument;

    use super::super::harness::TenantHarness;
    use super::TenantsMap;
-    use crate::tenant::{
-        TenantSharedResources,
-        mgr::{BackgroundPurges, TenantManager, TenantSlot},
+    use crate::{
+        basebackup_cache::BasebackupCache,
+        tenant::{
+            TenantSharedResources,
+            mgr::{BackgroundPurges, TenantManager, TenantSlot},
+        },
    };

    #[tokio::test(start_paused = true)]
@@ -2924,9 +2928,7 @@ mod tests {
        // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
        // permit it to proceed: that will stick the tenant in InProgress

-        let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::<
-            crate::basebackup_cache::BasebackupPrepareRequest,
-        >();
+        let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

        let tenant_manager = TenantManager {
            tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
@@ -2940,7 +2942,7 @@ mod tests {
                l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
                    h.conf.l0_flush.clone(),
                ),
-                basebackup_prepare_sender,
+                basebackup_cache,
                feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
            },
            cancel: tokio_util::sync::CancellationToken::new(),
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -427,8 +427,8 @@ impl GcBlocking {

 #[cfg(test)]
 mod tests {
+    use postgres_ffi::PgMajorVersion;
    use std::str::FromStr;
-
    use utils::id::TimelineId;

    use super::*;
@@ -831,7 +831,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
@@ -893,7 +893,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
@@ -957,7 +957,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1033,7 +1033,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1114,7 +1114,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1199,7 +1199,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1287,7 +1287,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                14,
+                PgMajorVersion::PG14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1622,11 +1622,6 @@ impl DeltaLayerIterator<'_> {
 pub(crate) mod test {
    use std::collections::BTreeMap;

-    use bytes::Bytes;
-    use itertools::MinMaxResult;
-    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
-    use rand::{Rng, RngCore};
-
    use super::*;
    use crate::DEFAULT_PG_VERSION;
    use crate::context::DownloadBehavior;
@@ -1636,6 +1631,11 @@ pub(crate) mod test {
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
    use crate::tenant::{TenantShard, Timeline};
+    use bytes::Bytes;
+    use itertools::MinMaxResult;
+    use postgres_ffi::PgMajorVersion;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::{Rng, RngCore};

-    /// Construct an index for a fictional delta layer and and then
+    /// Construct an index for a fictional delta layer and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -1995,7 +1995,7 @@ pub(crate) mod test {
        let (tenant, ctx) = h.load().await;
        let ctx = &ctx;
        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx)
            .await
            .unwrap();
        let ctx = &ctx.with_scope_timeline(&timeline);
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,6 +1,7 @@
 use std::time::UNIX_EPOCH;

 use pageserver_api::key::{CONTROLFILE_KEY, Key};
+use postgres_ffi::PgMajorVersion;
 use tokio::task::JoinSet;
 use utils::completion::{self, Completion};
 use utils::id::TimelineId;
@@ -45,7 +46,7 @@ async fn smoke_test() {
        .create_test_timeline_with_layers(
            TimelineId::generate(),
            Lsn(0x10),
-            14,
+            PgMajorVersion::PG14,
            &ctx,
            Default::default(), // in-memory layers
            Default::default(),
@@ -256,7 +257,12 @@ async fn evict_and_wait_on_wanted_deleted() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .create_test_timeline(
+            TimelineId::generate(),
+            Lsn(0x10),
+            PgMajorVersion::PG14,
+            &ctx,
+        )
        .await
        .unwrap();

@@ -341,7 +347,12 @@ fn read_wins_pending_eviction() {
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .create_test_timeline(
+                TimelineId::generate(),
+                Lsn(0x10),
+                PgMajorVersion::PG14,
+                &ctx,
+            )
            .await
            .unwrap();
        let ctx = ctx.with_scope_timeline(&timeline);
@@ -474,7 +485,12 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .create_test_timeline(
+                TimelineId::generate(),
+                Lsn(0x10),
+                PgMajorVersion::PG14,
+                &ctx,
+            )
            .await
            .unwrap();
        let ctx = ctx.with_scope_timeline(&timeline);
@@ -644,7 +660,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .create_test_timeline(
+            TimelineId::generate(),
+            Lsn(0x10),
+            PgMajorVersion::PG14,
+            &ctx,
+        )
        .await
        .unwrap();
    let ctx = ctx.with_scope_timeline(&timeline);
@@ -730,7 +751,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

    let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .create_test_timeline(
+            TimelineId::generate(),
+            Lsn(0x10),
+            PgMajorVersion::PG14,
+            &ctx,
+        )
        .await
        .unwrap();
    let ctx = ctx.with_scope_timeline(&timeline);
@@ -836,7 +862,12 @@ async fn eviction_cancellation_on_drop() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .create_test_timeline(
+            TimelineId::generate(),
+            Lsn(0x10),
+            PgMajorVersion::PG14,
+            &ctx,
+        )
        .await
        .unwrap();

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -58,7 +58,7 @@ use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::v14::xlog_utils;
-use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
+use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp};
 use rand::Rng;
 use remote_storage::DownloadError;
 use serde_with::serde_as;
@@ -95,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
 use super::upload_queue::NotInitialized;
 use super::{
-    AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
+    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
 use crate::PERF_TRACE_TARGET;
 use crate::aux_file::AuxFileSizeEstimator;
-use crate::basebackup_cache::BasebackupPrepareRequest;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -201,7 +201,7 @@ pub struct TimelineResources {
    pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
    pub l0_compaction_trigger: Arc<Notify>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
-    pub basebackup_prepare_sender: BasebackupPrepareSender,
+    pub basebackup_cache: Arc<BasebackupCache>,
    pub feature_resolver: FeatureResolver,
 }

@@ -225,7 +225,7 @@ pub struct Timeline {
    /// to shards, and is constant through the lifetime of this Timeline.
    shard_identity: ShardIdentity,

-    pub pg_version: u32,
+    pub pg_version: PgMajorVersion,

    /// The tuple has two elements.
    /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
@@ -448,7 +448,7 @@ pub struct Timeline {
    wait_lsn_log_slow: tokio::sync::Semaphore,

    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
-    basebackup_prepare_sender: BasebackupPrepareSender,
+    basebackup_cache: Arc<BasebackupCache>,

    feature_resolver: FeatureResolver,
 }
@@ -2500,6 +2500,13 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
    }

+    /// Try to get this timeline's basebackup at `lsn` from the on-disk cache; `None` when the cache lookup yields nothing.
+    pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
+        self.basebackup_cache
+            .get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
+            .await
+    }
+
    /// Prepare basebackup for the given LSN and store it in the basebackup cache.
    /// The method is asynchronous and returns immediately.
    /// The actual basebackup preparation is performed in the background
@@ -2521,17 +2528,8 @@ impl Timeline {
            return;
        }

-        let res = self
-            .basebackup_prepare_sender
-            .send(BasebackupPrepareRequest {
-                tenant_shard_id: self.tenant_shard_id,
-                timeline_id: self.timeline_id,
-                lsn,
-            });
-        if let Err(e) = res {
-            // May happen during shutdown, it's not critical.
-            info!("Failed to send shutdown checkpoint: {e:#}");
-        }
+        self.basebackup_cache
+            .send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
    }
 }

@@ -2913,7 +2911,7 @@ impl Timeline {
        shard_identity: ShardIdentity,
        walredo_mgr: Option<Arc<super::WalRedoManager>>,
        resources: TimelineResources,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
@@ -3088,7 +3086,7 @@ impl Timeline {

                wait_lsn_log_slow: tokio::sync::Semaphore::new(1),

-                basebackup_prepare_sender: resources.basebackup_prepare_sender,
+                basebackup_cache: resources.basebackup_cache,

                feature_resolver: resources.feature_resolver,
            };
@@ -7593,6 +7591,7 @@ mod tests {
    use std::sync::Arc;

    use pageserver_api::key::Key;
+    use postgres_ffi::PgMajorVersion;
    use std::iter::Iterator;
    use tracing::Instrument;
    use utils::id::TimelineId;
@@ -7667,7 +7666,7 @@ mod tests {
            .create_test_timeline_with_layers(
                TimelineId::generate(),
                Lsn(0x10),
-                14,
+                PgMajorVersion::PG14,
                &ctx,
                Vec::new(), // in-memory layers
                delta_layers,
@@ -7803,7 +7802,7 @@ mod tests {
            .create_test_timeline_with_layers(
                TimelineId::generate(),
                Lsn(0x10),
-                14,
+                PgMajorVersion::PG14,
                &ctx,
                Vec::new(), // in-memory layers
                delta_layers,
@@ -7863,7 +7862,12 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .create_test_timeline(
+                TimelineId::generate(),
+                Lsn(0x10),
+                PgMajorVersion::PG14,
+                &ctx,
+            )
            .await
            .unwrap();

--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;

 use anyhow::Context;
 use bytes::Bytes;
-use postgres_ffi::ControlFileData;
+use postgres_ffi::{ControlFileData, PgMajorVersion};
 use remote_storage::{
    Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
    ListingObject, RemotePath, RemoteStorageConfig,
@@ -264,7 +264,7 @@ impl ControlFile {
    pub(crate) fn base_lsn(&self) -> Lsn {
        Lsn(self.control_file_data.checkPoint).align()
    }
-    pub(crate) fn pg_version(&self) -> u32 {
+    pub(crate) fn pg_version(&self) -> PgMajorVersion {
        self.try_pg_version()
            .expect("prepare() checks that try_pg_version doesn't error")
    }
@@ -274,13 +274,14 @@ impl ControlFile {
    pub(crate) fn control_file_buf(&self) -> &Bytes {
        &self.control_file_buf
    }
-    fn try_pg_version(&self) -> anyhow::Result<u32> {
+
+    fn try_pg_version(&self) -> anyhow::Result<PgMajorVersion> {
        Ok(match self.control_file_data.catalog_version_no {
            // thesea are from catversion.h
-            202107181 => 14,
-            202209061 => 15,
-            202307071 => 16,
-            202406281 => 17,
+            202107181 => PgMajorVersion::PG14,
+            202209061 => PgMajorVersion::PG15,
+            202307071 => PgMajorVersion::PG16,
+            202406281 => PgMajorVersion::PG17,
            catversion => {
                anyhow::bail!("unrecognized catalog version {catversion}")
            }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
    let copy_stream = replication_client.copy_both_simple(&query).await?;
    let mut physical_stream = pin!(ReplicationStream::new(copy_stream));

-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
-        .await
-        .map_err(|e| match e.kind {
-            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
-            _ => WalReceiverError::Other(e.into()),
-        })?;
+    let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
+    let walingest_res = select! {
+        walingest_res = walingest_future => walingest_res,
+        _ = cancellation.cancelled() => {
+            // We are doing reads in WalIngest::new, and those can hang as they come from the network.
+            // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
+            debug!("Connection cancelled");
+            return Err(WalReceiverError::Cancelled);
+        },
+    };
+    let mut walingest = walingest_res.map_err(|e| match e.kind {
+        crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+        _ => WalReceiverError::Other(e.into()),
+    })?;

    let (format, compression) = match protocol {
        PostgresClientProtocol::Interpreted {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -32,8 +32,8 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::walrecord::*;
 use postgres_ffi::{
-    TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
-    fsm_logical_to_physical, pg_constants,
+    PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
+    enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
 };
 use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use tracing::*;
@@ -781,7 +781,7 @@ impl WalIngest {
    ) -> Result<(), WalIngestError> {
        let (xact_common, is_commit, is_prepared) = match record {
            XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
-                let xid: u64 = if modification.tline.pg_version >= 17 {
+                let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
                    self.adjust_to_full_transaction_id(xl_xid)?
                } else {
                    xl_xid as u64
@@ -886,7 +886,7 @@ impl WalIngest {
                xl_xid, parsed.xid, lsn,
            );

-            let xid: u64 = if modification.tline.pg_version >= 17 {
+            let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
                self.adjust_to_full_transaction_id(parsed.xid)?
            } else {
                parsed.xid as u64
@@ -1241,7 +1241,7 @@ impl WalIngest {
                if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
                    && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
                {
-                    let oldest_active_xid = if pg_version >= 17 {
+                    let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 {
                        let mut oldest_active_full_xid = cp.nextXid.value;
                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
                            if xid < oldest_active_full_xid {
@@ -1475,10 +1475,11 @@ impl WalIngest {

                    const fn rate_limiter(
                        &self,
-                        pg_version: u32,
+                        pg_version: PgMajorVersion,
                    ) -> Option<&Lazy<Mutex<RateLimit>>> {
-                        const MIN_PG_VERSION: u32 = 14;
-                        const MAX_PG_VERSION: u32 = 17;
+                        const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num();
+                        const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num();
+                        let pg_version = pg_version.major_version_num();

                        if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
                            return None;
@@ -1603,6 +1604,7 @@ async fn get_relsize(
 #[cfg(test)]
 mod tests {
    use anyhow::Result;
+    use postgres_ffi::PgMajorVersion;
    use postgres_ffi::RELSEG_SIZE;

    use super::*;
@@ -1625,7 +1627,7 @@ mod tests {

    #[tokio::test]
    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
-        for i in 14..=16 {
+        for i in PgMajorVersion::ALL {
            dispatch_pgversion!(i, {
                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
            });
@@ -2335,7 +2337,7 @@ mod tests {
        // 5. Grep sk logs for "restart decoder" to get startpoint
        // 6. Run just the decoder from this test to get the endpoint.
        //    It's the last LSN the decoder will output.
-        let pg_version = 15; // The test data was generated by pg15
+        let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15
        let path = "test_data/sk_wal_segment_from_pgbench";
        let wal_segment_path = format!("{path}/000000010000000000000001.zst");
        let source_initdb_path = format!("{path}/{INITDB_PATH}");
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -33,6 +33,7 @@ use bytes::{Bytes, BytesMut};
 use pageserver_api::key::Key;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
+use postgres_ffi::PgMajorVersion;
 use tracing::*;
 use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
@@ -165,7 +166,7 @@ impl PostgresRedoManager {
        lsn: Lsn,
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        redo_attempt_type: RedoAttemptType,
    ) -> Result<Bytes, Error> {
        if records.is_empty() {
@@ -232,7 +233,7 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
+    pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> {
        self.do_with_walredo_process(pg_version, |proc| async move {
            proc.ping(Duration::from_secs(1))
                .await
@@ -342,7 +343,7 @@ impl PostgresRedoManager {
        O,
    >(
        &self,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        closure: F,
    ) -> Result<O, Error> {
        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
@@ -442,7 +443,7 @@ impl PostgresRedoManager {
        base_img_lsn: Lsn,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        max_retry_attempts: u32,
    ) -> Result<Bytes, Error> {
        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
@@ -572,6 +573,7 @@ mod tests {
    use bytes::Bytes;
    use pageserver_api::key::Key;
    use pageserver_api::shard::TenantShardId;
+    use postgres_ffi::PgMajorVersion;
    use tracing::Instrument;
    use utils::id::TenantId;
    use utils::lsn::Lsn;
@@ -586,7 +588,7 @@ mod tests {
        let h = RedoHarness::new().unwrap();

        h.manager
-            .ping(14)
+            .ping(PgMajorVersion::PG14)
            .instrument(h.span())
            .await
            .expect("ping should work");
@@ -612,7 +614,7 @@ mod tests {
                Lsn::from_str("0/16E2408").unwrap(),
                None,
                short_records(),
-                14,
+                PgMajorVersion::PG14,
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
@@ -641,7 +643,7 @@ mod tests {
                Lsn::from_str("0/16E2408").unwrap(),
                None,
                short_records(),
-                14,
+                PgMajorVersion::PG14,
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
@@ -663,7 +665,7 @@ mod tests {
                Lsn::INVALID,
                None,
                short_records(),
-                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+                PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -12,7 +12,7 @@ use anyhow::Context;
 use bytes::Bytes;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
-use postgres_ffi::BLCKSZ;
+use postgres_ffi::{BLCKSZ, PgMajorVersion};
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tracing::{Instrument, debug, error, instrument};
 use utils::lsn::Lsn;
@@ -54,11 +54,11 @@ impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
+    #[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))]
    pub(crate) fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Self> {
        crate::span::debug_assert_current_span_has_tenant_id();

--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -1295,7 +1295,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (iteration_hits != 0)
 		{
-			/* chunk offset (# of pages) into the LFC file */
+			/* chunk offset (in number of pages)
+			   into the LFC file */
 			off_t	first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
 			int		nwrite = iov_last_used - first_block_in_chunk_read;
 			/* offset of first IOV */
@@ -1313,16 +1314,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				lfc_disable("read");
 				return -1;
 			}
-
-			/*
-			 * We successfully read the pages we know were valid when we
-			 * started reading; now mark those pages as read
-			 */
-			for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
-			{
-				if (BITMAP_ISSET(chunk_mask, i))
-					BITMAP_SET(mask, buf_offset + i);
-			}
 		}

 		/* Place entry to the head of LRU list */
@@ -1340,6 +1331,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			{
 				lfc_ctl->time_read += io_time_us;
 				inc_page_cache_read_wait(io_time_us);
+				/*
+				 * We successfully read the pages we know were valid when we
+				 * started reading; now mark those pages as read
+				 */
+				for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+				{
+					if (BITMAP_ISSET(chunk_mask, i))
+						BITMAP_SET(mask, buf_offset + i);
+				}
 			}

 			CriticalAssert(entry->access_count > 0);
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -159,14 +159,18 @@ static XLogReaderState *reader_state;
 * If the Linux uAPI headers don't define the system call number,
 * fail the build deliberately rather than ifdef'ing it to ENOSYS.
 * We prefer a compile time over a runtime error for walredo.
- *
- * If, however, we need to build on old systems for development, e.g. Ubuntu 20.04
- * with glibc 2.31, provide a NO_CLOSE_RANGE macro for suboptimal implementation
 */
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <errno.h>

+/* Raw close_range(2): close every fd in [start_fd, end_fd]; returns 0, or -1 with errno set. */
+static int
+close_range_syscall(unsigned int start_fd, unsigned int end_fd, unsigned int flags)
+{
+	return (int) syscall(__NR_close_range, start_fd, end_fd, flags);
+}
+
 static PgSeccompRule allowed_syscalls[] =
 {
 	/* Hard requirements */
@@ -209,30 +213,10 @@ enter_seccomp_mode(void)
 	 * it potentially leaked to us, _before_ we start processing potentially dangerous
 	 * wal records. See the comment in the Rust code that launches this process.
 	 */
-#define START_FD 3
-#define STR(s) #s
-#ifdef __NR_close_range
-	if (syscall(__NR_close_range, START_FD, ~0U, 0) != 0)
+	if (close_range_syscall(3, ~0U, 0) != 0)
 		ereport(FATAL,
 				(errcode(ERRCODE_SYSTEM_ERROR),
-				 errmsg("seccomp: could not close files >= " STR(START_FD))));
-#else
-	// close_range can return EINVAL -- not our case as start_fd and end_fd are hardcoded.
-	// It doesn't return any other errors if CLOSE_RANGE_UNSHARE is not set so we don't
-	// report any errors here.
-	#ifdef NO_CLOSE_RANGE
-		#warning
-			"__NR_close_range is not defined which means you're using kernel < 5.9."
-			"Using suboptimal implementation. Do NOT use this in production"
-		int fd;
-		for (fd = START_FD; fd <= INT_MAX; ++fd)
-			close(fd);
-	#else
-		#error
-			"__NR_close_range is not defined which means you're using kernel < 5.9."
-			"Define NO_CLOSE_RANGE for local development"
-	#endif
-#endif
+				 errmsg("seccomp: could not close files >= fd 3")));

 #ifdef MALLOC_NO_MMAP
 	/* Ask glibc not to use mmap() */
--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -6,7 +6,6 @@ use std::collections::BTreeMap;
 use std::pin::pin;
 use std::sync::Mutex;

-use futures::future::Either;
 use scopeguard::ScopeGuard;
 use tokio::sync::oneshot::error::TryRecvError;

@@ -49,37 +48,67 @@ impl<P: QueueProcessing> BatchQueue<P> {
        }
    }

-    pub async fn call(&self, req: P::Req) -> P::Res {
+    /// Perform a single request-response process, this may be batched internally.
+    ///
+    /// This function is not cancel safe.
+    pub async fn call<R>(
+        &self,
+        req: P::Req,
+        cancelled: impl Future<Output = R>,
+    ) -> Result<P::Res, R> {
        let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
-        let guard = scopeguard::guard(id, move |id| {
-            let mut inner = self.inner.lock_propagate_poison();
-            if inner.queue.remove(&id).is_some() {
-                tracing::debug!("batched task cancelled before completion");
-            }
-        });

+        let mut cancelled = pin!(cancelled);
        let resp = loop {
            // try become the leader, or try wait for success.
-            let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await
-            {
-                // we got the resp.
-                Either::Left((resp, _)) => break resp.ok(),
-                // we are the leader.
-                Either::Right((p, rx_)) => {
-                    rx = rx_;
-                    p
-                }
+            let mut processor = tokio::select! {
+                // try become leader.
+                p = self.processor.lock() => p,
+                // wait for success.
+                resp = &mut rx => break resp.ok(),
+                // wait for cancellation.
+                cancel = cancelled.as_mut() => {
+                    let mut inner = self.inner.lock_propagate_poison();
+                    if inner.queue.remove(&id).is_some() {
+                        tracing::warn!("batched task cancelled before completion");
+                    }
+                    return Err(cancel);
+                },
            };

+            tracing::debug!(id, "batch: became leader");
            let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor);

+            // snitch in case the task gets cancelled.
+            let cancel_safety = scopeguard::guard((), |()| {
+                if !std::thread::panicking() {
+                    tracing::error!(
+                        id,
+                        "batch: leader cancelled, despite not being cancellation safe"
+                    );
+                }
+            });
+
            // apply a batch.
+            // if this is cancelled, jobs will not be completed and will panic.
            let values = processor.apply(reqs).await;

+            // good: we didn't get cancelled.
+            ScopeGuard::into_inner(cancel_safety);
+
+            if values.len() != resps.len() {
+                tracing::error!(
+                    "batch: invalid response size, expected={}, got={}",
+                    resps.len(),
+                    values.len()
+                );
+            }
+
            // send response values.
            for (tx, value) in std::iter::zip(resps, values) {
-                // sender hung up but that's fine.
-                drop(tx.send(value));
+                if tx.send(value).is_err() {
+                    // receiver hung up but that's fine.
+                }
            }

            match rx.try_recv() {
@@ -98,10 +127,9 @@ impl<P: QueueProcessing> BatchQueue<P> {
            }
        };

-        // already removed.
-        ScopeGuard::into_inner(guard);
+        tracing::debug!(id, "batch: job completed");

-        resp.expect("no response found. batch processer should not panic")
+        Ok(resp.expect("no response found. batch processer should not panic"))
    }
 }

@@ -125,6 +153,8 @@ impl<P: QueueProcessing> BatchQueueInner<P> {

        self.queue.insert(id, BatchJob { req, res: tx });

+        tracing::debug!(id, "batch: registered job in the queue");
+
        (id, rx)
    }

@@ -132,15 +162,19 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
        let batch_size = p.batch_size(self.queue.len());
        let mut reqs = Vec::with_capacity(batch_size);
        let mut resps = Vec::with_capacity(batch_size);
+        let mut ids = Vec::with_capacity(batch_size);

        while reqs.len() < batch_size {
-            let Some((_, job)) = self.queue.pop_first() else {
+            let Some((id, job)) = self.queue.pop_first() else {
                break;
            };
            reqs.push(job.req);
            resps.push(job.res);
+            ids.push(id);
        }

+        tracing::debug!(ids=?ids, "batch: acquired jobs");
+
        (reqs, resps)
    }
 }
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -279,7 +279,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
        },
        proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
        handshake_timeout: Duration::from_secs(10),
-        region: "local".into(),
        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -236,7 +236,6 @@ pub(super) async fn task_main(
                        extra: None,
                    },
                    crate::metrics::Protocol::SniRouter,
-                    "sni",
                );
                handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await
            }
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -123,12 +123,6 @@ struct ProxyCliArgs {
    /// timeout for the TLS handshake
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    handshake_timeout: tokio::time::Duration,
-    /// http endpoint to receive periodic metric updates
-    #[clap(long)]
-    metric_collection_endpoint: Option<String>,
-    /// how often metrics should be sent to a collection endpoint
-    #[clap(long)]
-    metric_collection_interval: Option<String>,
    /// cache for `wake_compute` api method (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    wake_compute_cache: String,
@@ -155,40 +149,31 @@ struct ProxyCliArgs {
    /// Wake compute rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
    wake_compute_limit: Vec<RateBucketInfo>,
-    /// Redis rate limiter max number of requests per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
-    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
    #[clap(long, default_value_t = 1024)]
    cancellation_ch_size: usize,
    /// Cancellation ops batch size for redis
    #[clap(long, default_value_t = 8)]
    cancellation_batch_size: usize,
-    /// cache for `allowed_ips` (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
-    allowed_ips_cache: String,
-    /// cache for `role_secret` (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
-    role_secret_cache: String,
-    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
-    #[clap(long)]
-    redis_notifications: Option<String>,
-    /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
+    /// redis url for plain authentication
+    #[clap(long, alias("redis-notifications"))]
+    redis_plain: Option<String>,
+    /// what from the available authentications type to use for redis. Supported are "irsa" and "plain".
    #[clap(long, default_value = "irsa")]
    redis_auth_type: String,
-    /// redis host for streaming connections (might be different from the notifications host)
+    /// redis host for irsa authentication
    #[clap(long)]
    redis_host: Option<String>,
-    /// redis port for streaming connections (might be different from the notifications host)
+    /// redis port for irsa authentication
    #[clap(long)]
    redis_port: Option<u16>,
-    /// redis cluster name, used in aws elasticache
+    /// redis cluster name for irsa authentication
    #[clap(long)]
    redis_cluster_name: Option<String>,
-    /// redis user_id, used in aws elasticache
+    /// redis user_id for irsa authentication
    #[clap(long)]
    redis_user_id: Option<String>,
-    /// aws region to retrieve credentials
+    /// aws region for irsa authentication
    #[clap(long, default_value_t = String::new())]
    aws_region: String,
    /// cache for `project_info` (use `size=0` to disable)
@@ -200,6 +185,12 @@ struct ProxyCliArgs {
    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,

+    /// http endpoint to receive periodic metric updates
+    #[clap(long)]
+    metric_collection_endpoint: Option<String>,
+    /// how often metrics should be sent to a collection endpoint
+    #[clap(long)]
+    metric_collection_interval: Option<String>,
    /// interval for backup metric collection
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    metric_backup_collection_interval: std::time::Duration,
@@ -212,6 +203,7 @@ struct ProxyCliArgs {
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
+
    /// Whether to retry the connection to the compute node
    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
    connect_to_compute_retry: String,
@@ -331,7 +323,7 @@ pub async fn run() -> anyhow::Result<()> {
        Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
    }
    info!("Using region: {}", args.aws_region);
-    let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?;
+    let redis_client = configure_redis(&args).await?;

    // Check that we can bind to address before further initialization
    info!("Starting http on {}", args.http);
@@ -386,13 +378,6 @@ pub async fn run() -> anyhow::Result<()> {

    let cancellation_token = CancellationToken::new();

-    let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
-    RateBucketInfo::validate(redis_rps_limit)?;
-
-    let redis_kv_client = regional_redis_client
-        .as_ref()
-        .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));
-
    let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
@@ -472,6 +457,7 @@ pub async fn run() -> anyhow::Result<()> {
    client_tasks.spawn(crate::context::parquet::worker(
        cancellation_token.clone(),
        args.parquet_upload,
+        args.region,
    ));

    // maintenance tasks. these never return unless there's an error
@@ -495,32 +481,17 @@ pub async fn run() -> anyhow::Result<()> {
    #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))]
    if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend {
        if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
-            match (redis_notifications_client, regional_redis_client.clone()) {
-                (None, None) => {}
-                (client1, client2) => {
-                    let cache = api.caches.project_info.clone();
-                    if let Some(client) = client1 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    if let Some(client) = client2 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-                }
-            }
+            if let Some(client) = redis_client {
+                // project info cache and invalidation of that cache.
+                let cache = api.caches.project_info.clone();
+                maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
+                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });

-            // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
-            // This prevents immediate exit and pod restart,
-            // which can cause hammering of the redis in case of connection issues.
-            if let Some(mut redis_kv_client) = redis_kv_client {
+                // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
+                // This prevents immediate exit and pod restart,
+                // which can cause hammering of the redis in case of connection issues.
+                // cancellation key management
+                let mut redis_kv_client = RedisKVClient::new(client.clone());
                for attempt in (0..3).with_position() {
                    match redis_kv_client.try_connect().await {
                        Ok(()) => {
@@ -545,14 +516,12 @@ pub async fn run() -> anyhow::Result<()> {
                        }
                    }
                }
-            }

-            if let Some(regional_redis_client) = regional_redis_client {
+                // listen for notifications of new projects/endpoints/branches
                let cache = api.caches.endpoints_cache.clone();
-                let con = regional_redis_client;
                let span = tracing::info_span!("endpoints_cache");
                maintenance_tasks.spawn(
-                    async move { cache.do_read(con, cancellation_token.clone()).await }
+                    async move { cache.do_read(client, cancellation_token.clone()).await }
                        .instrument(span),
                );
            }
@@ -681,7 +650,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        authentication_config,
        proxy_protocol_v2: args.proxy_protocol_v2,
        handshake_timeout: args.handshake_timeout,
-        region: args.region.clone(),
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
@@ -843,21 +811,18 @@ fn build_auth_backend(

 async fn configure_redis(
    args: &ProxyCliArgs,
-) -> anyhow::Result<(
-    Option<ConnectionWithCredentialsProvider>,
-    Option<ConnectionWithCredentialsProvider>,
-)> {
+) -> anyhow::Result<Option<ConnectionWithCredentialsProvider>> {
    // TODO: untangle the config args
-    let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
-        ("plain", redis_url) => match redis_url {
+    let redis_client = match &*args.redis_auth_type {
+        "plain" => match &args.redis_plain {
            None => {
-                bail!("plain auth requires redis_notifications to be set");
+                bail!("plain auth requires redis_plain to be set");
            }
            Some(url) => {
                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
            }
        },
-        ("irsa", _) => match (&args.redis_host, args.redis_port) {
+        "irsa" => match (&args.redis_host, args.redis_port) {
            (Some(host), Some(port)) => Some(
                ConnectionWithCredentialsProvider::new_with_credentials_provider(
                    host.clone(),
@@ -881,18 +846,12 @@ async fn configure_redis(
                bail!("redis-host and redis-port must be specified together");
            }
        },
-        _ => {
-            bail!("unknown auth type given");
+        auth_type => {
+            bail!("unknown auth type {auth_type:?} given")
        }
    };

-    let redis_notifications_client = if let Some(url) = &args.redis_notifications {
-        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url))
-    } else {
-        regional_redis_client.clone()
-    };
-
-    Ok((regional_redis_client, redis_notifications_client))
+    Ok(redis_client)
 }

 #[cfg(test)]
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,5 +1,6 @@
 use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
+use std::pin::pin;
 use std::sync::{Arc, OnceLock};
 use std::time::Duration;

@@ -98,7 +99,6 @@ impl Pipeline {

 impl CancelKeyOp {
    fn register(&self, pipe: &mut Pipeline) {
-        #[allow(clippy::used_underscore_binding)]
        match self {
            CancelKeyOp::StoreCancelKey { key, value, expire } => {
                let key = KeyPrefix::Cancel(*key).build_redis_key();
@@ -224,6 +224,7 @@ impl CancellationHandler {
        }
    }

+    /// This is not cancel safe
    async fn get_cancel_key(
        &self,
        key: CancelKeyData,
@@ -240,16 +241,21 @@ impl CancellationHandler {
        };

        const TIMEOUT: Duration = Duration::from_secs(5);
-        let result = timeout(TIMEOUT, tx.call((guard, op)))
-            .await
-            .map_err(|_| {
-                tracing::warn!("timed out waiting to receive GetCancelData response");
-                CancelError::RateLimit
-            })?
-            .map_err(|e| {
-                tracing::warn!("failed to receive GetCancelData response: {e}");
-                CancelError::InternalError
-            })?;
+        let result = timeout(
+            TIMEOUT,
+            tx.call((guard, op), std::future::pending::<Infallible>()),
+        )
+        .await
+        .map_err(|_| {
+            tracing::warn!("timed out waiting to receive GetCancelData response");
+            CancelError::RateLimit
+        })?
+        // cannot be cancelled
+        .unwrap_or_else(|x| match x {})
+        .map_err(|e| {
+            tracing::warn!("failed to receive GetCancelData response: {e}");
+            CancelError::InternalError
+        })?;

        let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| {
            tracing::warn!("failed to receive GetCancelData response: {e}");
@@ -271,6 +277,8 @@ impl CancellationHandler {
    /// Will fetch IP allowlist internally.
    ///
    /// return Result primarily for tests
+    ///
+    /// This is not cancel safe
    pub(crate) async fn cancel_session<T: ControlPlaneApi>(
        &self,
        key: CancelKeyData,
@@ -394,6 +402,8 @@ impl Session {

    /// Ensure the cancel key is continously refreshed,
    /// but stop when the channel is dropped.
+    ///
+    /// This is not cancel safe
    pub(crate) async fn maintain_cancel_key(
        &self,
        session_id: uuid::Uuid,
@@ -401,27 +411,6 @@ impl Session {
        cancel_closure: &CancelClosure,
        compute_config: &ComputeConfig,
    ) {
-        futures::future::select(
-            std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)),
-            cancel,
-        )
-        .await;
-
-        if let Err(err) = cancel_closure
-            .try_cancel_query(compute_config)
-            .boxed()
-            .await
-        {
-            tracing::warn!(
-                ?session_id,
-                ?err,
-                "could not cancel the query in the database"
-            );
-        }
-    }
-
-    // Ensure the cancel key is continously refreshed.
-    async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! {
        let Some(tx) = self.cancellation_handler.tx.get() else {
            tracing::warn!("cancellation handler is not available");
            // don't exit, as we only want to exit if cancelled externally.
@@ -432,6 +421,8 @@ impl Session {
            .expect("serialising to json string should not fail")
            .into_boxed_str();

+        let mut cancel = pin!(cancel);
+
        loop {
            let guard = Metrics::get()
                .proxy
@@ -449,9 +440,44 @@ impl Session {
                "registering cancellation key"
            );

-            if tx.call((guard, op)).await.is_ok() {
-                tokio::time::sleep(CANCEL_KEY_REFRESH).await;
+            match tx.call((guard, op), cancel.as_mut()).await {
+                Ok(Ok(_)) => {
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "registered cancellation key"
+                    );
+
+                    // Wait before refreshing the key again, but race the sleep
+                    // against `cancel`: otherwise a cancellation that arrives while
+                    // we sleep is only noticed on the next `tx.call`, delaying the
+                    // query cancellation below by up to CANCEL_KEY_REFRESH.
+                    let sleep = pin!(tokio::time::sleep(CANCEL_KEY_REFRESH));
+                    if let futures::future::Either::Right(_) =
+                        futures::future::select(sleep, cancel.as_mut()).await
+                    {
+                        break;
+                    }
+                }
+                // retry immediately.
+                Ok(Err(error)) => {
+                    tracing::warn!(?error, "error registering cancellation key");
+                }
+                // `cancel` resolved while the request was still in flight.
+                Err(Err(_cancelled)) => break,
            }
        }
+
+        if let Err(err) = cancel_closure
+            .try_cancel_query(compute_config)
+            .boxed()
+            .await
+        {
+            tracing::warn!(
+                ?session_id,
+                ?err,
+                "could not cancel the query in the database"
+            );
+        }
    }
 }
--- a/proxy/src/compute/mod.rs
+++ b/proxy/src/compute/mod.rs
@@ -6,7 +6,7 @@ use std::net::{IpAddr, SocketAddr};

 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
-use postgres_client::config::{AuthKeys, SslMode};
+use postgres_client::config::{AuthKeys, ChannelBinding, SslMode};
 use postgres_client::maybe_tls_stream::MaybeTlsStream;
 use postgres_client::tls::MakeTlsConnect;
 use postgres_client::{NoTls, RawCancelToken, RawConnection};
@@ -33,12 +33,51 @@ use crate::types::Host;
 pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";

 #[derive(Debug, Error)]
-pub(crate) enum ConnectionError {
+pub(crate) enum PostgresError {
    /// This error doesn't seem to reveal any secrets; for instance,
    /// `postgres_client::error::Kind` doesn't contain ip addresses and such.
    #[error("{COULD_NOT_CONNECT}: {0}")]
    Postgres(#[from] postgres_client::Error),
+}

+impl UserFacingError for PostgresError {
+    fn to_string_client(&self) -> String {
+        match self {
+            // This helps us drop irrelevant library-specific prefixes.
+            // TODO: propagate severity level and other parameters.
+            PostgresError::Postgres(err) => match err.as_db_error() {
+                Some(err) => {
+                    let msg = err.message();
+
+                    if msg.starts_with("unsupported startup parameter: ")
+                        || msg.starts_with("unsupported startup parameter in options: ")
+                    {
+                        format!(
+                            "{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter"
+                        )
+                    } else {
+                        msg.to_owned()
+                    }
+                }
+                None => err.to_string(),
+            },
+        }
+    }
+}
+
+impl ReportableError for PostgresError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            PostgresError::Postgres(e) if e.as_db_error().is_some() => {
+                crate::error::ErrorKind::Postgres
+            }
+            PostgresError::Postgres(_) => crate::error::ErrorKind::Compute,
+        }
+    }
+}
+
+#[derive(Debug, Error)]
+pub(crate) enum ConnectionError {
    #[error("{COULD_NOT_CONNECT}: {0}")]
    TlsError(#[from] TlsError),

@@ -52,22 +91,6 @@ pub(crate) enum ConnectionError {
 impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
        match self {
-            // This helps us drop irrelevant library-specific prefixes.
-            // TODO: propagate severity level and other parameters.
-            ConnectionError::Postgres(err) => match err.as_db_error() {
-                Some(err) => {
-                    let msg = err.message();
-
-                    if msg.starts_with("unsupported startup parameter: ")
-                        || msg.starts_with("unsupported startup parameter in options: ")
-                    {
-                        format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter")
-                    } else {
-                        msg.to_owned()
-                    }
-                }
-                None => err.to_string(),
-            },
            ConnectionError::WakeComputeError(err) => err.to_string_client(),
            ConnectionError::TooManyConnectionAttempts(_) => {
                "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
@@ -80,10 +103,6 @@ impl UserFacingError for ConnectionError {
 impl ReportableError for ConnectionError {
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
-            ConnectionError::Postgres(e) if e.as_db_error().is_some() => {
-                crate::error::ErrorKind::Postgres
-            }
-            ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
            ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
            ConnectionError::WakeComputeError(e) => e.get_error_kind(),
            ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
@@ -110,6 +129,8 @@ pub(crate) struct AuthInfo {
    auth: Option<Auth>,
    server_params: StartupMessageParams,

+    channel_binding: ChannelBinding,
+
    /// Console redirect sets user and database, we shouldn't re-use those from the params.
    skip_db_user: bool,
 }
@@ -133,6 +154,8 @@ impl AuthInfo {
            auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())),
            server_params,
            skip_db_user: true,
+            // pg-sni-router is a mitm so this would fail.
+            channel_binding: ChannelBinding::Disable,
        }
    }

@@ -146,6 +169,7 @@ impl AuthInfo {
            },
            server_params: StartupMessageParams::default(),
            skip_db_user: false,
+            channel_binding: ChannelBinding::Prefer,
        }
    }
 }
@@ -168,6 +192,7 @@ impl AuthInfo {
            Some(Auth::Password(pw)) => config.password(pw),
            None => &mut config,
        };
+        config.channel_binding(self.channel_binding);
        for (k, v) in self.server_params.iter() {
            config.set_param(k, v);
        }
@@ -206,6 +231,56 @@ impl AuthInfo {
            }
        }
    }
+
+    pub async fn authenticate(
+        &self,
+        ctx: &RequestContext,
+        compute: &mut ComputeConnection,
+        user_info: ComputeUserInfo,
+    ) -> Result<PostgresSettings, PostgresError> {
+        // client config with stubbed connect info.
+        // TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely,
+        // utilising pqproto.rs?
+        let mut tmp_config = postgres_client::Config::new(String::new(), 0);
+        // We have already established SSL if necessary.
+        tmp_config.ssl_mode(SslMode::Disable);
+        let tmp_config = self.enrich(tmp_config);
+
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let connection = tmp_config
+            .tls_and_authenticate(&mut compute.stream, NoTls)
+            .await?;
+        drop(pause);
+
+        let RawConnection {
+            stream: _,
+            parameters,
+            delayed_notice,
+            process_id,
+            secret_key,
+        } = connection;
+
+        tracing::Span::current().record("pid", tracing::field::display(process_id));
+
+        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
+        // Yet another reason to rework the connection establishing code.
+        let cancel_closure = CancelClosure::new(
+            compute.socket_addr,
+            RawCancelToken {
+                ssl_mode: compute.ssl_mode,
+                process_id,
+                secret_key,
+            },
+            compute.hostname.to_string(),
+            user_info,
+        );
+
+        Ok(PostgresSettings {
+            params: parameters,
+            cancel_closure,
+            delayed_notice,
+        })
+    }
 }

 impl ConnectInfo {
@@ -268,51 +343,42 @@ impl ConnectInfo {
 pub type RustlsStream = <ComputeConfig as MakeTlsConnect<tokio::net::TcpStream>>::Stream;
 pub type MaybeRustlsStream = MaybeTlsStream<tokio::net::TcpStream, RustlsStream>;

-pub(crate) struct PostgresConnection {
-    /// Socket connected to a compute node.
-    pub(crate) stream: MaybeTlsStream<tokio::net::TcpStream, RustlsStream>,
+// TODO(conrad): we don't need to parse these.
+// These are just immediately forwarded back to the client.
+// We could stream them out instead of reading them into memory.
+pub struct PostgresSettings {
    /// PostgreSQL connection parameters.
-    pub(crate) params: std::collections::HashMap<String, String>,
+    pub params: std::collections::HashMap<String, String>,
    /// Query cancellation token.
-    pub(crate) cancel_closure: CancelClosure,
-    /// Labels for proxy's metrics.
-    pub(crate) aux: MetricsAuxInfo,
+    pub cancel_closure: CancelClosure,
    /// Notices received from compute after authenticating
-    pub(crate) delayed_notice: Vec<NoticeResponseBody>,
+    pub delayed_notice: Vec<NoticeResponseBody>,
+}

-    pub(crate) guage: NumDbConnectionsGuard<'static>,
+pub struct ComputeConnection {
+    /// Socket connected to a compute node.
+    pub stream: MaybeTlsStream<tokio::net::TcpStream, RustlsStream>,
+    /// Labels for proxy's metrics.
+    pub aux: MetricsAuxInfo,
+    pub hostname: Host,
+    pub ssl_mode: SslMode,
+    pub socket_addr: SocketAddr,
+    pub guage: NumDbConnectionsGuard<'static>,
 }

 impl ConnectInfo {
    /// Connect to a corresponding compute node.
-    pub(crate) async fn connect(
+    pub async fn connect(
        &self,
        ctx: &RequestContext,
-        aux: MetricsAuxInfo,
-        auth: &AuthInfo,
+        aux: &MetricsAuxInfo,
        config: &ComputeConfig,
-        user_info: ComputeUserInfo,
-    ) -> Result<PostgresConnection, ConnectionError> {
-        let mut tmp_config = auth.enrich(self.to_postgres_client_config());
-        // we setup SSL early in `ConnectInfo::connect_raw`.
-        tmp_config.ssl_mode(SslMode::Disable);
-
+    ) -> Result<ComputeConnection, ConnectionError> {
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
        let (socket_addr, stream) = self.connect_raw(config).await?;
-        let connection = tmp_config.connect_raw(stream, NoTls).await?;
        drop(pause);

-        let RawConnection {
-            stream,
-            parameters,
-            delayed_notice,
-            process_id,
-            secret_key,
-        } = connection;
-
-        tracing::Span::current().record("pid", tracing::field::display(process_id));
        tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id));
-        let MaybeTlsStream::Raw(stream) = stream.into_inner();

        // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
        info!(
@@ -324,25 +390,12 @@ impl ConnectInfo {
            ctx.get_testodrome_id().unwrap_or_default(),
        );

-        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
-        // Yet another reason to rework the connection establishing code.
-        let cancel_closure = CancelClosure::new(
-            socket_addr,
-            RawCancelToken {
-                ssl_mode: self.ssl_mode,
-                process_id,
-                secret_key,
-            },
-            self.host.to_string(),
-            user_info,
-        );
-
-        let connection = PostgresConnection {
+        let connection = ComputeConnection {
            stream,
-            params: parameters,
-            delayed_notice,
-            cancel_closure,
-            aux,
+            socket_addr,
+            hostname: self.host.clone(),
+            ssl_mode: self.ssl_mode,
+            aux: aux.clone(),
            guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
        };

--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -22,7 +22,6 @@ pub struct ProxyConfig {
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
    pub proxy_protocol_v2: ProxyProtocolV2,
-    pub region: String,
    pub handshake_timeout: Duration,
    pub wake_compute_retry_config: RetryConfig,
    pub connect_compute_locks: ApiLocks<Host>,
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -89,12 +89,7 @@ pub async fn task_main(
                }
            }

-            let ctx = RequestContext::new(
-                session_id,
-                conn_info,
-                crate::metrics::Protocol::Tcp,
-                &config.region,
-            );
+            let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp);

            let res = handle_client(
                config,
@@ -218,11 +213,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
    };
    auth_info.set_startup_params(&params, true);

-    let node = connect_to_compute(
+    let mut node = connect_to_compute(
        ctx,
        &TcpMechanism {
-            user_info,
-            auth: auth_info,
            locks: &config.connect_compute_locks,
        },
        &node_info,
@@ -232,9 +225,14 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
    .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
    .await?;

+    let pg_settings = auth_info
+        .authenticate(ctx, &mut node, user_info)
+        .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
+        .await?;
+
    let session = cancellation_handler.get_key();

-    prepare_client_connection(&node, *session.key(), &mut stream);
+    prepare_client_connection(&pg_settings, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

    let session_id = ctx.session_id();
@@ -244,7 +242,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
            .maintain_cancel_key(
                session_id,
                cancel,
-                &node.cancel_closure,
+                &pg_settings.cancel_closure,
                &config.connect_to_compute,
            )
            .await;
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -46,7 +46,6 @@ struct RequestContextInner {
    pub(crate) session_id: Uuid,
    pub(crate) protocol: Protocol,
    first_packet: chrono::DateTime<Utc>,
-    region: &'static str,
    pub(crate) span: Span,

    // filled in as they are discovered
@@ -94,7 +93,6 @@ impl Clone for RequestContext {
            session_id: inner.session_id,
            protocol: inner.protocol,
            first_packet: inner.first_packet,
-            region: inner.region,
            span: info_span!("background_task"),

            project: inner.project,
@@ -124,12 +122,7 @@ impl Clone for RequestContext {
 }

 impl RequestContext {
-    pub fn new(
-        session_id: Uuid,
-        conn_info: ConnectionInfo,
-        protocol: Protocol,
-        region: &'static str,
-    ) -> Self {
+    pub fn new(session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol) -> Self {
        // TODO: be careful with long lived spans
        let span = info_span!(
            "connect_request",
@@ -145,7 +138,6 @@ impl RequestContext {
            session_id,
            protocol,
            first_packet: Utc::now(),
-            region,
            span,

            project: None,
@@ -179,7 +171,7 @@ impl RequestContext {
        let ip = IpAddr::from([127, 0, 0, 1]);
        let addr = SocketAddr::new(ip, 5432);
        let conn_info = ConnectionInfo { addr, extra: None };
-        RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
+        RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp)
    }

    pub(crate) fn console_application_name(&self) -> String {
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;

 #[derive(parquet_derive::ParquetRecordWriter)]
 pub(crate) struct RequestData {
-    region: &'static str,
+    region: String,
    protocol: &'static str,
    /// Must be UTC. The derive macro doesn't like the timezones
    timestamp: chrono::NaiveDateTime,
@@ -147,7 +147,7 @@ impl From<&RequestContextInner> for RequestData {
            }),
            jwt_issuer: value.jwt_issuer.clone(),
            protocol: value.protocol.as_str(),
-            region: value.region,
+            region: String::new(),
            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
            success: value.success,
            cold_start_info: value.cold_start_info.as_str(),
@@ -167,6 +167,7 @@ impl From<&RequestContextInner> for RequestData {
 pub async fn worker(
    cancellation_token: CancellationToken,
    config: ParquetUploadArgs,
+    region: String,
 ) -> anyhow::Result<()> {
    let Some(remote_storage_config) = config.parquet_upload_remote_storage else {
        tracing::warn!("parquet request upload: no s3 bucket configured");
@@ -232,12 +233,17 @@ pub async fn worker(
                .context("remote storage for disconnect events init")?;
        let parquet_config_disconnect = parquet_config.clone();
        tokio::try_join!(
-            worker_inner(storage, rx, parquet_config),
-            worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
+            worker_inner(storage, rx, parquet_config, &region),
+            worker_inner(
+                storage_disconnect,
+                rx_disconnect,
+                parquet_config_disconnect,
+                &region
+            )
        )
        .map(|_| ())
    } else {
-        worker_inner(storage, rx, parquet_config).await
+        worker_inner(storage, rx, parquet_config, &region).await
    }
 }

@@ -257,6 +263,7 @@ async fn worker_inner(
    storage: GenericRemoteStorage,
    rx: impl Stream<Item = RequestData>,
    config: ParquetConfig,
+    region: &str,
 ) -> anyhow::Result<()> {
    #[cfg(any(test, feature = "testing"))]
    let storage = if config.test_remote_failures > 0 {
@@ -277,7 +284,8 @@ async fn worker_inner(
    let mut last_upload = time::Instant::now();

    let mut len = 0;
-    while let Some(row) = rx.next().await {
+    while let Some(mut row) = rx.next().await {
+        region.clone_into(&mut row.region);
        rows.push(row);
        let force = last_upload.elapsed() > config.max_duration;
        if rows.len() == config.rows_per_group || force {
@@ -533,7 +541,7 @@ mod tests {
            auth_method: None,
            jwt_issuer: None,
            protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
-            region: "us-east-1",
+            region: String::new(),
            error: None,
            success: rng.r#gen(),
            cold_start_info: "no",
@@ -565,7 +573,9 @@ mod tests {
            .await
            .unwrap();

-        worker_inner(storage, rx, config).await.unwrap();
+        worker_inner(storage, rx, config, "us-east-1")
+            .await
+            .unwrap();

        let mut files = WalkDir::new(tmpdir.as_std_path())
            .into_iter()
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -76,13 +76,9 @@ impl NodeInfo {
    pub(crate) async fn connect(
        &self,
        ctx: &RequestContext,
-        auth: &compute::AuthInfo,
        config: &ComputeConfig,
-        user_info: ComputeUserInfo,
-    ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
-        self.conn_info
-            .connect(ctx, self.aux.clone(), auth, config, user_info)
-            .await
+    ) -> Result<compute::ComputeConnection, compute::ConnectionError> {
+        self.conn_info.connect(ctx, &self.aux, config).await
    }
 }

--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -2,8 +2,7 @@ use async_trait::async_trait;
 use tokio::time;
 use tracing::{debug, info, warn};

-use crate::auth::backend::ComputeUserInfo;
-use crate::compute::{self, AuthInfo, COULD_NOT_CONNECT, PostgresConnection};
+use crate::compute::{self, COULD_NOT_CONNECT, ComputeConnection};
 use crate::config::{ComputeConfig, RetryConfig};
 use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
@@ -50,15 +49,13 @@ pub(crate) trait ConnectMechanism {
 }

 pub(crate) struct TcpMechanism {
-    pub(crate) auth: AuthInfo,
    /// connect_to_compute concurrency lock
    pub(crate) locks: &'static ApiLocks<Host>,
-    pub(crate) user_info: ComputeUserInfo,
 }

 #[async_trait]
 impl ConnectMechanism for TcpMechanism {
-    type Connection = PostgresConnection;
+    type Connection = ComputeConnection;
    type ConnectError = compute::ConnectionError;
    type Error = compute::ConnectionError;

@@ -71,13 +68,9 @@ impl ConnectMechanism for TcpMechanism {
        ctx: &RequestContext,
        node_info: &control_plane::CachedNodeInfo,
        config: &ComputeConfig,
-    ) -> Result<PostgresConnection, Self::Error> {
+    ) -> Result<ComputeConnection, Self::Error> {
        let permit = self.locks.get_permit(&node_info.conn_info.host).await?;
-        permit.release_result(
-            node_info
-                .connect(ctx, &self.auth, config, self.user_info.clone())
-                .await,
-        )
+        permit.release_result(node_info.connect(ctx, config).await)
    }
 }

--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -122,12 +122,7 @@ pub async fn task_main(
                }
            }

-            let ctx = RequestContext::new(
-                session_id,
-                conn_info,
-                crate::metrics::Protocol::Tcp,
-                &config.region,
-            );
+            let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp);

            let res = handle_client(
                config,
@@ -357,24 +352,28 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
    let res = connect_to_compute(
        ctx,
        &TcpMechanism {
-            user_info: creds.info.clone(),
-            auth: auth_info,
            locks: &config.connect_compute_locks,
        },
-        &auth::Backend::ControlPlane(cplane, creds.info),
+        &auth::Backend::ControlPlane(cplane, creds.info.clone()),
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
    .await;

-    let node = match res {
+    let mut node = match res {
        Ok(node) => node,
        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
    };

+    let pg_settings = auth_info.authenticate(ctx, &mut node, creds.info).await;
+    let pg_settings = match pg_settings {
+        Ok(pg_settings) => pg_settings,
+        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
+    };
+
    let session = cancellation_handler.get_key();

-    prepare_client_connection(&node, *session.key(), &mut stream);
+    prepare_client_connection(&pg_settings, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

    let session_id = ctx.session_id();
@@ -384,7 +383,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
            .maintain_cancel_key(
                session_id,
                cancel,
-                &node.cancel_closure,
+                &pg_settings.cancel_closure,
                &config.connect_to_compute,
            )
            .await;
@@ -413,19 +412,19 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(

 /// Finish client connection initialization: confirm auth success, send params, etc.
 pub(crate) fn prepare_client_connection(
-    node: &compute::PostgresConnection,
+    settings: &compute::PostgresSettings,
    cancel_key_data: CancelKeyData,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) {
    // Forward all deferred notices to the client.
-    for notice in &node.delayed_notice {
+    for notice in &settings.delayed_notice {
        stream.write_raw(notice.as_bytes().len(), b'N', |buf| {
            buf.extend_from_slice(notice.as_bytes());
        });
    }

    // Forward all postgres connection params to the client.
-    for (name, value) in &node.params {
+    for (name, value) in &settings.params {
        stream.write_message(BeMessage::ParameterStatus {
            name: name.as_bytes(),
            value: value.as_bytes(),
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -99,7 +99,6 @@ impl ShouldRetryWakeCompute for postgres_client::Error {
 impl CouldRetry for compute::ConnectionError {
    fn could_retry(&self) -> bool {
        match self {
-            compute::ConnectionError::Postgres(err) => err.could_retry(),
            compute::ConnectionError::TlsError(err) => err.could_retry(),
            compute::ConnectionError::WakeComputeError(err) => err.could_retry(),
            compute::ConnectionError::TooManyConnectionAttempts(_) => false,
@@ -109,7 +108,6 @@ impl CouldRetry for compute::ConnectionError {
 impl ShouldRetryWakeCompute for compute::ConnectionError {
    fn should_retry_wake_compute(&self) -> bool {
        match self {
-            compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(),
            // the cache entry was not checked for validity
            compute::ConnectionError::TooManyConnectionAttempts(_) => false,
            _ => true,
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -169,7 +169,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
        .dbname("db")
        .password("password")
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await?;

    proxy.await?
@@ -252,7 +252,7 @@ async fn connect_failure(
        .dbname("db")
        .password("password")
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await
        .err()
        .context("client shouldn't be able to connect")?;
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -199,7 +199,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
        .user("john_doe")
        .dbname("earth")
        .ssl_mode(SslMode::Disable)
-        .connect_raw(server, NoTls)
+        .tls_and_authenticate(server, NoTls)
        .await
        .err() // -> Option<E>
        .context("client shouldn't be able to connect")?;
@@ -228,7 +228,7 @@ async fn handshake_tls() -> anyhow::Result<()> {
        .user("john_doe")
        .dbname("earth")
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await?;

    proxy.await?
@@ -245,7 +245,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
        .dbname("earth")
        .set_param("options", "project=generic-project-name")
        .ssl_mode(SslMode::Prefer)
-        .connect_raw(server, NoTls)
+        .tls_and_authenticate(server, NoTls)
        .await?;

    proxy.await?
@@ -293,7 +293,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
        .dbname("db")
        .password(password)
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await?;

    proxy.await?
@@ -317,7 +317,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
        .dbname("db")
        .password("password")
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await?;

    proxy.await?
@@ -344,7 +344,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
        .dbname("db")
        .password(&password) // no password will match the mocked secret
        .ssl_mode(SslMode::Require)
-        .connect_raw(server, client_config.make_tls_connect()?)
+        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await
        .err() // -> Option<E>
        .context("client shouldn't be able to connect")?;
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -139,12 +139,6 @@ impl RateBucketInfo {
        Self::new(200, Duration::from_secs(600)),
    ];

-    // For all the sessions will be cancel key. So this limit is essentially global proxy limit.
-    pub const DEFAULT_REDIS_SET: [Self; 2] = [
-        Self::new(100_000, Duration::from_secs(1)),
-        Self::new(50_000, Duration::from_secs(10)),
-    ];
-
    pub fn rps(&self) -> f64 {
        (self.max_rpi as f64) / self.interval.as_secs_f64()
    }
--- a/proxy/src/redis/keys.rs
+++ b/proxy/src/redis/keys.rs
@@ -23,9 +23,8 @@ impl KeyPrefix {

 #[cfg(test)]
 mod tests {
-    use crate::pqproto::id_to_cancel_key;
-
    use super::*;
+    use crate::pqproto::id_to_cancel_key;

    #[test]
    fn test_build_redis_key() {
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -5,11 +5,9 @@ use redis::aio::ConnectionLike;
 use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};

 pub struct RedisKVClient {
    client: ConnectionWithCredentialsProvider,
-    limiter: GlobalRateLimiter,
 }

 #[allow(async_fn_in_trait)]
@@ -30,11 +28,8 @@ impl Queryable for Cmd {
 }

 impl RedisKVClient {
-    pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
-        Self {
-            client,
-            limiter: GlobalRateLimiter::new(info.into()),
-        }
+    pub fn new(client: ConnectionWithCredentialsProvider) -> Self {
+        Self { client }
    }

    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
@@ -49,11 +44,6 @@ impl RedisKVClient {
        &mut self,
        q: &impl Queryable,
    ) -> anyhow::Result<T> {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping query");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
        let e = match q.query(&mut self.client).await {
            Ok(t) => return Ok(t),
            Err(e) => e,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Spray	0618845bbb	Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517	2025-06-26 07:32:08 -07:00
Dmitrii Kovalkov	605fb04f89	pageserver: use bounded sender for basebackup cache (#12342 ) ## Problem Basebackup cache now uses unbounded channel for prepare requests. In theory it can grow large if the cache is hung and does not process the requests. - Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Replace an unbounded channel with a bounded one, the size is configurable. - Add `pageserver_basebackup_cache_prepare_queue_size` to observe the size of the queue. - Refactor a bit to move all metrics logic to `basebackup_cache.rs`	2025-06-26 13:26:24 +00:00
Conrad Ludgate	fd1e8ec257	[proxy] review and cleanup CLI args (#12167 ) I was looking at how we could expose our proxy config as toml again, and as I was writing out the schema format, I noticed some cruft in our CLI args that no longer seem to be in use. The redis change is the most complex, but I am pretty sure it's sound. Since https://github.com/neondatabase/cloud/pull/15613 cplane no longer publishes to the global redis instance.	2025-06-26 11:25:41 +00:00
Konstantin Knizhnik	be23eae3b6	Mark pages as available in LFC only after generation check (#12350 ) ## Problem If LFC generation is changed then `lfc_readv_select` will return -1 but pages are still marked as available in bitmap. ## Summary of changes Update bitmap after generation check. Co-authored-by: Kosntantin Knizhnik <konstantin.knizhnik@databricks.com>	2025-06-26 07:06:27 +00:00
Alex Chi Z.	6f70885e11	fix(pageserver): allow refresh_interval to be empty (#12349 ) ## Problem Fix for https://github.com/neondatabase/neon/pull/12324 ## Summary of changes Need `serde(default)` to allow this field not present in the config, otherwise there will be a config deserialization error. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-06-25 22:15:03 +00:00
Erik Grinaker	f755979102	pageserver: payload compression for gRPC base backups (#12346 ) ## Problem gRPC base backups use gRPC compression. However, this has two problems: * Base backup caching will cache compressed base backups (making gRPC compression pointless). * Tonic does not support varying the compression level, and zstd default level is 10% slower than gzip fastest level. Touches https://github.com/neondatabase/neon/issues/11728. Touches https://github.com/neondatabase/cloud/issues/29353. ## Summary of changes This patch adds a gRPC parameter `BaseBackupRequest::compression` specifying the compression algorithm. It also moves compression into `send_basebackup_tarball` to reduce code duplication. A follow-up PR will integrate the base backup cache with gRPC.	2025-06-25 18:16:23 +00:00
Matthias van de Meent	1d49eefbbb	RFC: Endpoint Persistent Unlogged Files Storage (#9661 ) ## Summary A design for a storage system that allows storage of files required to make Neon's Endpoints have a better experience at or after a reboot. ## Motivation Several systems inside PostgreSQL (and Neon) need some persistent storage for optimal workings across reboots and restarts, but still work without. Examples are the cumulative statistics file in `pg_stat/global.stat`, `pg_stat_statements`' `pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. We need a storage system that can store and manage these files for each Endpoint. [GH rendered file](https://github.com/neondatabase/neon/blob/MMeent/rfc-unlogged-file/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md) Part of https://github.com/neondatabase/cloud/issues/24225	2025-06-25 16:25:57 +00:00
Alex Chi Z.	6c77638ea1	feat(storcon): retrieve feature flag and pass to pageservers (#12324 ) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes It costs $$$ to directly retrieve the feature flags from the pageserver. Therefore, this patch adds new APIs to retrieve the spec from the storcon and updates it via pageserver. * Storcon retrieves the feature flag and send it to the pageservers. * If the feature flag gets updated outside of the normal refresh loop of the pageserver, pageserver won't fetch the flags on its own as long as the last updated time <= refresh_period. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-06-25 14:58:18 +00:00
Conrad Ludgate	517a3d0d86	[proxy]: BatchQueue::call is not cancel safe - make it directly cancellation aware (#12345 ) ## Problem https://github.com/neondatabase/cloud/issues/30539 If the current leader cancels the `call` function, then it has removed the jobs from the queue, but will never finish sending the responses. Because of this, it is not cancellation safe. ## Summary of changes Document these functions as not cancellation safe. Move cancellation of the queued jobs into the queue itself. ## Alternatives considered 1. We could spawn the task that runs the batch, since that won't get cancelled. * This requires `fn call(self: Arc<Self>)` or `fn call(&'static self)`. 2. We could add another scopeguard and return the requests back to the queue. * This requires that requests are always retry safe, and also requires requests to be `Clone`.	2025-06-25 14:19:20 +00:00
Conrad Ludgate	27ca1e21be	[console_redirect_proxy]: fix channel binding (#12238 ) ## Problem While working more on TLS to compute, I realised that Console Redirect -> pg-sni-router -> compute would break if channel binding was set to prefer. This is because the channel binding data would differ between Console Redirect -> pg-sni-router vs pg-sni-router -> compute. I also noticed that I actually disabled channel binding in #12145, since `connect_raw` would think that the connection didn't support TLS. ## Summary of changes Make sure we specify the channel binding. Make sure that `connect_raw` can see if we have TLS support.	2025-06-25 13:41:30 +00:00
Arpad Müller	1dc01c9bed	Support cancellations of timelines with hanging ondemand downloads (#12330 ) In `test_layer_download_cancelled_by_config_location`, we simulate hung downloads via the `before-downloading-layer-stream-pausable` failpoint. Then, we cancel a timeline via the `location_config` endpoint. With the new default as of https://github.com/neondatabase/neon/pull/11712, we would be creating the timeline on safekeepers regardless if there have been writes or not, and it turns out the test relied on the timeline not existing on safekeepers, due to a cancellation bug: * as established before, the test makes the read path hang * the timeline cancellation function first cancels the walreceiver, and only then cancels the timeline's token * `WalIngest::new` is requesting a checkpoint, which hits the read path * at cancellation time, we'd be hanging inside the read, not seeing the cancellation of the walreceiver * the test would time out due to the hang This is probably also reproducible in the wild when there are S3 unavailabilities or bottlenecks. So we thought that it's worthwhile to fix the hang issue. The approach chosen in the end involves the `tokio::select` macro. In PR 11712, we originally punted on the test due to the hang and opted it out from the new default, but now we can use the new default. Part of https://github.com/neondatabase/neon/issues/12299	2025-06-25 13:40:38 +00:00
Heikki Linnakangas	7c4c36f5ac	Remove unnecessary separate installation of libpq (#12287 ) `make install` compiles and installs libpq. Remove redundant separate step to compile and install it.	2025-06-25 10:47:56 +00:00
Tristan Partin	a2d623696c	Update pgaudit to latest versions (#12328 ) These updates contain some bug fixes and are completely backwards compatible with what we currently support in Neon. Link: https://github.com/pgaudit/pgaudit/compare/1.6.2...1.6.3 Link: https://github.com/pgaudit/pgaudit/compare/1.7.0...1.7.1 Link: https://github.com/pgaudit/pgaudit/compare/16.0...16.1 Link: https://github.com/pgaudit/pgaudit/compare/17.0...17.1 Signed-off-by: Tristan Partin <tristan.partin@databricks.com> Signed-off-by: Tristan Partin <tristan.partin@databricks.com>	2025-06-25 09:03:02 +00:00
Tristan Partin	aa75722010	Set pgaudit.log=none for monitoring connections (#12137 ) pgaudit can spam logs due to all the monitoring that we do. Logs from these connections are not necessary for HIPAA compliance, so we can stop logging from those connections. Part-of: https://github.com/neondatabase/cloud/issues/29574 Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-06-24 17:42:23 +00:00
Matthias van de Meent	6c6de6382a	Use enum-typed PG versions (#12317 ) This makes it possible for the compiler to validate that a match block matched all PostgreSQL versions we support. ## Problem We did not have a complete picture about which places we had to test against PG versions, and what format these versions were: The full PG version ID format (Major/minor/bugfix `MMmmbb`) as transferred in protocol messages, or only the Major release version (`MM`). This meant type confusion was rampant. With this change, it becomes easier to develop new version-dependent features, by making type and niche confusion impossible. ## Summary of changes Every use of `pg_version` is now typed as either `PgVersionId` (u32, valued in decimal `MMmmbb`) or PgMajorVersion (an enum, with a value for every major version we support, serialized and stored like a u32 with the value of that major version) --------- Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2025-06-24 17:25:31 +00:00
Dmitry Savelev	158d84ea30	Switch the billing metrics storage format to ndjson. (#12338 ) ## Problem The billing team wants to change the billing events pipeline and use a common events format in S3 buckets across different event producers. ## Summary of changes Change the events storage format for billing events from JSON to NDJSON. Resolves: https://github.com/neondatabase/cloud/issues/29994	2025-06-24 15:36:36 +00:00
Conrad Ludgate	4dd9ca7b04	[proxy]: authenticate to compute after connect_to_compute (#12335 ) ## Problem PGLB will do the connect_to_compute logic, neonkeeper will do the session establishment logic. We should split it. ## Summary of changes Moves postgres authentication to compute to a separate routine that happens after connect_to_compute.	2025-06-24 14:15:36 +00:00
John Spray	e8c39d260a	Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517	2025-06-05 13:07:03 +01:00
Dmitrii Kovalkov	b7050ddc5f	Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517	2025-06-03 12:56:51 +04:00
Dmitrii Kovalkov	aeb1b6fd61	Add grpc_auth_type check to pageserver	2025-06-03 10:55:29 +02:00
John Spray	357362a998	Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517	2025-06-02 11:43:31 +01:00
John Spray	105076f12b	tests: update test_safekeeper_without_pageserver for --dev	2025-05-30 22:49:11 +02:00
John Spray	9f0538ff86	tests: use dev mode	2025-05-30 16:22:22 +02:00
John Spray	0a74ed6f9e	safekeeper: make auth mandatory unless dev mode	2025-05-30 16:21:37 +02:00
John Spray	52937ca78e	pageserver: make auth mandatory unless dev mode	2025-05-30 16:21:37 +02:00