Merge remote-tracking branch 'origin/main' into HEAD

2026-01-08 05:52:55 +00:00 · 2025-07-20 00:58:57 +03:00
parent cb50291dcd 791b5d736b
commit e2c3c2eccb
70 changed files with 2396 additions and 1148 deletions
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,16 +46,33 @@ pub struct ExtensionInstallResponse {
    pub version: ExtVersion,
 }

+/// Status of the LFC prewarm process. The same state machine is reused for
+/// both autoprewarm (prewarm after compute/Postgres start using the previously
+/// stored LFC state) and explicit prewarming via API.
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
+    /// Default value when compute boots up.
    #[default]
    NotPrewarmed,
+    /// Prewarming thread is active and loading pages into LFC.
    Prewarming,
+    /// We found the requested LFC state in the endpoint storage and
+    /// completed prewarming successfully.
    Completed,
-    Failed {
-        error: String,
-    },
+    /// Unexpected error happened during prewarming. Note, `Not Found 404`
+    /// response from the endpoint storage is explicitly excluded here
+    /// because it can normally happen on the first compute start,
+    /// since LFC state is not available yet.
+    Failed { error: String },
+    /// We tried to fetch the corresponding LFC state from the endpoint storage,
+    /// but received `Not Found 404`. This should normally happen only during the
+    /// first endpoint start after creation with `autoprewarm: true`.
+    ///
+    /// During the orchestrated prewarm via API, when a caller explicitly
+    /// provides the LFC state key to prewarm from, it's the caller's responsibility
+    /// to handle this status as an error state.
+    Skipped,
 }

 impl Display for LfcPrewarmState {
@@ -64,6 +81,7 @@ impl Display for LfcPrewarmState {
            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
            LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Skipped => f.write_str("Skipped"),
            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
        }
    }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -14,6 +14,7 @@ use serde::{Deserialize, Serialize};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
+use utils::shard::{ShardCount, ShardIndex};

 use crate::responses::TlsConfig;

@@ -106,11 +107,18 @@ pub struct ComputeSpec {
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,

-    // Pageserver information can be passed in two different ways:
-    // 1. Here
-    // 2. in cluster.settings. This is legacy, we are switching to method 1.
+    /// Pageserver information can be passed in three different ways:
+    /// 1. Here in `pageserver_connection_info`
+    /// 2. In the `pageserver_connstring` field.
+    /// 3. In `cluster.settings`.
+    ///
+    /// The goal is to use method 1. everywhere. But for backwards-compatibility with old
+    /// versions of the control plane, `compute_ctl` will check 2. and 3. if the
+    /// `pageserver_connection_info` field is missing.
    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

+    pub pageserver_connstring: Option<String>,
+
    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
    pub project_id: Option<String>,
@@ -145,7 +153,7 @@ pub struct ComputeSpec {

    // Stripe size for pageserver sharding, in pages
    #[serde(default)]
-    pub shard_stripe_size: Option<usize>,
+    pub shard_stripe_size: Option<u32>,

    /// Local Proxy configuration used for JWT authentication
    #[serde(default)]
@@ -218,16 +226,28 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

-/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
-#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
 pub struct PageserverConnectionInfo {
-    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+    /// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
+    pub shard_count: ShardCount,

-    pub prefer_grpc: bool,
+    /// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
+    pub stripe_size: Option<u32>,
+
+    pub shards: HashMap<ShardIndex, PageserverShardInfo>,
+
+    #[serde(default)]
+    pub prefer_protocol: PageserverProtocol,
 }

-#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardInfo {
+    pub pageservers: Vec<PageserverShardConnectionInfo>,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
 pub struct PageserverShardConnectionInfo {
+    pub id: Option<String>,
    pub libpq_url: Option<String>,
    pub grpc_url: Option<String>,
 }
@@ -465,13 +485,15 @@ pub struct JwksSettings {
    pub jwt_audience: Option<String>,
 }

-/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
-#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+/// Protocol used to connect to a Pageserver.
+#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
 pub enum PageserverProtocol {
    /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
    #[default]
+    #[serde(rename = "libpq")]
    Libpq,
    /// A newer, gRPC-based protocol. Uses grpc:// scheme.
+    #[serde(rename = "grpc")]
    Grpc,
 }

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -4,12 +4,14 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]

+use std::sync::RwLock;
+
 use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
 use measured::metric::counter::CounterState;
 use measured::metric::gauge::GaugeState;
 use measured::metric::group::Encoding;
 use measured::metric::name::{MetricName, MetricNameEncoder};
-use measured::metric::{MetricEncoding, MetricFamilyEncoding};
+use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
 use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
 use once_cell::sync::Lazy;
 use prometheus::Registry;
@@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
        .collect()
 }

+pub struct InfoMetric<L: LabelGroup, M: MetricType = GaugeState> {
+    label: RwLock<L>,
+    metric: M,
+}
+
+impl<L: LabelGroup> InfoMetric<L> {
+    pub fn new(label: L) -> Self {
+        Self::with_metric(label, GaugeState::new(1))
+    }
+}
+
+impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
+    pub fn with_metric(label: L, metric: M) -> Self {
+        Self {
+            label: RwLock::new(label),
+            metric,
+        }
+    }
+
+    pub fn set_label(&self, label: L) {
+        *self.label.write().unwrap() = label;
+    }
+}
+
+impl<L, M, E> MetricFamilyEncoding<E> for InfoMetric<L, M>
+where
+    L: LabelGroup,
+    M: MetricEncoding<E, Metadata = ()>,
+    E: Encoding,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut E,
+    ) -> Result<(), E::Err> {
+        M::write_type(&name, enc)?;
+        self.metric
+            .collect_into(&(), &*self.label.read().unwrap(), name, enc)
+    }
+}
+
 pub struct BuildInfo {
    pub revision: &'static str,
    pub build_tag: &'static str,
 }

-// todo: allow label group without the set
 impl LabelGroup for BuildInfo {
    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
        const REVISION: &LabelName = LabelName::from_str("revision");
@@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo {
    }
 }

-impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
-where
-    GaugeState: MetricEncoding<T>,
-{
-    fn collect_family_into(
-        &self,
-        name: impl measured::metric::name::MetricNameEncoder,
-        enc: &mut T,
-    ) -> Result<(), T::Err> {
-        enc.write_help(&name, "Build/version information")?;
-        GaugeState::write_type(&name, enc)?;
-        GaugeState {
-            count: std::sync::atomic::AtomicI64::new(1),
-        }
-        .collect_into(&(), self, name, enc)
-    }
-}
-
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct NeonMetrics {
@@ -165,8 +189,8 @@ pub struct NeonMetrics {
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct LibMetrics {
-    #[metric(init = build_info)]
-    build_info: BuildInfo,
+    #[metric(init = InfoMetric::new(build_info))]
+    build_info: InfoMetric<BuildInfo>,

    #[metric(flatten)]
    rusage: Rusage,
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -8,20 +8,19 @@ license.workspace = true
 thiserror.workspace = true
 nix.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
-rustc-hash = { version = "2.1.1" }
-rand = "0.9.1"
 libc.workspace = true
-lock_api = "0.4.13"
+lock_api.workspace = true
+rustc-hash.workspace = true

 [dev-dependencies]
 criterion = { workspace = true, features = ["html_reports"] }
+rand = "0.9"
 rand_distr = "0.5.1"
 xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
 ahash.workspace = true
 twox-hash = { version = "2.1.1" }
 seahash = "4.1.0"
 hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
-foldhash = "0.1.5"


 [target.'cfg(target_os = "macos")'.dependencies]
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -13,6 +13,8 @@
 //! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
 //! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
 //! dictionary by rehashing all keys.
+//!
+//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.

 use std::fmt::Debug;
 use std::hash::{BuildHasher, Hash};
@@ -30,6 +32,19 @@ mod tests;
 use core::{Bucket, CoreHashMap, INVALID_POS};
 use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};

+use thiserror::Error;
+
+/// Error type for a hashmap shrink operation.
+#[derive(Error, Debug)]
+pub enum HashMapShrinkError {
+    /// There was an error encountered while resizing the memory area.
+    #[error("shmem resize failed: {0}")]
+    ResizeError(shmem::Error),
+    /// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
+    #[error("occupied entry in deallocated space found at {0}")]
+    RemainingEntries(usize),
+}
+
 /// This represents a hash table that (possibly) lives in shared memory.
 /// If a new process is launched with fork(), the child process inherits
 /// this struct.
@@ -147,8 +162,8 @@ impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
        };

        let hashmap = CoreHashMap::new(buckets, dictionary);
-        let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
        unsafe {
+            let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
            std::ptr::write(shared_ptr, lock);
        }

@@ -171,6 +186,9 @@ impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
    }

    /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
+    ///
+    /// This is a holdover from a previous implementation and is being kept around for
+    /// backwards compatibility reasons.
    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
        self.attach_writer()
    }
@@ -184,8 +202,8 @@ impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
 ///
 /// [`libc::pthread_rwlock_t`]
 /// [`HashMapShared`]
-/// [buckets]
-/// [dictionary]
+/// buckets
+/// dictionary
 ///
 /// In between the above parts, there can be padding bytes to align the parts correctly.
 type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
@@ -310,6 +328,9 @@ where
    }

    /// Get a reference to the entry containing a key.
+    ///
+    /// NB: This takes a write lock, as there's no way to know in advance whether the
+    /// entry will be used for reading or for writing.
    pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
        let hash = self.get_hash_value(&key);
        self.entry_with_hash(key, hash)
@@ -317,7 +338,7 @@ where

    /// Remove a key given its hash. Returns the associated value if it existed.
    pub fn remove(&self, key: &K) -> Option<V> {
-        let hash = self.get_hash_value(&key);
+        let hash = self.get_hash_value(key);
        match self.entry_with_hash(key.clone(), hash) {
            Entry::Occupied(e) => Some(e.remove()),
            Entry::Vacant(_) => None,
@@ -355,7 +376,7 @@ where
            Some((key, _)) => Some(OccupiedEntry {
                _key: key.clone(),
                bucket_pos: pos as u32,
-                prev_pos: entry::PrevPos::Unknown(self.get_hash_value(&key)),
+                prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)),
                map,
            }),
            _ => None,
@@ -550,12 +571,7 @@ where
    /// The following cases result in a panic:
    /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
    /// - Calling this function on a map when no shrink operation is in progress.
-    /// - Calling this function on a map with `shrink_mode` set to [`HashMapShrinkMode::Remap`] and
-    ///   there are more buckets in use than the value returned by [`HashMapAccess::shrink_goal`].
-    ///
-    /// # Errors
-    /// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
-    pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
+    pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> {
        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
        assert!(
            map.alloc_limit != INVALID_POS,
@@ -574,10 +590,8 @@ where
        );

        for i in (num_buckets as usize)..map.buckets.len() {
-            if let Some((k, v)) = map.buckets[i].inner.take() {
-                // alloc_bucket increases count, so need to decrease since we're just moving
-                map.buckets_in_use -= 1;
-                map.alloc_bucket(k, v).unwrap();
+            if map.buckets[i].inner.is_some() {
+                return Err(HashMapShrinkError::RemainingEntries(i));
            }
        }

@@ -587,7 +601,9 @@ where
            .expect("shrink called on a fixed-size hash table");

        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
-        shmem_handle.set_size(size_bytes)?;
+        if let Err(e) = shmem_handle.set_size(size_bytes) {
+            return Err(HashMapShrinkError::ResizeError(e));
+        }
        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
        let buckets_ptr = map.buckets.as_mut_ptr();
        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -43,9 +43,6 @@ pub(crate) struct CoreHashMap<'a, K, V> {
    pub(crate) alloc_limit: u32,
    /// The number of currently occupied buckets.
    pub(crate) buckets_in_use: u32,
-    // pub(crate) lock: libc::pthread_mutex_t,
-    // Unclear what the purpose of this is.
-    pub(crate) _user_list_head: u32,
 }

 impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
@@ -66,7 +63,7 @@ where

 /// Error for when there are no empty buckets left but one is needed.
 #[derive(Debug, PartialEq)]
-pub struct FullError();
+pub struct FullError;

 impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
    const FILL_FACTOR: f32 = 0.60;
@@ -118,7 +115,6 @@ impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
            buckets,
            free_head: 0,
            buckets_in_use: 0,
-            _user_list_head: INVALID_POS,
            alloc_limit: INVALID_POS,
        }
    }
@@ -179,7 +175,7 @@ impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
            pos = bucket.next;
        }
        if pos == INVALID_POS {
-            return Err(FullError());
+            return Err(FullError);
        }

        // Repair the freelist.
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -90,7 +90,6 @@ impl<K, V> OccupiedEntry<'_, '_, K, V> {
                self.map.dictionary[dict_pos as usize] = bucket.next;
            }
            PrevPos::Chained(bucket_pos) => {
-                // println!("we think prev of {} is {bucket_pos}", self.bucket_pos);
                self.map.buckets[bucket_pos as usize].next = bucket.next;
            }
            _ => unreachable!(),
@@ -125,9 +124,6 @@ impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
    /// Will return [`FullError`] if there are no unoccupied buckets in the map.
    pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
        let pos = self.map.alloc_bucket(self.key, value)?;
-        if pos == INVALID_POS {
-            return Err(FullError());
-        }
        self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
        self.map.dictionary[self.dict_pos as usize] = pos;

--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -164,16 +164,16 @@ fn do_deletes(
 fn do_shrink(
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
+    from: u32,
    to: u32,
 ) {
    assert!(writer.shrink_goal().is_none());
    writer.begin_shrink(to);
    assert_eq!(writer.shrink_goal(), Some(to as usize));
-    while writer.get_num_buckets_in_use() > to as usize {
-        let (k, _) = shadow.pop_first().unwrap();
-        let entry = writer.entry(k);
-        if let Entry::Occupied(e) = entry {
-            e.remove();
+    for i in to..from {
+        if let Some(entry) = writer.entry_at_bucket(i as usize) {
+            shadow.remove(&entry._key);
+            entry.remove();
        }
    }
    let old_usage = writer.get_num_buckets_in_use();
@@ -298,7 +298,7 @@ fn test_shrink() {
    let mut rng = rand::rng();

    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
-    do_shrink(&mut writer, &mut shadow, 1000);
+    do_shrink(&mut writer, &mut shadow, 1500, 1000);
    assert_eq!(writer.get_num_buckets(), 1000);
    do_deletes(500, &mut writer, &mut shadow);
    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
@@ -315,7 +315,7 @@ fn test_shrink_grow_seq() {

    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 750");
-    do_shrink(&mut writer, &mut shadow, 750);
+    do_shrink(&mut writer, &mut shadow, 1000, 750);
    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 1500");
    writer.grow(1500).unwrap();
@@ -324,7 +324,7 @@ fn test_shrink_grow_seq() {
    while shadow.len() > 100 {
        do_deletes(1, &mut writer, &mut shadow);
    }
-    do_shrink(&mut writer, &mut shadow, 200);
+    do_shrink(&mut writer, &mut shadow, 1500, 200);
    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 10k");
    writer.grow(10000).unwrap();
@@ -349,8 +349,7 @@ fn test_bucket_ops() {
    let pos = match writer.entry(1.into()) {
        Entry::Occupied(e) => {
            assert_eq!(e._key, 1.into());
-            let pos = e.bucket_pos as usize;
-            pos
+            e.bucket_pos as usize
        }
        Entry::Vacant(_) => {
            panic!("Insert didn't affect entry");
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,5 +1,3 @@
-//! Shared memory utilities for neon communicator
-
 pub mod hash;
 pub mod shmem;
 pub mod sync;
--- a/libs/neon-shmem/src/sync.rs
+++ b/libs/neon-shmem/src/sync.rs
@@ -6,7 +6,7 @@ use std::ptr::NonNull;
 use nix::errno::Errno;

 pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
-pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
 pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
 pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
 pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
@@ -14,19 +14,34 @@ pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRw
 /// Shared memory read-write lock.
 pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);

+/// Calls a function in the libc namespace and panics if its return value is nonzero.
+macro_rules! libc_checked {
+    ($fn_name:ident ( $($arg:expr),* )) => {{
+        let res = libc::$fn_name($($arg),*);
+        if res != 0 {
+            panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res));
+        }
+    }};
+}
+
 impl PthreadRwLock {
-    pub fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
+    /// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock.
+    ///
+    /// # Safety
+    /// `lock` must be non-null and valid for writes. Panics if any underlying libc call fails.
+    pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
        unsafe {
            let mut attrs = MaybeUninit::uninit();
-            // Ignoring return value here - only possible error is OOM.
-            libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
-            libc::pthread_rwlockattr_setpshared(attrs.as_mut_ptr(), libc::PTHREAD_PROCESS_SHARED);
-            // TODO(quantumish): worth making this function return Result?
-            libc::pthread_rwlock_init(lock, attrs.as_mut_ptr());
+            libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr()));
+            libc_checked!(pthread_rwlockattr_setpshared(
+                attrs.as_mut_ptr(),
+                libc::PTHREAD_PROCESS_SHARED
+            ));
+            libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr()));
            // Safety: POSIX specifies that "any function affecting the attributes
            // object (including destruction) shall not affect any previously
            // initialized read-write locks".
-            libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
+            libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr()));
            Self(Some(NonNull::new_unchecked(lock)))
        }
    }
@@ -34,7 +49,7 @@ impl PthreadRwLock {
    fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
        match self.0 {
            None => {
-                panic!("PthreadRwLock constructed badly - something likely used RawMutex::INIT")
+                panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
            }
            Some(x) => x,
        }
@@ -45,31 +60,16 @@ unsafe impl lock_api::RawRwLock for PthreadRwLock {
    type GuardMarker = lock_api::GuardSend;
    const INIT: Self = Self(None);

-    fn lock_shared(&self) {
-        unsafe {
-            let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
-            if res != 0 {
-                panic!("rdlock failed with {}", Errno::from_raw(res));
-            }
-        }
-    }
-
    fn try_lock_shared(&self) -> bool {
        unsafe {
            let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
            match res {
                0 => true,
                libc::EAGAIN => false,
-                _ => panic!("try_rdlock failed with {}", Errno::from_raw(res)),
-            }
-        }
-    }
-
-    fn lock_exclusive(&self) {
-        unsafe {
-            let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
-            if res != 0 {
-                panic!("wrlock failed with {}", Errno::from_raw(res));
+                _ => panic!(
+                    "pthread_rwlock_tryrdlock failed with {}",
+                    Errno::from_raw(res)
+                ),
            }
        }
    }
@@ -85,20 +85,27 @@ unsafe impl lock_api::RawRwLock for PthreadRwLock {
        }
    }

-    unsafe fn unlock_exclusive(&self) {
+    fn lock_shared(&self) {
        unsafe {
-            let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
-            if res != 0 {
-                panic!("unlock failed with {}", Errno::from_raw(res));
-            }
+            libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr()));
        }
    }
+
+    fn lock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr()));
+        }
+    }
+
+    unsafe fn unlock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
+        }
+    }
+
    unsafe fn unlock_shared(&self) {
        unsafe {
-            let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
-            if res != 0 {
-                panic!("unlock failed with {}", Errno::from_raw(res));
-            }
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
        }
    }
 }
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -749,7 +749,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
                    match e {
-                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        err @ QueryError::Shutdown => {
+                            // Notify postgres of the connection shutdown at the libpq
+                            // protocol level. This avoids postgres having to tell an idle
+                            // connection apart from a stale one, which is bug-prone.
+                            let shutdown_error = short_error(&err);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &shutdown_error,
+                                Some(err.pg_error_code()),
+                            ))?;
+
+                            return Ok(ProcessMsgResult::Break);
+                        }
                        QueryError::SimulatedConnectionError => {
                            return Err(QueryError::SimulatedConnectionError);
                        }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -47,6 +47,7 @@ tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 tracing-utils.workspace = true
 rand.workspace = true
 scopeguard.workspace = true
+uuid.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 walkdir.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -12,7 +12,8 @@ use jsonwebtoken::{
    Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
 };
 use pem::Pem;
-use serde::{Deserialize, Serialize, de::DeserializeOwned};
+use serde::{Deserialize, Deserializer, Serialize, de::DeserializeOwned};
+use uuid::Uuid;

 use crate::id::TenantId;

@@ -25,6 +26,11 @@ pub enum Scope {
    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
    // TODO: join these two?
    Tenant,
+    /// Provides access to all data for a specific tenant, but based on endpoint ID. This token scope
+    /// is only used by compute to fetch the spec for a specific endpoint. The spec contains a Tenant-scoped
+    /// token authorizing access to all data of a tenant, so the spec-fetch API requires a TenantEndpoint
+    /// scope token to ensure that untrusted compute nodes can't fetch spec for arbitrary endpoints.
+    TenantEndpoint,
    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
    /// Should only be used e.g. for status check/tenant creation/list.
    PageServerApi,
@@ -51,17 +57,43 @@ pub enum Scope {
    ControllerPeer,
 }

+fn deserialize_empty_string_as_none_uuid<'de, D>(deserializer: D) -> Result<Option<Uuid>, D::Error>
+where
+    D: Deserializer<'de>,
+{
+    let opt = Option::<String>::deserialize(deserializer)?;
+    match opt.as_deref() {
+        Some("") => Ok(None),
+        Some(s) => Uuid::parse_str(s)
+            .map(Some)
+            .map_err(serde::de::Error::custom),
+        None => Ok(None),
+    }
+}
+
 /// JWT payload. See docs/authentication.md for the format
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
    pub tenant_id: Option<TenantId>,
+    #[serde(
+        default,
+        skip_serializing_if = "Option::is_none",
+        // Neon control plane includes this field as empty in the claims.
+        // Consider it None in those cases.
+        deserialize_with = "deserialize_empty_string_as_none_uuid"
+    )]
+    pub endpoint_id: Option<Uuid>,
    pub scope: Scope,
 }

 impl Claims {
    pub fn new(tenant_id: Option<TenantId>, scope: Scope) -> Self {
-        Self { tenant_id, scope }
+        Self {
+            tenant_id,
+            scope,
+            endpoint_id: None,
+        }
    }
 }

@@ -212,6 +244,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let expected_claims = Claims {
            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
+            endpoint_id: None,
        };

        // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
@@ -240,6 +273,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let claims = Claims {
            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
+            endpoint_id: None,
        };

        let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -53,6 +53,10 @@ impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
    pub const MIN: Self = Self(0);

+    pub fn unsharded() -> Self {
+        ShardCount(0)
+    }
+
    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
    /// as [`TenantShardId::unsharded`].