Increase neon_local http client to compute timeout in reconfigure.

It seems that 30s is sometimes not enough when CI runners are overloaded, which causes pull_timeline flakiness. Ref: https://github.com/neondatabase/neon/issues/9731#issuecomment-2532143008
Disable readstream's reliance on seqscan readahead (#9860)
2026-02-08 13:10:37 +00:00 · 2024-12-11 14:10:37 +01:00 · 2024-12-11 00:51:05 +00:00 · 2024-12-10 19:42:52 +00:00 · 2024-12-10 17:00:47 +00:00 · 2024-12-10 16:26:56 +00:00
20 changed files with 314 additions and 260 deletions
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -810,7 +810,7 @@ impl Endpoint {
        }

        let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(30))
+            .timeout(Duration::from_secs(120))
            .build()
            .unwrap();
        let response = client
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4506,7 +4506,12 @@ impl Tenant {
                // - this timeline was created while we were finding cutoffs
                // - lsn for timestamp search fails for this timeline repeatedly
                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
-                    target.cutoffs = cutoffs.clone();
+                    let original_cutoffs = target.cutoffs.clone();
+                    // GC cutoffs should never go back
+                    target.cutoffs = GcCutoffs {
+                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
+                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                    }
                }
            }

--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -22,6 +22,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "portability/instr_time.h"
 #include "postmaster/interrupt.h"
 #include "storage/buf_internals.h"
 #include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
 	 */
 	PSConnectionState state;
 	PGconn		   *conn;
+
+	/* request / response counters for debugging */
+	uint64			nrequests_sent;
+	uint64			nresponses_received;
+
 	/*---
 	 * WaitEventSet containing:
 	 *	- WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}

 		shard->state = PS_Connected;
+		shard->nrequests_sent = 0;
+		shard->nresponses_received = 0;
 	}
 	/* FALLTHROUGH */
 	case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
 	int			ret;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
+	instr_time	now,
+				start_ts,
+				since_start,
+				last_log_ts,
+				since_last_log;
+	bool		logged = false;
+
+	/*
+	 * As a debugging aid, if we don't get a response for a long time, print a
+	 * log message.
+	 *
+	 * 10 s is a very generous threshold, normally we expect a response in a
+	 * few milliseconds. We have metrics to track latencies in normal ranges,
+	 * but in the cases that take exceptionally long, it's useful to log the
+	 * exact timestamps.
+	 */
+#define LOG_INTERVAL_US		UINT64CONST(10 * 1000000)
+
+	INSTR_TIME_SET_CURRENT(now);
+	start_ts = last_log_ts = now;
+	INSTR_TIME_SET_ZERO(since_last_log);

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
 	if (ret == 0)
 	{
 		WaitEvent	event;
+		long		timeout;	/* ms until the next "still waiting" log is due; WaitEventSetWait() takes ms */
+
+		timeout = (Max(0, (int64) LOG_INTERVAL_US - (int64) INSTR_TIME_GET_MICROSEC(since_last_log)) + 999) / 1000;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+		(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
 								WAIT_EVENT_NEON_PS_READ);
 		ResetLatch(MyLatch);

@@ -684,9 +716,40 @@ retry:
 			}
 		}

+		/*
+		 * Print a message to the log if a long time has passed with no
+		 * response.
+		 */
+		INSTR_TIME_SET_CURRENT(now);
+		since_last_log = now;
+		INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
+		if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
+		{
+			since_start = now;
+			INSTR_TIME_SUBTRACT(since_start, start_ts);
+			neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
+						   INSTR_TIME_GET_DOUBLE(since_start),
+						   shard->nrequests_sent, shard->nresponses_received);
+			last_log_ts = now;
+			logged = true;
+		}
+
 		goto retry;
 	}

+	/*
+	 * If we logged earlier that the response is taking a long time, log
+	 * another message when the response is finally received.
+	 */
+	if (logged)
+	{
+		INSTR_TIME_SET_CURRENT(now);
+		since_start = now;
+		INSTR_TIME_SUBTRACT(since_start, start_ts);
+		neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
+					   INSTR_TIME_GET_DOUBLE(since_start));
+	}
+
 	return ret;
 }

@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
 	 * point, but on the grand scheme of things it's only a small issue.
 	 */
+	shard->nrequests_sent++;
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
 		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}

+	shard->nresponses_received++;
 	return (NeonResponse *) resp;
 }

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
 	 * ensuring we have received all but the last n requests (n = newsize).
 	 */
 	if (MyPState->n_requests_inflight > newsize)
-		prefetch_wait_for(MyPState->ring_unused - newsize);
+	{
+		Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
+		prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}

 	/* construct the new PrefetchState, and copy over the memory contexts */
 	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_last = newsize;
 	newPState->ring_unused = newsize;
 	newPState->ring_receive = newsize;
-	newPState->ring_flush = newsize;
 	newPState->max_shard_no = MyPState->max_shard_no;
 	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));

@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
 		}
 		newPState->n_unused -= 1;
 	}
+	newPState->ring_flush = newPState->ring_receive;

 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
 		if (slot->status == PRFS_RECEIVED)
 		{
 			pfree(slot->response);
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -16,15 +16,12 @@ use super::{Cache, Cached};
 use crate::auth::IpPattern;
 use crate::config::ProjectInfoCacheOptions;
 use crate::control_plane::AuthSecret;
-use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
+use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::types::{EndpointId, RoleName};

 #[async_trait]
 pub(crate) trait ProjectInfoCache {
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
-    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>);
-    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt);
-    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt);
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
    async fn decrement_active_listeners(&self);
    async fn increment_active_listeners(&self);
@@ -54,8 +51,6 @@ impl<T> From<T> for Entry<T> {
 struct EndpointInfo {
    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
-    block_public_or_vpc_access: Option<Entry<(bool, bool)>>,
-    allowed_vpc_endpoint_ids: Option<Entry<Arc<Vec<String>>>>,
 }

 impl EndpointInfo {
@@ -97,51 +92,6 @@ impl EndpointInfo {
        }
        None
    }
-
-    pub(crate) fn get_allowed_vpc_endpoint_ids(
-        &self,
-        valid_since: Instant,
-        ignore_cache_since: Option<Instant>,
-    ) -> Option<(Arc<Vec<String>>, bool)> {
-        if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids {
-            if valid_since < allowed_vpc_endpoint_ids.created_at {
-                return Some((
-                    allowed_vpc_endpoint_ids.value.clone(),
-                    Self::check_ignore_cache(
-                        ignore_cache_since,
-                        allowed_vpc_endpoint_ids.created_at,
-                    ),
-                ));
-            }
-        }
-        None
-    }
-
-    pub(crate) fn get_block_public_or_vpc_access(
-        &self,
-        valid_since: Instant,
-        ignore_cache_since: Option<Instant>,
-    ) -> Option<((bool, bool), bool)> {
-        if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access {
-            if valid_since < block_public_or_vpc_access.created_at {
-                return Some((
-                    block_public_or_vpc_access.value.clone(),
-                    Self::check_ignore_cache(
-                        ignore_cache_since,
-                        block_public_or_vpc_access.created_at,
-                    ),
-                ));
-            }
-        }
-        None
-    }
-
-    pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) {
-        self.block_public_or_vpc_access = None;
-    }
-    pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) {
-        self.allowed_vpc_endpoint_ids = None;
-    }
    pub(crate) fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
@@ -161,8 +111,6 @@ pub struct ProjectInfoCacheImpl {
    cache: DashMap<EndpointIdInt, EndpointInfo>,

    project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
-    // FIXME(stefan): we need a way to GC the account2ep map.
-    account2ep: DashMap<AccountIdInt, HashSet<EndpointIdInt>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -172,59 +120,6 @@ pub struct ProjectInfoCacheImpl {

 #[async_trait]
 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>) {
-        info!(
-            "invalidating allowed vpc endpoint ids for projects `{}`",
-            project_ids.join(", ")
-        );
-        for project_id in project_ids {
-            let endpoints = self
-                .project2ep
-                .get(&project_id)
-                .map(|kv| kv.value().clone())
-                .unwrap_or_default();
-            for endpoint_id in endpoints {
-                if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                    endpoint_info.invalidate_allowed_vpc_endpoint_ids();
-                }
-            }
-        }
-    }
-
-    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) {
-        info!(
-            "invalidating allowed vpc endpoint ids for org `{}`",
-            account_id
-        );
-        let endpoints = self
-            .account2ep
-            .get(&account_id)
-            .map(|kv| kv.value().clone())
-            .unwrap_or_default();
-        for endpoint_id in endpoints {
-            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_allowed_vpc_endpoint_ids();
-            }
-        }
-    }
-
-    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) {
-        info!(
-            "invalidating block public or vpc access for project `{}`",
-            project_id
-        );
-        let endpoints = self
-            .project2ep
-            .get(&project_id)
-            .map(|kv| kv.value().clone())
-            .unwrap_or_default();
-        for endpoint_id in endpoints {
-            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
-                endpoint_info.invalidate_block_public_or_vpc_access();
-            }
-        }
-    }
-
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
@@ -283,7 +178,6 @@ impl ProjectInfoCacheImpl {
        Self {
            cache: DashMap::new(),
            project2ep: DashMap::new(),
-            account2ep: DashMap::new(),
            config,
            ttl_disabled_since_us: AtomicU64::new(u64::MAX),
            start_time: Instant::now(),
@@ -332,7 +226,6 @@ impl ProjectInfoCacheImpl {
        }
        Some(Cached::new_uncached(value))
    }
-
    pub(crate) fn insert_role_secret(
        &self,
        project_id: ProjectIdInt,
@@ -363,16 +256,6 @@ impl ProjectInfoCacheImpl {
        self.insert_project2endpoint(project_id, endpoint_id);
        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
    }
-    pub(crate) fn insert_vpc_allowed_endpoint_ids(&self, account_id: AccountIdInt, project_id: ProjectIdInt, endpoint_id: EndpointIdInt, vpc_allowed_endpoint_ids: HashSet<EndpointIdInt>) {
-        if self.cache.len() >= self.config.size {
-            // If there are too many entries, wait until the next gc cycle.
-            return;
-        }
-        self.insert_account2endpoint(account_id, endpoint_id);
-        self.insert_project2endpoint(project_id, endpoint_id);
-        self.cache.entry(endpoint_id).or_default().vpc_allowed_endpoint_ids = Some(vpc_allowed_endpoint_ids);
-    }
-    }
    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
            endpoints.insert(endpoint_id);
@@ -381,13 +264,6 @@ impl ProjectInfoCacheImpl {
                .insert(project_id, HashSet::from([endpoint_id]));
        }
    }
-    fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) {
-        if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) {
-            endpoints.insert(endpoint_id);
-        } else {
-            self.account2ep.insert(account_id, HashSet::from([endpoint_id]));
-        }
-    }
    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
        let mut valid_since = Instant::now() - self.config.ttl;
        // Only ignore cache if ttl is disabled.
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -238,8 +238,6 @@ pub(crate) struct GetEndpointAccessControl {
    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
    pub(crate) project_id: Option<ProjectIdInt>,
    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
-    pub(crate) block_public_connections: Option<bool>,
-    pub(crate) block_vpc_connections: Option<bool>,
 }

 // Manually implement debug to omit sensitive info.
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -7,7 +7,7 @@ use std::sync::OnceLock;
 use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
 use rustc_hash::FxHasher;

-use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName};
+use crate::types::{BranchId, EndpointId, ProjectId, RoleName};

 pub trait InternId: Sized + 'static {
    fn get_interner() -> &'static StringInterner<Self>;
@@ -206,26 +206,6 @@ impl From<ProjectId> for ProjectIdInt {
    }
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct AccountIdTag;
-impl InternId for AccountIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<AccountIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type AccountIdInt = InternedString<AccountIdTag>;
-impl From<&AccountId> for AccountIdInt {
-    fn from(value: &AccountId) -> Self {
-        AccountIdTag::get_interner().get_or_intern(value)
-    }
-}
-impl From<AccountId> for AccountIdInt {
-    fn from(value: AccountId) -> Self {
-        AccountIdTag::get_interner().get_or_intern(&value)
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::sync::OnceLock;
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -556,9 +556,6 @@ pub enum RedisEventsCount {
    CancelSession,
    PasswordUpdate,
    AllowedIpsUpdate,
-    AllowedVpcEndpointIdsUpdateForProjects,
-    AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
-    BlockPublicOrVpcAccessUpdate,
 }

 pub struct ThreadPoolWorkers(usize);
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -11,7 +11,7 @@ use uuid::Uuid;
 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::cache::project_info::ProjectInfoCache;
 use crate::cancellation::{CancelMap, CancellationHandler};
-use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt};
+use crate::intern::{ProjectIdInt, RoleNameInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
 use tracing::Instrument;

@@ -39,27 +39,6 @@ pub(crate) enum Notification {
    AllowedIpsUpdate {
        allowed_ips_update: AllowedIpsUpdate,
    },
-    #[serde(
-        rename = "/allowed_vpc_endpoint_ids_updated_for_projects",
-        deserialize_with = "deserialize_json_string"
-    )]
-    AllowedVpcEndpointIdsUpdateForProjects {
-        allowed_vpc_endpoint_ids_update_for_projects: AllowedVpcEndpointIdsUpdateForProjects,
-    },
-    #[serde(
-        rename = "/allowed_vpc_endpoint_ids_updated_for_org",
-        deserialize_with = "deserialize_json_string"
-    )]
-    AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
-        allowed_vpc_endpoint_ids_update_for_org: AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
-    },
-    #[serde(
-        rename = "/block_public_or_vpc_access_updated",
-        deserialize_with = "deserialize_json_string"
-    )]
-    BlockPublicOrVpcAccessUpdate {
-        block_public_or_vpc_access_update: BlockPublicOrVpcAccessUpdate,
-    },
    #[serde(
        rename = "/password_updated",
        deserialize_with = "deserialize_json_string"
@@ -72,22 +51,6 @@ pub(crate) enum Notification {
 pub(crate) struct AllowedIpsUpdate {
    project_id: ProjectIdInt,
 }
-
-#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
-pub(crate) struct AllowedVpcEndpointIdsUpdateForProjects {
-    project_ids: Vec<ProjectIdInt>,
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
-pub(crate) struct AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
-    account_id: AccountIdInt,
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
-pub(crate) struct BlockPublicOrVpcAccessUpdate {
-    project_id: ProjectIdInt,
-}
-
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub(crate) struct PasswordUpdate {
    project_id: ProjectIdInt,
@@ -201,11 +164,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                    }
                }
            }
-            Notification::AllowedIpsUpdate { .. }
-            | Notification::PasswordUpdate { .. }
-            | Notification::AllowedVpcEndpointIdsUpdateForProjects { .. }
-            | Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg { .. }
-            | Notification::BlockPublicOrVpcAccessUpdate { .. } => {
+            Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => {
                invalidate_cache(self.cache.clone(), msg.clone());
                if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
                    Metrics::get()
@@ -217,27 +176,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                        .proxy
                        .redis_events_count
                        .inc(RedisEventsCount::PasswordUpdate);
-                } else if matches!(
-                    msg,
-                    Notification::AllowedVpcEndpointIdsUpdateForProjects { .. }
-                ) {
-                    Metrics::get()
-                        .proxy
-                        .redis_events_count
-                        .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects);
-                } else if matches!(
-                    msg,
-                    Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg { .. }
-                ) {
-                    Metrics::get()
-                        .proxy
-                        .redis_events_count
-                        .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg);
-                } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdate { .. }) {
-                    Metrics::get()
-                        .proxy
-                        .redis_events_count
-                        .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate);
                }
                // It might happen that the invalid entry is on the way to be cached.
                // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
@@ -259,21 +197,6 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
        Notification::AllowedIpsUpdate { allowed_ips_update } => {
            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
        }
-        Notification::AllowedVpcEndpointIdsUpdateForProjects {
-            allowed_vpc_endpoint_ids_update_for_projects,
-        } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects(
-            allowed_vpc_endpoint_ids_update_for_projects.project_ids,
-        ),
-        Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
-            allowed_vpc_endpoint_ids_update_for_org,
-        } => cache.invalidate_allowed_vpc_endpoint_ids_for_org(
-            allowed_vpc_endpoint_ids_update_for_org.account_id,
-        ),
-        Notification::BlockPublicOrVpcAccessUpdate {
-            block_public_or_vpc_access_update,
-        } => cache.invalidate_block_public_or_vpc_access_for_project(
-            block_public_or_vpc_access_update.project_id,
-        ),
        Notification::PasswordUpdate { password_update } => cache
            .invalidate_role_secret_for_project(
                password_update.project_id,
--- a/proxy/src/types.rs
+++ b/proxy/src/types.rs
@@ -97,8 +97,6 @@ smol_str_wrapper!(EndpointId);
 smol_str_wrapper!(BranchId);
 // 90% of project strings are 23 characters or less.
 smol_str_wrapper!(ProjectId);
-// 90% of account strings are 23 characters or less.
-smol_str_wrapper!(AccountId);

 // will usually equal endpoint ID
 smol_str_wrapper!(EndpointCacheKey);
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -86,6 +86,8 @@ enum Command {
        /// For safekeeper node_kind only, json list of timelines and their lsn info
        #[arg(long, default_value = None)]
        timeline_lsns: Option<String>,
+        #[arg(long, default_value_t = false)]
+        verbose: bool,
    },
    TenantSnapshot {
        #[arg(long = "tenant-id")]
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
            dump_db_connstr,
            dump_db_table,
            timeline_lsns,
+            verbose,
        } => {
            if let NodeKind::Safekeeper = node_kind {
                let db_or_list = match (timeline_lsns, dump_db_connstr) {
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
                    tenant_ids,
                    json,
                    post_to_storcon,
+                    verbose,
                    cli.exit_code,
                )
                .await
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
        Vec::new(),
        true,
        post_to_storcon,
+        false, // default to non-verbose mode
        exit_code,
    )
    .await?;
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
    tenant_shard_ids: Vec<TenantShardId>,
    json: bool,
    post_to_storcon: bool,
+    verbose: bool,
    exit_code: bool,
 ) -> anyhow::Result<()> {
    if controller_client.is_none() && post_to_storcon {
        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
    }
-    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
+    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
        Err(e) => {
            tracing::error!("Failed: {e}");
            Err(e)
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -21,8 +21,12 @@ pub struct MetadataSummary {
    tenant_count: usize,
    timeline_count: usize,
    timeline_shard_count: usize,
-    with_errors: HashSet<TenantShardTimelineId>,
-    with_warnings: HashSet<TenantShardTimelineId>,
+    /// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`.
+    with_errors: HashMap<String, Vec<String>>,
+    /// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`.
+    with_warnings: HashMap<String, Vec<String>>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,

@@ -52,7 +56,12 @@ impl MetadataSummary {
        }
    }

-    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+    fn update_analysis(
+        &mut self,
+        id: &TenantShardTimelineId,
+        analysis: &TimelineAnalysis,
+        verbose: bool,
+    ) {
        if analysis.is_healthy() {
            self.healthy_tenant_shards.insert(id.tenant_shard_id);
        } else {
@@ -61,11 +70,17 @@ impl MetadataSummary {
        }

        if !analysis.errors.is_empty() {
-            self.with_errors.insert(*id);
+            let entry = self.with_errors.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.errors.iter().cloned());
+            }
        }

        if !analysis.warnings.is_empty() {
-            self.with_warnings.insert(*id);
+            let entry = self.with_warnings.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.warnings.iter().cloned());
+            }
        }
    }

@@ -120,6 +135,7 @@ Index versions: {version_summary}
 pub async fn scan_pageserver_metadata(
    bucket_config: BucketConfig,
    tenant_ids: Vec<TenantShardId>,
+    verbose: bool,
 ) -> anyhow::Result<MetadataSummary> {
    let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;

@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
        mut tenant_objects: TenantObjectListing,
        timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
        highest_shard_count: ShardCount,
+        verbose: bool,
    ) {
        summary.tenant_count += 1;

@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
                        Some(data),
                    )
                    .await;
-                    summary.update_analysis(&ttid, &analysis);
+                    summary.update_analysis(&ttid, &analysis, verbose);

                    timeline_ids.insert(ttid.timeline_id);
                } else {
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
        summary.update_data(&data);

        match tenant_id {
-            None => {
-                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
-                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
-            }
            Some(prev_tenant_id) => {
                if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
                    // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
                        tenant_objects,
                        timelines,
                        highest_shard_count,
+                        verbose,
                    )
                    .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
                    .await;
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
                    highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
                }
            }
+            None => {
+                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
+            }
        }

        match &data.blob_data {
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
            tenant_objects,
            tenant_timeline_results,
            highest_shard_count,
+            verbose,
        )
        .instrument(info_span!("analyze-tenant", tenant = %tenant_id))
        .await;
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4556,6 +4556,7 @@ class StorageScrubber:
    def __init__(self, env: NeonEnv, log_dir: Path):
        self.env = env
        self.log_dir = log_dir
+        self.allowed_errors: list[str] = []

    def scrubber_cli(
        self, args: list[str], timeout, extra_env: dict[str, str] | None = None
@@ -4633,19 +4634,70 @@ class StorageScrubber:
        if timeline_lsns is not None:
            args.append("--timeline-lsns")
            args.append(json.dumps(timeline_lsns))
+        if node_kind == NodeKind.PAGESERVER:
+            args.append("--verbose")
        stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)

        try:
            summary = json.loads(stdout)
-            # summary does not contain "with_warnings" if node_kind is the safekeeper
-            no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
-            healthy = not summary["with_errors"] and no_warnings
+            healthy = self._check_run_healthy(summary)
            return healthy, summary
        except:
            log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
            log.error(stdout)
            raise

+    def _check_line_allowed(self, line: str) -> bool:
+        for a in self.allowed_errors:
+            try:
+                if re.match(a, line):
+                    return True
+            except re.error:
+                log.error(f"Invalid regex: '{a}'")
+                raise
+        return False
+
+    def _check_line_list_allowed(self, lines: list[str]) -> bool:
+        for line in lines:
+            if not self._check_line_allowed(line):
+                return False
+        return True
+
+    def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
+        # summary does not contain "with_warnings" if node_kind is the safekeeper
+        healthy = True
+        with_warnings = summary.get("with_warnings", None)
+        if with_warnings is not None:
+            if isinstance(with_warnings, list):
+                if len(with_warnings) > 0:
+                    # safekeeper scan_metadata output is a list of tenants
+                    healthy = False
+            else:
+                for _, warnings in with_warnings.items():
+                    assert (
+                        len(warnings) > 0
+                    ), "with_warnings value should not be empty, running without verbose mode?"
+                    if not self._check_line_list_allowed(warnings):
+                        healthy = False
+                        break
+        if not healthy:
+            return healthy
+        with_errors = summary.get("with_errors", None)
+        if with_errors is not None:
+            if isinstance(with_errors, list):
+                if len(with_errors) > 0:
+                    # safekeeper scan_metadata output is a list of tenants
+                    healthy = False
+            else:
+                for _, errors in with_errors.items():
+                    assert (
+                        len(errors) > 0
+                    ), "with_errors value should not be empty, running without verbose mode?"
+                    if not self._check_line_list_allowed(errors):
+                        healthy = False
+                        break
+        return healthy
+
    def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
        stdout = self.scrubber_cli(
            ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
--- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py
+++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
@@ -0,0 +1,124 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+BTREE_NUM_CYCLEID_PAGES = """
+    WITH raw_pages AS (
+        SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
+        FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
+    ),
+    parsed_pages AS (
+        /* cycle ID is the last 2 bytes of the btree page */
+        SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
+        FROM raw_pages
+    )
+    SELECT count(*),
+           encode(cycle_id, 'hex')
+     FROM parsed_pages
+    WHERE encode(cycle_id, 'hex') != '0000'
+    GROUP BY encode(cycle_id, 'hex');
+    """
+
+
+def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    ses1 = endpoint.connect().cursor()
+    ses1.execute("ALTER SYSTEM SET autovacuum = off;")
+    ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
+    ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
+    ses1.execute("SELECT pg_reload_conf();")
+    ses1.execute("CREATE EXTENSION neon_test_utils;")
+    # prepare a large index
+    ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
+    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
+
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 0
+    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
+    # Delete enough tuples to clear the first index page.
+    # (There are up to 407 rows per 8KiB page; 406 for non-rightmost leaves.)
+    ses1.execute("DELETE FROM t WHERE id <= 406;")
+    # Make sure the page is cleaned up
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Do another delete-then-indexcleanup cycle, to move the pages from
+    # "dead" to "reusable"
+    ses1.execute("DELETE FROM t WHERE id <= 446;")
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Make sure the vacuum we're about to trigger in s3 has cleanup work to do
+    ses1.execute("DELETE FROM t WHERE id <= 610;")
+
+    # Flush wal, for checking purposes
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
+
+    ses2 = endpoint.connect().cursor()
+    ses3 = endpoint.connect().cursor()
+
+    # Session 2 pins a btree page, which prevents vacuum from processing that
+    # page, thus allowing us to reliably split pages while a concurrent vacuum
+    # is running.
+    ses2.execute("BEGIN;")
+    ses2.execute(
+        "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
+    )
+    ses2.execute("FETCH FROM foo;")  # pins the leaf page with id 611
+    wait_evt = threading.Event()
+
+    # Session 3 runs the VACUUM command. Note that this will block, and
+    # therefore must run on another thread.
+    # We rely on this running quickly enough to hit the pinned page from
+    # session 2 by the time we start other work again in session 1, but
+    # technically there is a race where the thread (and/or PostgreSQL process)
+    # doesn't get to that pinned page with vacuum until >2s after evt.set() was
+    # called, and session 1 thus might already have split pages.
+    def vacuum_freeze_t(ses3, evt: threading.Event):
+        # Begin parallel vacuum that should hit the index
+        evt.set()
+        # this'll hang until s2 fetches enough new data from its cursor.
+        # this is technically a race with the time.sleep(2) below: if this command
+        # doesn't reach the pinned page in time, session 1 may already have split pages.
+        ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")
+
+    ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt))
+    ses3t.start()
+    wait_evt.wait()
+    # Make extra sure we got the thread started and vacuum is stuck, by waiting
+    # some time even after wait_evt got set. This isn't truly reliable (it is
+    # possible that vacuum reaches the pinned page only after this sleep ends).
+    time.sleep(2)
+
+    # Insert 2 pages worth of new data.
+    # This should reuse the one empty page, plus another page at the end of
+    # the index relation; with split ordering
+    #    old_blk -> blkno=1 -> old_blk + 1.
+    # As this is run while vacuum in session 3 is happening, these splits
+    # should receive cycle IDs where applicable.
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
+    # unpin the btree page, allowing s3's vacuum to complete
+    ses2.execute("FETCH ALL FROM foo;")
+    ses2.execute("ROLLBACK;")
+    # flush WAL to make sure PS is up-to-date
+    ses1.execute("SELECT neon_xlogflush();")
+    # expect exactly one non-zero cycle ID value, carried by 3 pages
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 1 and pages[0][0] == 3
+    ), f"3 page splits with cycle ID expected; actual {pages}"
+
+    # final cleanup
+    ses3t.join()
+    ses1.close()
+    ses2.close()
+    ses3.close()
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata(
    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)

-    neon_env_builder.disable_scrub_on_exit()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert not healthy
+    env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*")
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
+
+    neon_env_builder.disable_scrub_on_exit()  # We already ran scrubber, no need to do an extra run
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.2",
-    "a10d95be67265e0f10a422ba0457f5a7af01de71"
+    "01fa3c48664ca030cfb69bb4a350aa9df4691d88"
  ],
  "v16": [
    "16.6",
-    "dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
+    "81428621f7c04aed03671cf80a928e0a36d92505"
  ],
  "v15": [
    "15.10",
-    "972e325e62b455957adbbdd8580e31275bb5b8c9"
+    "8736b10c1d93d11b9c0489872dd529c4c0f5338f"
  ],
  "v14": [
    "14.15",
-    "373f9decad933d2d46f321231032ae8b0da81acd"
+    "13ff324150fceaac72920e01742addc053db9462"
  ]
 }
Author	SHA1	Message	Date
Arseny Sher	bb45db3982	Increase neon_local http client to compute timeout in reconfigure. Seems like 30s sometimes not enough when CI runners are overloaded, causing pull_timeline flakiness. ref https://github.com/neondatabase/neon/issues/9731#issuecomment-2532143008	2024-12-11 14:10:37 +01:00
Matthias van de Meent	597125e124	Disable readstream's reliance on seqscan readahead (#9860 ) Neon doesn't have seqscan detection of its own, so stop read_stream from trying to utilize that readahead, and instead make it issue readahead of its own. ## Problem @knizhnik noticed that we didn't issue smgrprefetch[v] calls for seqscans in PG17 due to the move to the read_stream API, which assumes that the underlying IO facilities do seqscan detection for readahead. That is a wrong assumption when Neon is involved, so let's remove the code that applies that assumption. ## Summary of changes Remove the cases where seqscans are detected and prefetch is disabled as a consequence, and instead don't do that detection. PG PR: https://github.com/neondatabase/postgres/pull/532	2024-12-11 00:51:05 +00:00
Matthias van de Meent	e71d20d392	Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932 ) This fixes neondatabase/neon#9929. ## Postgres repo PRS: - PG17: https://github.com/neondatabase/postgres/pull/538 - PG16: https://github.com/neondatabase/postgres/pull/539 - PG15: https://github.com/neondatabase/postgres/pull/540 - PG14: https://github.com/neondatabase/postgres/pull/541 ## Problem see #9929 ## Summary of changes We update the split code to force the code to emit an FPI whenever the cycle ID might be interesting for concurrent btree vacuum.	2024-12-10 19:42:52 +00:00
Alex Chi Z.	aa0554fd1e	feat(test_runner): allowed_errors in storage scrubber (#10062 ) ## Problem resolve https://github.com/neondatabase/neon/issues/9988#issuecomment-2528239437 ## Summary of changes * New verbose mode for storage scrubber scan metadata (pageserver) that contains the error messages. * Filter allowed_error list from the JSON output to determine the healthy flag status. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-12-10 17:00:47 +00:00
Heikki Linnakangas	b853f78136	Print a log message if GetPage response takes too long (#10046 ) We have metrics for GetPage request latencies, but this is an extra measure to capture requests that take way too long in the logs. The log message is printed every 10 s, until the response is received: ``` PG:2024-12-09 16:02:07.715 GMT [1782845] LOG: [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses) PG:2024-12-09 16:02:17.723 GMT [1782845] LOG: [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses) PG:2024-12-09 16:02:19.719 GMT [1782845] LOG: [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s ```	2024-12-10 16:26:56 +00:00
Alex Chi Z.	6ad99826c1	fix(pageserver): refresh_gc_info should always increase cutoff (#9862 ) ## Problem close https://github.com/neondatabase/cloud/issues/19671 ``` Timeline ----------------------------- ^ last GC happened LSN ^ original retention period setting = 24hr > refresh-gc-info updates the gc_info ^ planned cutoff (gc_info) ^ customer set retention to 48hr, and it's still within the last GC LSN ^1 ^2 we have two choices: (1) update the planned cutoff to move backwards, or (2) keep the current one ``` In this patch, we decided to keep the current cutoff instead of moving back the gc_info to avoid races. In the future, we could allow the planned gc cutoff to go back once cplane sends a retention_history tenant config update, but this requires a careful revisit of the code. ## Summary of changes Ensure that GC cutoffs never go back if retention settings get changed. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-12-10 15:23:26 +00:00
Konstantin Knizhnik	311ee793b9	Fix handling in-flight requersts in prefetch buffer resize (#9968 ) ## Problem See https://github.com/neondatabase/neon/issues/9961 Current implementation of prefetch buffer resize doesn't correctly handle in-flight requests ## Summary of changes 1. Fix index of entry we should wait for if new prefetch buffer size is smaller than number of in-flight requests. 2. Correctly set flush position Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-12-10 15:01:40 +00:00