Compare commits

..

7 Commits

Author SHA1 Message Date
Arseny Sher
bb45db3982 Increase neon_local http client to compute timeout in reconfigure.
Seems like 30s sometimes not enough when CI runners are overloaded,
causing pull_timeline flakiness.

ref https://github.com/neondatabase/neon/issues/9731#issuecomment-2532143008
2024-12-11 14:10:37 +01:00
Matthias van de Meent
597125e124 Disable readstream's reliance on seqscan readahead (#9860)
Neon doesn't have seqscan detection of its own, so stop read_stream from
trying to utilize that readahead, and instead make it issue readahead of
its own.

## Problem

@knizhnik noticed that we didn't issue smgrprefetch[v] calls for
seqscans in PG17 due to the move to the read_stream API, which assumes
that the underlying IO facilities do seqscan detection for readahead.
That is a wrong assumption when Neon is involved, so let's remove the
code that applies that assumption.

## Summary of changes
Remove the cases where seqscans are detected and prefetch is disabled as
a consequence, and instead don't do that detection.

PG PR: https://github.com/neondatabase/postgres/pull/532
2024-12-11 00:51:05 +00:00
Matthias van de Meent
e71d20d392 Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932)
This fixes neondatabase/neon#9929.

## Postgres repo PRS:
- PG17: https://github.com/neondatabase/postgres/pull/538
- PG16: https://github.com/neondatabase/postgres/pull/539
- PG15: https://github.com/neondatabase/postgres/pull/540
- PG14: https://github.com/neondatabase/postgres/pull/541

## Problem
see #9929 

## Summary of changes

We update the split code to force the code to emit an FPI whenever the
cycle ID might be interesting for concurrent btree vacuum.
2024-12-10 19:42:52 +00:00
Alex Chi Z.
aa0554fd1e feat(test_runner): allowed_errors in storage scrubber (#10062)
## Problem

resolve
https://github.com/neondatabase/neon/issues/9988#issuecomment-2528239437

## Summary of changes

* New verbose mode for storage scrubber scan metadata (pageserver) that
contains the error messages.
* Filter allowed_error list from the JSON output to determine the
healthy flag status.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-10 17:00:47 +00:00
Heikki Linnakangas
b853f78136 Print a log message if GetPage response takes too long (#10046)
We have metrics for GetPage request latencies, but this is an extra
measure to capture requests that take way too long in the logs. The log
message is printed every 10 s, until the response is received:

```
PG:2024-12-09 16:02:07.715 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:17.723 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:19.719 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s
```
2024-12-10 16:26:56 +00:00
Alex Chi Z.
6ad99826c1 fix(pageserver): refresh_gc_info should always increase cutoff (#9862)
## Problem

close https://github.com/neondatabase/cloud/issues/19671

```
Timeline -----------------------------
         ^ last GC happened LSN
              ^ original retention period setting = 24hr
> refresh-gc-info updates the gc_info
              ^ planned cutoff (gc_info)
         ^ customer set retention to 48hr, and it's still within the last GC LSN
         ^1   ^2 we have two choices: (1) update the planned cutoff to
                 move backwards, or (2) keep the current one
```

In this patch, we decided to keep the current cutoff instead of moving
back the gc_info to avoid races. In the future, we could allow the
planned gc cutoff to go back once cplane sends a retention_history
tenant config update, but this requires a careful revisit of the code.

## Summary of changes

Ensure that GC cutoffs never go back if retention settings get changed.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-10 15:23:26 +00:00
Konstantin Knizhnik
311ee793b9 Fix handling in-flight requersts in prefetch buffer resize (#9968)
## Problem

See https://github.com/neondatabase/neon/issues/9961
Current implementation of prefetch buffer resize doesn't correctly
handle in-flight requests

## Summary of changes

1. Fix index of entry we should wait for if new prefetch buffer size is
smaller than number of in-flight requests.
2. Correctly set flush position

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-12-10 15:01:40 +00:00
20 changed files with 314 additions and 260 deletions

View File

@@ -810,7 +810,7 @@ impl Endpoint {
}
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(120))
.build()
.unwrap();
let response = client

View File

@@ -4506,7 +4506,12 @@ impl Tenant {
// - this timeline was created while we were finding cutoffs
// - lsn for timestamp search fails for this timeline repeatedly
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
target.cutoffs = cutoffs.clone();
let original_cutoffs = target.cutoffs.clone();
// GC cutoffs should never go back
target.cutoffs = GcCutoffs {
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
}
}
}

View File

@@ -22,6 +22,7 @@
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/interrupt.h"
#include "storage/buf_internals.h"
#include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
*/
PSConnectionState state;
PGconn *conn;
/* request / response counters for debugging */
uint64 nrequests_sent;
uint64 nresponses_received;
/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
}
shard->state = PS_Connected;
shard->nrequests_sent = 0;
shard->nresponses_received = 0;
}
/* FALLTHROUGH */
case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
int ret;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
instr_time now,
start_ts,
since_start,
last_log_ts,
since_last_log;
bool logged = false;
/*
* As a debugging aid, if we don't get a response for a long time, print a
* log message.
*
* 10 s is a very generous threshold, normally we expect a response in a
* few milliseconds. We have metrics to track latencies in normal ranges,
* but in the cases that take exceptionally long, it's useful to log the
* exact timestamps.
*/
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)
INSTR_TIME_SET_CURRENT(now);
start_ts = last_log_ts = now;
INSTR_TIME_SET_ZERO(since_last_log);
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
if (ret == 0)
{
WaitEvent event;
long timeout;
timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
WAIT_EVENT_NEON_PS_READ);
ResetLatch(MyLatch);
@@ -684,9 +716,40 @@ retry:
}
}
/*
* Print a message to the log if a long time has passed with no
* response.
*/
INSTR_TIME_SET_CURRENT(now);
since_last_log = now;
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
{
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
last_log_ts = now;
logged = true;
}
goto retry;
}
/*
* If we logged earlier that the response is taking a long time, log
* another message when the response is finally received.
*/
if (logged)
{
INSTR_TIME_SET_CURRENT(now);
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
INSTR_TIME_GET_DOUBLE(since_start));
}
return ret;
}
@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
* point, but on the grand scheme of things it's only a small issue.
*/
shard->nrequests_sent++;
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
}
shard->nresponses_received++;
return (NeonResponse *) resp;
}

View File

@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
* ensuring we have received all but the last n requests (n = newsize).
*/
if (MyPState->n_requests_inflight > newsize)
prefetch_wait_for(MyPState->ring_unused - newsize);
{
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
Assert(MyPState->n_requests_inflight <= newsize);
}
/* construct the new PrefetchState, and copy over the memory contexts */
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
newPState->ring_last = newsize;
newPState->ring_unused = newsize;
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
newPState->max_shard_no = MyPState->max_shard_no;
memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
}
newPState->n_unused -= 1;
}
newPState->ring_flush = newPState->ring_receive;
MyNeonCounters->getpage_prefetches_buffered =
MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
{
PrefetchRequest *slot = GetPrfSlot(end);
Assert(slot->status != PRFS_REQUESTED);
if (slot->status == PRFS_RECEIVED)
{
pfree(slot->response);

View File

@@ -16,15 +16,12 @@ use super::{Cache, Cached};
use crate::auth::IpPattern;
use crate::config::ProjectInfoCacheOptions;
use crate::control_plane::AuthSecret;
use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::types::{EndpointId, RoleName};
#[async_trait]
pub(crate) trait ProjectInfoCache {
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>);
fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt);
fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt);
fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
async fn decrement_active_listeners(&self);
async fn increment_active_listeners(&self);
@@ -54,8 +51,6 @@ impl<T> From<T> for Entry<T> {
struct EndpointInfo {
secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
block_public_or_vpc_access: Option<Entry<(bool, bool)>>,
allowed_vpc_endpoint_ids: Option<Entry<Arc<Vec<String>>>>,
}
impl EndpointInfo {
@@ -97,51 +92,6 @@ impl EndpointInfo {
}
None
}
pub(crate) fn get_allowed_vpc_endpoint_ids(
&self,
valid_since: Instant,
ignore_cache_since: Option<Instant>,
) -> Option<(Arc<Vec<String>>, bool)> {
if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids {
if valid_since < allowed_vpc_endpoint_ids.created_at {
return Some((
allowed_vpc_endpoint_ids.value.clone(),
Self::check_ignore_cache(
ignore_cache_since,
allowed_vpc_endpoint_ids.created_at,
),
));
}
}
None
}
pub(crate) fn get_block_public_or_vpc_access(
&self,
valid_since: Instant,
ignore_cache_since: Option<Instant>,
) -> Option<((bool, bool), bool)> {
if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access {
if valid_since < block_public_or_vpc_access.created_at {
return Some((
block_public_or_vpc_access.value.clone(),
Self::check_ignore_cache(
ignore_cache_since,
block_public_or_vpc_access.created_at,
),
));
}
}
None
}
pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) {
self.block_public_or_vpc_access = None;
}
pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) {
self.allowed_vpc_endpoint_ids = None;
}
pub(crate) fn invalidate_allowed_ips(&mut self) {
self.allowed_ips = None;
}
@@ -161,8 +111,6 @@ pub struct ProjectInfoCacheImpl {
cache: DashMap<EndpointIdInt, EndpointInfo>,
project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
// FIXME(stefan): we need a way to GC the account2ep map.
account2ep: DashMap<AccountIdInt, HashSet<EndpointIdInt>>,
config: ProjectInfoCacheOptions,
start_time: Instant,
@@ -172,59 +120,6 @@ pub struct ProjectInfoCacheImpl {
#[async_trait]
impl ProjectInfoCache for ProjectInfoCacheImpl {
fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>) {
info!(
"invalidating allowed vpc endpoint ids for projects `{}`",
project_ids.join(", ")
);
for project_id in project_ids {
let endpoints = self
.project2ep
.get(&project_id)
.map(|kv| kv.value().clone())
.unwrap_or_default();
for endpoint_id in endpoints {
if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
endpoint_info.invalidate_allowed_vpc_endpoint_ids();
}
}
}
}
fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) {
info!(
"invalidating allowed vpc endpoint ids for org `{}`",
account_id
);
let endpoints = self
.account2ep
.get(&account_id)
.map(|kv| kv.value().clone())
.unwrap_or_default();
for endpoint_id in endpoints {
if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
endpoint_info.invalidate_allowed_vpc_endpoint_ids();
}
}
}
fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) {
info!(
"invalidating block public or vpc access for project `{}`",
project_id
);
let endpoints = self
.project2ep
.get(&project_id)
.map(|kv| kv.value().clone())
.unwrap_or_default();
for endpoint_id in endpoints {
if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
endpoint_info.invalidate_block_public_or_vpc_access();
}
}
}
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
info!("invalidating allowed ips for project `{}`", project_id);
let endpoints = self
@@ -283,7 +178,6 @@ impl ProjectInfoCacheImpl {
Self {
cache: DashMap::new(),
project2ep: DashMap::new(),
account2ep: DashMap::new(),
config,
ttl_disabled_since_us: AtomicU64::new(u64::MAX),
start_time: Instant::now(),
@@ -332,7 +226,6 @@ impl ProjectInfoCacheImpl {
}
Some(Cached::new_uncached(value))
}
pub(crate) fn insert_role_secret(
&self,
project_id: ProjectIdInt,
@@ -363,16 +256,6 @@ impl ProjectInfoCacheImpl {
self.insert_project2endpoint(project_id, endpoint_id);
self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
}
pub(crate) fn insert_vpc_allowed_endpoint_ids(&self, account_id: AccountIdInt, project_id: ProjectIdInt, endpoint_id: EndpointIdInt, vpc_allowed_endpoint_ids: HashSet<EndpointIdInt>) {
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
}
self.insert_account2endpoint(account_id, endpoint_id);
self.insert_project2endpoint(project_id, endpoint_id);
self.cache.entry(endpoint_id).or_default().vpc_allowed_endpoint_ids = Some(vpc_allowed_endpoint_ids);
}
}
fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
endpoints.insert(endpoint_id);
@@ -381,13 +264,6 @@ impl ProjectInfoCacheImpl {
.insert(project_id, HashSet::from([endpoint_id]));
}
}
fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) {
if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) {
endpoints.insert(endpoint_id);
} else {
self.account2ep.insert(account_id, HashSet::from([endpoint_id]));
}
}
fn get_cache_times(&self) -> (Instant, Option<Instant>) {
let mut valid_since = Instant::now() - self.config.ttl;
// Only ignore cache if ttl is disabled.

View File

@@ -238,8 +238,6 @@ pub(crate) struct GetEndpointAccessControl {
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
pub(crate) project_id: Option<ProjectIdInt>,
pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
pub(crate) block_public_connections: Option<bool>,
pub(crate) block_vpc_connections: Option<bool>,
}
// Manually implement debug to omit sensitive info.

View File

@@ -7,7 +7,7 @@ use std::sync::OnceLock;
use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
use rustc_hash::FxHasher;
use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName};
use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
pub trait InternId: Sized + 'static {
fn get_interner() -> &'static StringInterner<Self>;
@@ -206,26 +206,6 @@ impl From<ProjectId> for ProjectIdInt {
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct AccountIdTag;
impl InternId for AccountIdTag {
fn get_interner() -> &'static StringInterner<Self> {
static ROLE_NAMES: OnceLock<StringInterner<AccountIdTag>> = OnceLock::new();
ROLE_NAMES.get_or_init(Default::default)
}
}
pub type AccountIdInt = InternedString<AccountIdTag>;
impl From<&AccountId> for AccountIdInt {
fn from(value: &AccountId) -> Self {
AccountIdTag::get_interner().get_or_intern(value)
}
}
impl From<AccountId> for AccountIdInt {
fn from(value: AccountId) -> Self {
AccountIdTag::get_interner().get_or_intern(&value)
}
}
#[cfg(test)]
mod tests {
use std::sync::OnceLock;

View File

@@ -556,9 +556,6 @@ pub enum RedisEventsCount {
CancelSession,
PasswordUpdate,
AllowedIpsUpdate,
AllowedVpcEndpointIdsUpdateForProjects,
AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
BlockPublicOrVpcAccessUpdate,
}
pub struct ThreadPoolWorkers(usize);

View File

@@ -11,7 +11,7 @@ use uuid::Uuid;
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::cache::project_info::ProjectInfoCache;
use crate::cancellation::{CancelMap, CancellationHandler};
use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt};
use crate::intern::{ProjectIdInt, RoleNameInt};
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
use tracing::Instrument;
@@ -39,27 +39,6 @@ pub(crate) enum Notification {
AllowedIpsUpdate {
allowed_ips_update: AllowedIpsUpdate,
},
#[serde(
rename = "/allowed_vpc_endpoint_ids_updated_for_projects",
deserialize_with = "deserialize_json_string"
)]
AllowedVpcEndpointIdsUpdateForProjects {
allowed_vpc_endpoint_ids_update_for_projects: AllowedVpcEndpointIdsUpdateForProjects,
},
#[serde(
rename = "/allowed_vpc_endpoint_ids_updated_for_org",
deserialize_with = "deserialize_json_string"
)]
AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
allowed_vpc_endpoint_ids_update_for_org: AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
},
#[serde(
rename = "/block_public_or_vpc_access_updated",
deserialize_with = "deserialize_json_string"
)]
BlockPublicOrVpcAccessUpdate {
block_public_or_vpc_access_update: BlockPublicOrVpcAccessUpdate,
},
#[serde(
rename = "/password_updated",
deserialize_with = "deserialize_json_string"
@@ -72,22 +51,6 @@ pub(crate) enum Notification {
pub(crate) struct AllowedIpsUpdate {
project_id: ProjectIdInt,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub(crate) struct AllowedVpcEndpointIdsUpdateForProjects {
project_ids: Vec<ProjectIdInt>,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub(crate) struct AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
account_id: AccountIdInt,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub(crate) struct BlockPublicOrVpcAccessUpdate {
project_id: ProjectIdInt,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub(crate) struct PasswordUpdate {
project_id: ProjectIdInt,
@@ -201,11 +164,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
}
}
}
Notification::AllowedIpsUpdate { .. }
| Notification::PasswordUpdate { .. }
| Notification::AllowedVpcEndpointIdsUpdateForProjects { .. }
| Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg { .. }
| Notification::BlockPublicOrVpcAccessUpdate { .. } => {
Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => {
invalidate_cache(self.cache.clone(), msg.clone());
if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
Metrics::get()
@@ -217,27 +176,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
.proxy
.redis_events_count
.inc(RedisEventsCount::PasswordUpdate);
} else if matches!(
msg,
Notification::AllowedVpcEndpointIdsUpdateForProjects { .. }
) {
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects);
} else if matches!(
msg,
Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg { .. }
) {
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg);
} else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdate { .. }) {
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate);
}
// It might happen that the invalid entry is on the way to be cached.
// To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
@@ -259,21 +197,6 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
Notification::AllowedIpsUpdate { allowed_ips_update } => {
cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
}
Notification::AllowedVpcEndpointIdsUpdateForProjects {
allowed_vpc_endpoint_ids_update_for_projects,
} => cache.invalidate_allowed_vpc_endpoint_ids_for_projects(
allowed_vpc_endpoint_ids_update_for_projects.project_ids,
),
Notification::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg {
allowed_vpc_endpoint_ids_update_for_org,
} => cache.invalidate_allowed_vpc_endpoint_ids_for_org(
allowed_vpc_endpoint_ids_update_for_org.account_id,
),
Notification::BlockPublicOrVpcAccessUpdate {
block_public_or_vpc_access_update,
} => cache.invalidate_block_public_or_vpc_access_for_project(
block_public_or_vpc_access_update.project_id,
),
Notification::PasswordUpdate { password_update } => cache
.invalidate_role_secret_for_project(
password_update.project_id,

View File

@@ -97,8 +97,6 @@ smol_str_wrapper!(EndpointId);
smol_str_wrapper!(BranchId);
// 90% of project strings are 23 characters or less.
smol_str_wrapper!(ProjectId);
// 90% of account strings are 23 characters or less.
smol_str_wrapper!(AccountId);
// will usually equal endpoint ID
smol_str_wrapper!(EndpointCacheKey);

View File

@@ -86,6 +86,8 @@ enum Command {
/// For safekeeper node_kind only, json list of timelines and their lsn info
#[arg(long, default_value = None)]
timeline_lsns: Option<String>,
#[arg(long, default_value_t = false)]
verbose: bool,
},
TenantSnapshot {
#[arg(long = "tenant-id")]
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
dump_db_connstr,
dump_db_table,
timeline_lsns,
verbose,
} => {
if let NodeKind::Safekeeper = node_kind {
let db_or_list = match (timeline_lsns, dump_db_connstr) {
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
tenant_ids,
json,
post_to_storcon,
verbose,
cli.exit_code,
)
.await
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
Vec::new(),
true,
post_to_storcon,
false, // default to non-verbose mode
exit_code,
)
.await?;
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
tenant_shard_ids: Vec<TenantShardId>,
json: bool,
post_to_storcon: bool,
verbose: bool,
exit_code: bool,
) -> anyhow::Result<()> {
if controller_client.is_none() && post_to_storcon {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)

View File

@@ -21,8 +21,12 @@ pub struct MetadataSummary {
tenant_count: usize,
timeline_count: usize,
timeline_shard_count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
/// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
/// The key is generated using `TenantShardTimelineId::to_string()`.
with_errors: HashMap<String, Vec<String>>,
/// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
/// The key is generated using `TenantShardTimelineId::to_string()`.
with_warnings: HashMap<String, Vec<String>>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
@@ -52,7 +56,12 @@ impl MetadataSummary {
}
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
fn update_analysis(
&mut self,
id: &TenantShardTimelineId,
analysis: &TimelineAnalysis,
verbose: bool,
) {
if analysis.is_healthy() {
self.healthy_tenant_shards.insert(id.tenant_shard_id);
} else {
@@ -61,11 +70,17 @@ impl MetadataSummary {
}
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
let entry = self.with_errors.entry(id.to_string()).or_default();
if verbose {
entry.extend(analysis.errors.iter().cloned());
}
}
if !analysis.warnings.is_empty() {
self.with_warnings.insert(*id);
let entry = self.with_warnings.entry(id.to_string()).or_default();
if verbose {
entry.extend(analysis.warnings.iter().cloned());
}
}
}
@@ -120,6 +135,7 @@ Index versions: {version_summary}
pub async fn scan_pageserver_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
verbose: bool,
) -> anyhow::Result<MetadataSummary> {
let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
mut tenant_objects: TenantObjectListing,
timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
highest_shard_count: ShardCount,
verbose: bool,
) {
summary.tenant_count += 1;
@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
Some(data),
)
.await;
summary.update_analysis(&ttid, &analysis);
summary.update_analysis(&ttid, &analysis, verbose);
timeline_ids.insert(ttid.timeline_id);
} else {
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
summary.update_data(&data);
match tenant_id {
None => {
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
Some(prev_tenant_id) => {
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
// New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
tenant_objects,
timelines,
highest_shard_count,
verbose,
)
.instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
.await;
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
}
None => {
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
}
match &data.blob_data {
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
tenant_objects,
tenant_timeline_results,
highest_shard_count,
verbose,
)
.instrument(info_span!("analyze-tenant", tenant = %tenant_id))
.await;

View File

@@ -4556,6 +4556,7 @@ class StorageScrubber:
def __init__(self, env: NeonEnv, log_dir: Path):
self.env = env
self.log_dir = log_dir
self.allowed_errors: list[str] = []
def scrubber_cli(
self, args: list[str], timeout, extra_env: dict[str, str] | None = None
@@ -4633,19 +4634,70 @@ class StorageScrubber:
if timeline_lsns is not None:
args.append("--timeline-lsns")
args.append(json.dumps(timeline_lsns))
if node_kind == NodeKind.PAGESERVER:
args.append("--verbose")
stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)
try:
summary = json.loads(stdout)
# summary does not contain "with_warnings" if node_kind is the safekeeper
no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
healthy = not summary["with_errors"] and no_warnings
healthy = self._check_run_healthy(summary)
return healthy, summary
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
raise
def _check_line_allowed(self, line: str) -> bool:
for a in self.allowed_errors:
try:
if re.match(a, line):
return True
except re.error:
log.error(f"Invalid regex: '{a}'")
raise
return False
def _check_line_list_allowed(self, lines: list[str]) -> bool:
for line in lines:
if not self._check_line_allowed(line):
return False
return True
def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
# summary does not contain "with_warnings" if node_kind is the safekeeper
healthy = True
with_warnings = summary.get("with_warnings", None)
if with_warnings is not None:
if isinstance(with_warnings, list):
if len(with_warnings) > 0:
# safekeeper scan_metadata output is a list of tenants
healthy = False
else:
for _, warnings in with_warnings.items():
assert (
len(warnings) > 0
), "with_warnings value should not be empty, running without verbose mode?"
if not self._check_line_list_allowed(warnings):
healthy = False
break
if not healthy:
return healthy
with_errors = summary.get("with_errors", None)
if with_errors is not None:
if isinstance(with_errors, list):
if len(with_errors) > 0:
# safekeeper scan_metadata output is a list of tenants
healthy = False
else:
for _, errors in with_errors.items():
assert (
len(errors) > 0
), "with_errors value should not be empty, running without verbose mode?"
if not self._check_line_list_allowed(errors):
healthy = False
break
return healthy
def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
stdout = self.scrubber_cli(
["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],

View File

@@ -0,0 +1,124 @@
import threading
import time
from fixtures.neon_fixtures import NeonEnv
BTREE_NUM_CYCLEID_PAGES = """
WITH raw_pages AS (
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
),
parsed_pages AS (
/* cycle ID is the last 2 bytes of the btree page */
SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
FROM raw_pages
)
SELECT count(*),
encode(cycle_id, 'hex')
FROM parsed_pages
WHERE encode(cycle_id, 'hex') != '0000'
GROUP BY encode(cycle_id, 'hex');
"""
def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
env = neon_simple_env
endpoint = env.endpoints.create_start("main")
ses1 = endpoint.connect().cursor()
ses1.execute("ALTER SYSTEM SET autovacuum = off;")
ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
ses1.execute("SELECT pg_reload_conf();")
ses1.execute("CREATE EXTENSION neon_test_utils;")
# prepare a large index
ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert (
len(pages) == 0
), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
# Delete enough tuples to clear the first index page.
# (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs.
ses1.execute("DELETE FROM t WHERE id <= 406;")
# Make sure the page is cleaned up
ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
# Do another delete-then-indexcleanup cycle, to move the pages from
# "dead" to "reusable"
ses1.execute("DELETE FROM t WHERE id <= 446;")
ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
# Make sure the vacuum we're about to trigger in s3 has cleanup work to do
ses1.execute("DELETE FROM t WHERE id <= 610;")
# Flush wal, for checking purposes
ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
ses2 = endpoint.connect().cursor()
ses3 = endpoint.connect().cursor()
# Session 2 pins a btree page, which prevents vacuum from processing that
# page, thus allowing us to reliably split pages while a concurrent vacuum
# is running.
ses2.execute("BEGIN;")
ses2.execute(
"DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
)
ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611
wait_evt = threading.Event()
# Session 3 runs the VACUUM command. Note that this will block, and
# therefore must run on another thread.
# We rely on this running quickly enough to hit the pinned page from
# session 2 by the time we start other work again in session 1, but
# technically there is a race where the thread (and/or PostgreSQL process)
# don't get to that pinned page with vacuum until >2s after evt.set() was
# called, and session 1 thus might already have split pages.
def vacuum_freeze_t(ses3, evt: threading.Event):
# Begin parallel vacuum that should hit the index
evt.set()
# this'll hang until s2 fetches enough new data from its cursor.
# this is technically a race with the time.sleep(2) below, but if this
# command doesn't hit
ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")
ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt))
ses3t.start()
wait_evt.wait()
# Make extra sure we got the thread started and vacuum is stuck, by waiting
# some time even after wait_evt got set. This isn't truly reliable (it is
# possible
time.sleep(2)
# Insert 2 pages worth of new data.
# This should reuse the one empty page, plus another page at the end of
# the index relation; with split ordering
# old_blk -> blkno=1 -> old_blk + 1.
# As this is run while vacuum in session 3 is happening, these splits
# should receive cycle IDs where applicable.
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
# unpin the btree page, allowing s3's vacuum to complete
ses2.execute("FETCH ALL FROM foo;")
ses2.execute("ROLLBACK;")
# flush WAL to make sure PS is up-to-date
ses1.execute("SELECT neon_xlogflush();")
# check that our expectations are correct
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert (
len(pages) == 1 and pages[0][0] == 3
), f"3 page splits with cycle ID expected; actual {pages}"
# final cleanup
ses3t.join()
ses1.close()
ses2.close()
ses3.close()

View File

@@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata(
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
neon_env_builder.disable_scrub_on_exit()
healthy, _ = env.storage_scrubber.scan_metadata()
assert not healthy
env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*")
healthy, _ = env.storage_scrubber.scan_metadata()
assert healthy
neon_env_builder.disable_scrub_on_exit() # We already ran scrubber, no need to do an extra run

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.2",
"a10d95be67265e0f10a422ba0457f5a7af01de71"
"01fa3c48664ca030cfb69bb4a350aa9df4691d88"
],
"v16": [
"16.6",
"dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
"81428621f7c04aed03671cf80a928e0a36d92505"
],
"v15": [
"15.10",
"972e325e62b455957adbbdd8580e31275bb5b8c9"
"8736b10c1d93d11b9c0489872dd529c4c0f5338f"
],
"v14": [
"14.15",
"373f9decad933d2d46f321231032ae8b0da81acd"
"13ff324150fceaac72920e01742addc053db9462"
]
}