mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
storcon: soft disable SK heartbeats (#11041)
## Problem JWT tokens aren't in place, so all SK heartbeats fail. This is equivalent to a wait before applying the PS heartbeats and makes things more flaky. ## Summary of Changes Add a flag that skips loading SKs from the db on start-up and at runtime.
This commit is contained in:
@@ -165,6 +165,8 @@ pub struct NeonStorageControllerConf {
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub long_reconcile_threshold: Option<Duration>,
|
||||
|
||||
pub load_safekeepers: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -188,6 +190,7 @@ impl Default for NeonStorageControllerConf {
|
||||
max_secondary_lag_bytes: None,
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
long_reconcile_threshold: None,
|
||||
load_safekeepers: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -537,6 +537,10 @@ impl StorageController {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if self.config.load_safekeepers {
|
||||
args.push("--load-safekeepers".to_string());
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
|
||||
@@ -138,6 +138,10 @@ struct Cli {
|
||||
// Flag to use https for requests to pageserver API.
|
||||
#[arg(long, default_value = "false")]
|
||||
use_https_pageserver_api: bool,
|
||||
|
||||
/// Whether to load safekeeprs from the database and heartbeat them
|
||||
#[arg(long, default_value = "false")]
|
||||
load_safekeepers: bool,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
@@ -350,6 +354,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
start_as_candidate: args.start_as_candidate,
|
||||
http_service_port: args.listen.port() as i32,
|
||||
use_https_pageserver_api: args.use_https_pageserver_api,
|
||||
load_safekeepers: args.load_safekeepers,
|
||||
};
|
||||
|
||||
// Validate that we can connect to the database
|
||||
|
||||
@@ -389,6 +389,8 @@ pub struct Config {
|
||||
pub long_reconcile_threshold: Duration,
|
||||
|
||||
pub use_https_pageserver_api: bool,
|
||||
|
||||
pub load_safekeepers: bool,
|
||||
}
|
||||
|
||||
impl From<DatabaseError> for ApiError {
|
||||
@@ -1405,15 +1407,20 @@ impl Service {
|
||||
.set(nodes.len() as i64);
|
||||
|
||||
tracing::info!("Loading safekeepers from database...");
|
||||
let safekeepers = persistence
|
||||
.list_safekeepers()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new()))
|
||||
.collect::<Vec<_>>();
|
||||
let safekeepers = if config.load_safekeepers {
|
||||
persistence
|
||||
.list_safekeepers()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new()))
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
tracing::info!("Skipping safekeeper loading");
|
||||
Default::default()
|
||||
};
|
||||
|
||||
let safekeepers: HashMap<NodeId, Safekeeper> =
|
||||
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
||||
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
|
||||
@@ -8054,7 +8061,8 @@ impl Service {
|
||||
) -> Result<(), DatabaseError> {
|
||||
let node_id = NodeId(record.id as u64);
|
||||
self.persistence.safekeeper_upsert(record.clone()).await?;
|
||||
{
|
||||
|
||||
if self.config.load_safekeepers {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let mut safekeepers = (*locked.safekeepers).clone();
|
||||
match safekeepers.entry(node_id) {
|
||||
@@ -8086,7 +8094,7 @@ impl Service {
|
||||
.await?;
|
||||
let node_id = NodeId(id as u64);
|
||||
// After the change has been persisted successfully, update the in-memory state
|
||||
{
|
||||
if self.config.load_safekeepers {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let mut safekeepers = (*locked.safekeepers).clone();
|
||||
let sk = safekeepers
|
||||
|
||||
@@ -1128,6 +1128,13 @@ class NeonEnv:
|
||||
if self.storage_controller_config is not None:
|
||||
cfg["storage_controller"] = self.storage_controller_config
|
||||
|
||||
# Disable new storcon flag in compat tests
|
||||
if config.test_may_use_compatibility_snapshot_binaries:
|
||||
if "storage_controller" in cfg:
|
||||
cfg["storage_controller"]["load_safekeepers"] = False
|
||||
else:
|
||||
cfg["storage_controller"] = {"load_safekeepers": False}
|
||||
|
||||
# Create config for pageserver
|
||||
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
|
||||
Reference in New Issue
Block a user