Compare commits

...

1 Commits

Author SHA1 Message Date
Arseny Sher
b669e21205 safekeeper: make --partial-backup-timeout respect min.
We added jitter to --partial-backup-timeout and --eviction-min-resident
values by taking rand of them. To respect the min value semantics of
these use n + rand(n) instead.
2024-11-12 11:04:35 +03:00
4 changed files with 19 additions and 11 deletions

View File

@@ -171,6 +171,7 @@ struct Args {
#[arg(long)]
walsenders_keep_horizon: bool,
/// Controls how long backup will wait until uploading the partial segment.
/// The actual waiting time is passed value + rand of it to add some jitter.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
partial_backup_timeout: Duration,
/// Disable task to push messages to broker every second. Supposed to
@@ -195,6 +196,7 @@ struct Args {
/// if it weren't for `eviction_min_resident` preventing that.
///
/// Also defines interval for eviction retries.
/// The actual waiting time is passed value + rand of it to add some jitter.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
eviction_min_resident: Duration,
}

View File

@@ -18,7 +18,6 @@ use crate::{
metrics::{
EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
},
rate_limit::rand_duration,
timeline_manager::{Manager, StateSnapshot},
wal_backup,
wal_backup_partial::{self, PartialRemoteSegment},
@@ -131,8 +130,7 @@ impl Manager {
return;
}
self.evict_not_before =
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
self.update_evict_not_before();
info!("successfully restored evicted timeline");
NUM_EVICTED_TIMELINES.dec();

View File

@@ -315,15 +315,13 @@ pub async fn main_task(
mgr.set_status(Status::EvictTimeline);
if !mgr.evict_timeline().await {
// eviction failed, try again later
mgr.evict_not_before =
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
mgr.update_evict_not_before();
update_next_event(&mut next_event, mgr.evict_not_before);
}
}
None => {
// we can't evict timeline now, will try again later
mgr.evict_not_before =
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
mgr.update_evict_not_before();
update_next_event(&mut next_event, mgr.evict_not_before);
}
}
@@ -430,7 +428,9 @@ impl Manager {
tli,
global_rate_limiter,
// to smooth out evictions spike after restart
evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
evict_not_before: Instant::now()
+ conf.eviction_min_resident
+ rand_duration(&conf.eviction_min_resident),
conf,
}
}
@@ -720,6 +720,14 @@ impl Manager {
}
}
}
pub fn update_evict_not_before(&mut self) {
// Wait at least eviction_min_resident and add randomized value of it to
// add jitter.
self.evict_not_before = Instant::now()
+ self.conf.eviction_min_resident
+ rand_duration(&self.conf.eviction_min_resident)
}
}
// utility functions

View File

@@ -426,12 +426,12 @@ pub async fn main_task(
cancel: CancellationToken,
) -> Option<PartialRemoteSegment> {
debug!("started");
let await_duration = conf.partial_backup_timeout;
let mut first_iteration = true;
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
let partial_backup_timeout = conf.partial_backup_timeout;
let mut backup = PartialBackup::new(tli, conf).await;
debug!("state: {:?}", backup.state);
@@ -489,9 +489,9 @@ pub async fn main_task(
// if this is not the first iteration, we will wait for the full await_duration
let await_duration = if first_iteration {
first_iteration = false;
rand_duration(&await_duration)
partial_backup_timeout + rand_duration(&partial_backup_timeout)
} else {
await_duration
partial_backup_timeout
};
// fixing the segno and waiting some time to prevent reuploading the same segment too often