safekeeper: add eviction_min_resident to stop evictions thrashing (#8335)

## Problem

- The condition for eviction is not time-based: it is possible for a
timeline to be restored in response to a client, that client times out,
and then as soon as the timeline is restored it is immediately evicted
again.
- There is no delay on eviction at startup of the safekeeper, so when it
starts up and sees many idle timelines, it does many evictions which
will likely be immediately restored when someone uses the timeline.

## Summary of changes

- Add `eviction_min_resident` parameter, and use it in
`ready_for_eviction` to avoid evictions if the timeline has been
resident for less than this period.
- This also implicitly delays evictions at startup for
`eviction_min_resident`
- Set this to a very low number for the existing eviction test, which
expects immediate eviction.

The default period is 15 minutes. The general reasoning for that is that
in the worst case where we thrash ~10k timelines on one safekeeper,
downloading 16MB for each one, we should set a period that would not
overwhelm the node's bandwidth.
This commit is contained in:
John Spray
2024-07-10 19:38:14 +01:00
committed by Alex Chi Z
parent bea6532881
commit 547acde6cd
7 changed files with 56 additions and 14 deletions

View File

@@ -27,8 +27,8 @@ use utils::pid_file;
use metrics::set_build_info_metric;
use safekeeper::defaults::{
DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR,
DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT,
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::http;
@@ -194,6 +194,12 @@ struct Args {
/// Number of allowed concurrent uploads of partial segments to remote storage.
#[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)]
partial_backup_concurrency: usize,
/// How long a timeline must be resident before it is eligible for eviction.
/// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
/// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
/// if it weren't for `eviction_min_resident` preventing that.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
eviction_min_resident: Duration,
}
// Like PathBufValueParser, but allows empty string.
@@ -348,6 +354,7 @@ async fn main() -> anyhow::Result<()> {
delete_offloaded_wal: args.delete_offloaded_wal,
control_file_save_interval: args.control_file_save_interval,
partial_backup_concurrency: args.partial_backup_concurrency,
eviction_min_resident: args.eviction_min_resident,
};
// initialize sentry if SENTRY_DSN is provided

View File

@@ -53,6 +53,11 @@ pub mod defaults {
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
// By default, our required residency before eviction is the same as the period that passes
// before uploading a partial segment, so that in normal operation the eviction can happen
// as soon as we have done the partial segment upload.
pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT;
}
#[derive(Debug, Clone)]
@@ -93,6 +98,7 @@ pub struct SafeKeeperConf {
pub delete_offloaded_wal: bool,
pub control_file_save_interval: Duration,
pub partial_backup_concurrency: usize,
pub eviction_min_resident: Duration,
}
impl SafeKeeperConf {
@@ -136,6 +142,7 @@ impl SafeKeeperConf {
delete_offloaded_wal: false,
control_file_save_interval: Duration::from_secs(1),
partial_backup_concurrency: 1,
eviction_min_resident: Duration::ZERO,
}
}
}

View File

@@ -5,6 +5,7 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use remote_storage::RemotePath;
use std::time::Instant;
use tokio::{
fs::File,
io::{AsyncRead, AsyncWriteExt},
@@ -48,6 +49,7 @@ impl Manager {
.flush_lsn
.segment_number(self.wal_seg_size)
== self.last_removed_segno + 1
&& self.resident_since.elapsed() >= self.conf.eviction_min_resident
}
/// Evict the timeline to remote storage.
@@ -91,6 +93,8 @@ impl Manager {
return;
}
self.resident_since = Instant::now();
info!("successfully restored evicted timeline");
}
}

View File

@@ -186,6 +186,10 @@ pub(crate) struct Manager {
// misc
pub(crate) access_service: AccessService,
pub(crate) partial_backup_rate_limiter: RateLimiter,
// Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
// evict them if they go inactive very soon after being restored.
pub(crate) resident_since: std::time::Instant,
}
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -350,6 +354,7 @@ impl Manager {
access_service: AccessService::new(manager_tx),
tli,
partial_backup_rate_limiter,
resident_since: std::time::Instant::now(),
}
}

View File

@@ -188,6 +188,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
delete_offloaded_wal: false,
control_file_save_interval: Duration::from_secs(1),
partial_backup_concurrency: 1,
eviction_min_resident: Duration::ZERO,
};
let mut global = GlobalMap::new(disk, conf.clone())?;

View File

@@ -492,6 +492,7 @@ class NeonEnvBuilder:
pageserver_virtual_file_io_engine: Optional[str] = None,
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
safekeeper_extra_opts: Optional[list[str]] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -557,6 +558,8 @@ class NeonEnvBuilder:
self.pageserver_aux_file_policy = pageserver_aux_file_policy
self.safekeeper_extra_opts = safekeeper_extra_opts
assert test_name.startswith(
"test_"
), "Unexpectedly instantiated from outside a test function"
@@ -1193,7 +1196,9 @@ class NeonEnv:
sk_cfg[
"remote_storage"
] = self.safekeepers_remote_storage.to_toml_inline_table().strip()
self.safekeepers.append(Safekeeper(env=self, id=id, port=port))
self.safekeepers.append(
Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts)
)
cfg["safekeepers"].append(sk_cfg)
log.info(f"Config: {cfg}")
@@ -4016,16 +4021,28 @@ class Safekeeper(LogUtils):
id: int
running: bool = False
def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False):
def __init__(
self,
env: NeonEnv,
port: SafekeeperPort,
id: int,
running: bool = False,
extra_opts: Optional[List[str]] = None,
):
self.env = env
self.port = port
self.id = id
self.running = running
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
self.extra_opts = extra_opts
def start(
self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
) -> "Safekeeper":
if extra_opts is None:
# Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two.
extra_opts = self.extra_opts
assert self.running is False
self.env.neon_cli.safekeeper_start(
self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds

View File

@@ -2191,24 +2191,25 @@ def test_s3_eviction(
):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_timeout": "100ms",
}
)
extra_opts = [
neon_env_builder.safekeeper_extra_opts = [
"--enable-offload",
"--partial-backup-timeout",
"50ms",
"--control-file-save-interval",
"1s",
# Safekeepers usually wait a while before evicting something: for this test we want them to
# evict things as soon as they are inactive.
"--eviction-min-resident=100ms",
]
if delete_offloaded_wal:
extra_opts.append("--delete-offloaded-wal")
neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")
for sk in env.safekeepers:
sk.stop().start(extra_opts=extra_opts)
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_timeout": "100ms",
}
)
n_timelines = 5
@@ -2263,7 +2264,7 @@ def test_s3_eviction(
# restarting random safekeepers
for sk in env.safekeepers:
if random.random() < restart_chance:
sk.stop().start(extra_opts=extra_opts)
sk.stop().start()
time.sleep(0.5)
# require at least one successful eviction in at least one safekeeper