feat: support lazy, queued tenant attaches (#6907)

Add off-by-default support for lazy, queued tenant activation on attach.
This should be useful for bulk migrations, where some tenants will be
activated sooner due to incoming operations or endpoint startup.
Eventually all tenants get activated by reusing the same mechanism we
already have at startup (`PageServerConf::concurrent_tenant_warmup`).

The difference between lazily attached tenants and tenants attached at
startup is that for lazily attached tenants we leave the initial logical
size calculation to be triggered by the WalReceiver or by consumption
metrics.
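
To illustrate the queueing mechanism, here is a minimal, self-contained sketch
using simplified names rather than the pageserver's actual types: each lazily
attached tenant waits for either a permit from a bounded warmup semaphore or an
explicit "activate now" signal that lets it jump the queue. In the real code the
bounded semaphore is `PageServerConf::concurrent_tenant_warmup` and the
jump-the-queue path is the tenant's `activate_now_sem`.

```rust
// Sketch only; requires tokio = { version = "1", features = ["full"] }.
use std::sync::Arc;
use tokio::sync::{Notify, Semaphore};

// Wait until this tenant is allowed to activate: either the warmup queue
// reaches it, or something explicitly asks for it and it jumps the queue.
async fn wait_for_activation(warmup: Arc<Semaphore>, activate_now: Arc<Notify>) {
    tokio::select! {
        // Explicit request (e.g. a compute connection): jump the queue.
        _ = activate_now.notified() => {
            println!("activating on demand (jumped the queue)");
        }
        // Our turn in the bounded background warmup queue.
        permit = warmup.acquire() => {
            let _permit = permit.expect("warmup semaphore is never closed");
            println!("activating via warmup queue");
        }
    }
}

#[tokio::main]
async fn main() {
    // Allow two tenants to warm up concurrently; the rest queue behind them.
    let warmup = Arc::new(Semaphore::new(2));
    let mut tasks = Vec::new();
    for _ in 0..5 {
        let activate_now = Arc::new(Notify::new());
        tasks.push(tokio::spawn(wait_for_activation(warmup.clone(), activate_now)));
    }
    for task in tasks {
        task.await.unwrap();
    }
}
```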

Fixes: #6315

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
Author: Joonas Koivunen
Date: 2024-02-29 13:26:29 +02:00
Committed by: GitHub
Parent: d04af08567
Commit: 4d426f6fbe
8 changed files with 255 additions and 73 deletions

View File

@@ -212,9 +212,9 @@ pub struct PageServerConf {
     pub log_format: LogFormat,
-    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
-    /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
-    /// loading such tenants, vs. other work in the system.
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
+    ///
+    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
     pub concurrent_tenant_warmup: ConfigurableSemaphore,
     /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
View File

@@ -579,6 +579,12 @@ paths:
         required: false
         schema:
           type: integer
+      - name: lazy
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
     put:
       description: |
         Configures a _tenant location_, that is how a particular pageserver handles
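
For context, a hedged sketch of how a client might use the new query parameter.
The base URL, the placeholder tenant id, and the abbreviated request body are
illustrative assumptions, not part of this change; see the spec above and
`TenantLocationConfigRequest` for the real schema.

```rust
// Sketch only: issue a lazy attach via the location_config endpoint.
// Assumes reqwest with the "json" feature and a pageserver management API
// listening on localhost:9898.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tenant_shard_id = "<tenant_shard_id>"; // placeholder
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/location_config?lazy=true"
    );
    // Illustrative body; the real request carries more fields.
    let body = json!({ "mode": "AttachedSingle", "generation": 1, "tenant_conf": {} });

    let resp = reqwest::Client::new().put(url).json(&body).send().await?;
    println!("lazy attach request returned HTTP {}", resp.status());
    Ok(())
}
```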

View File

@@ -816,13 +816,7 @@ async fn tenant_attach_handler(
     let tenant = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
         .await?;
     let Some(tenant) = tenant else {
@@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler(
     let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
     let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+    let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+    // lazy==true queues up for activation or jumps the queue like normal when a compute connects,
+    // similar to at startup ordering.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
     let attached = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
         .await?
         .is_some();

View File

@@ -227,7 +227,11 @@ pub(crate) struct TenantPreload {
 /// When we spawn a tenant, there is a special mode for tenant creation that
 /// avoids trying to read anything from remote storage.
 pub(crate) enum SpawnMode {
-    Normal,
+    /// Activate as soon as possible
+    Eager,
+    /// Lazy activation in the background, with the option to skip the queue if the need comes up
+    Lazy,
     /// Tenant has been created during the lifetime of this process
     Create,
 }
@@ -700,41 +704,37 @@ impl Tenant {
             .and_then(|x| x.initial_tenant_load_remote.take());
         enum AttachType<'a> {
-            // During pageserver startup, we are attaching this tenant lazily in the background
-            Warmup(tokio::sync::SemaphorePermit<'a>),
-            // During pageserver startup, we are attaching this tenant as soon as we can,
-            // because a client tried to access it.
+            /// We are attaching this tenant lazily in the background.
+            Warmup {
+                _permit: tokio::sync::SemaphorePermit<'a>,
+                during_startup: bool
+            },
+            /// We are attaching this tenant as soon as we can, because for example an
+            /// endpoint tried to access it.
             OnDemand,
-            // During normal operations after startup, we are attaching a tenant.
+            /// During normal operations after startup, we are attaching a tenant, and
+            /// eager attach was requested.
             Normal,
         }
-        // Before doing any I/O, wait for either or:
-        // - A client to attempt to access to this tenant (on-demand loading)
-        // - A permit to become available in the warmup semaphore (background warmup)
-        //
-        // Some-ness of init_order is how we know if we're attaching during startup or later
-        // in process lifetime.
-        let attach_type = if init_order.is_some() {
+        let attach_type = if matches!(mode, SpawnMode::Lazy) {
+            // Before doing any I/O, wait for at least one of:
+            // - A client attempting to access to this tenant (on-demand loading)
+            // - A permit becoming available in the warmup semaphore (background warmup)
             tokio::select!(
-                _ = tenant_clone.activate_now_sem.acquire() => {
+                permit = tenant_clone.activate_now_sem.acquire() => {
+                    let _ = permit.expect("activate_now_sem is never closed");
                     tracing::info!("Activating tenant (on-demand)");
                     AttachType::OnDemand
                 },
-                permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
-                    match permit_result {
-                        Ok(p) => {
-                            tracing::info!("Activating tenant (warmup)");
-                            AttachType::Warmup(p)
-                        }
-                        Err(_) => {
-                            // This is unexpected: the warmup semaphore should stay alive
-                            // for the lifetime of init_order. Log a warning and proceed.
-                            tracing::warn!("warmup_limit semaphore unexpectedly closed");
-                            AttachType::Normal
-                        }
+                permit = conf.concurrent_tenant_warmup.inner().acquire() => {
+                    let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
+                    tracing::info!("Activating tenant (warmup)");
+                    AttachType::Warmup {
+                        _permit,
+                        during_startup: init_order.is_some()
+                    }
                 }
                 _ = tenant_clone.cancel.cancelled() => {
                     // This is safe, but should be pretty rare: it is interesting if a tenant
@@ -749,6 +749,8 @@ impl Tenant {
                 },
             )
         } else {
+            // SpawnMode::{Create,Eager} always cause jumping ahead of the
+            // concurrent_tenant_warmup queue
             AttachType::Normal
         };
@@ -756,7 +758,7 @@ impl Tenant {
             (SpawnMode::Create, _) => {
                 None
             },
-            (SpawnMode::Normal, Some(remote_storage)) => {
+            (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
                 let _preload_timer = TENANT.preload.start_timer();
                 let res = tenant_clone
                     .preload(remote_storage, task_mgr::shutdown_token())
@@ -769,7 +771,7 @@
                     }
                 }
             }
-            (SpawnMode::Normal, None) => {
+            (_, None) => {
                 let _preload_timer = TENANT.preload.start_timer();
                 None
             }
@@ -828,7 +830,7 @@ impl Tenant {
         let attached = {
             let _attach_timer = match mode {
                 SpawnMode::Create => None,
-                SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
             };
             tenant_clone.attach(preload, mode, &ctx).await
         };
@@ -850,7 +852,7 @@ impl Tenant {
             // It also prevents the warmup proccess competing with the concurrency limit on
             // logical size calculations: if logical size calculation semaphore is saturated,
             // then warmup will wait for that before proceeding to the next tenant.
-            if let AttachType::Warmup(_permit) = attach_type {
+            if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
                 let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                 tracing::info!("Waiting for initial logical sizes while warming up...");
                 while futs.next().await.is_some() {}
@@ -923,7 +925,7 @@ impl Tenant {
                 deleting: false,
                 timelines: HashMap::new(),
             },
-            (None, SpawnMode::Normal) => {
+            (None, _) => {
                 anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
             }
         };
@@ -3769,7 +3771,7 @@ pub(crate) mod harness {
         let preload = tenant
             .preload(&self.remote_storage, CancellationToken::new())
             .await?;
-        tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
+        tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
         tenant.state.send_replace(TenantState::Active);
         for timeline in tenant.timelines.lock().unwrap().values() {

View File

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
             .expect("cant be stopping or broken");
         tenant
-            .attach(preload, super::SpawnMode::Normal, ctx)
+            .attach(preload, super::SpawnMode::Eager, ctx)
             .await
             .context("attach")?;

View File

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
             shard_identity,
             Some(init_order.clone()),
             &TENANTS,
-            SpawnMode::Normal,
+            SpawnMode::Lazy,
             &ctx,
         ) {
             Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
                 // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
                 // the caller thinks they're creating but the tenant already existed. We must switch to
-                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
                 // rather than assuming it to be empty.
-                spawn_mode = SpawnMode::Normal;
+                spawn_mode = SpawnMode::Eager;
             }
             Some(TenantSlot::Secondary(state)) => {
                 info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
             shard_identity,
             None,
             self.tenants,
-            SpawnMode::Normal,
+            SpawnMode::Eager,
             ctx,
         )?;
@@ -1521,7 +1521,7 @@ impl TenantManager {
                 *child_shard,
                 child_location_conf,
                 None,
-                SpawnMode::Normal,
+                SpawnMode::Eager,
                 ctx,
             )
             .await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
         shard_identity,
         None,
         &TENANTS,
-        SpawnMode::Normal,
+        SpawnMode::Eager,
         ctx,
     )
     .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;