diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index d18b8d6885..0a7172bde2 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -212,9 +212,9 @@ pub struct PageServerConf {
     pub log_format: LogFormat,
-    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
-    /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
-    /// loading such tenants, vs. other work in the system.
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
+    ///
+    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
     pub concurrent_tenant_warmup: ConfigurableSemaphore,
     /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 5afb3ba63d..19b5fb7e79 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -579,6 +579,12 @@ paths:
         required: false
         schema:
           type: integer
+      - name: lazy
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: Set to true to make the attach lazy. A lazy attach queues up behind the warmup limit and is activated on demand (for example when a compute connects). Eager (false) is the default.
     put:
       description: |
         Configures a _tenant location_, that is how a particular pageserver handles
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 12bd21fd7b..9d92fbaee0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -816,13 +816,7 @@ async fn tenant_attach_handler(
     let tenant = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
         .await?;
     let Some(tenant) = tenant else {
@@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler(
     let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
     let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+    let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+    // lazy==true queues the attach up behind the warmup limit; a connecting compute still makes
+    // it jump the queue, matching the ordering used at startup.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
+
     let attached = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
         .await?
         .is_some();
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6a63a2adeb..f027e9d4b1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -227,7 +227,11 @@ pub(crate) struct TenantPreload {
 /// When we spawn a tenant, there is a special mode for tenant creation that
 /// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode { - Normal, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, + /// Tenant has been created during the lifetime of this process Create, } @@ -700,41 +704,37 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. - let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. 
- tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant @@ -749,6 +749,8 @@ impl Tenant { }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; @@ -756,7 +758,7 @@ impl Tenant { (SpawnMode::Create, _) => { None }, - (SpawnMode::Normal, Some(remote_storage)) => { + (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone .preload(remote_storage, task_mgr::shutdown_token()) @@ -769,7 +771,7 @@ impl Tenant { } } } - (SpawnMode::Normal, None) => { + (_, None) => { let _preload_timer = TENANT.preload.start_timer(); None } @@ -828,7 +830,7 @@ impl Tenant { let attached = { let _attach_timer = match mode { SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), }; tenant_clone.attach(preload, mode, &ctx).await }; @@ -850,7 +852,7 @@ impl Tenant { // It also prevents the warmup proccess competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. - if let AttachType::Warmup(_permit) = attach_type { + if matches!(attach_type, AttachType::Warmup { during_startup: true, .. 
}) { let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); while futs.next().await.is_some() {} @@ -923,7 +925,7 @@ impl Tenant { deleting: false, timelines: HashMap::new(), }, - (None, SpawnMode::Normal) => { + (None, _) => { anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -3769,7 +3771,7 @@ pub(crate) mod harness { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; + tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3d138da7af..ffb7206b1e 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -420,7 +420,7 @@ impl DeleteTenantFlow { .expect("cant be stopping or broken"); tenant - .attach(preload, super::SpawnMode::Normal, ctx) + .attach(preload, super::SpawnMode::Eager, ctx) .await .context("attach")?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8f0f73d4b5..805d44f93d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -595,7 +595,7 @@ pub async fn init_tenant_mgr( shard_identity, Some(init_order.clone()), &TENANTS, - SpawnMode::Normal, + SpawnMode::Lazy, &ctx, ) { Ok(tenant) => { @@ -1106,9 +1106,9 @@ impl TenantManager { // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. We must switch to - // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. 
- spawn_mode = SpawnMode::Normal; + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1300,7 +1300,7 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, + SpawnMode::Eager, ctx, )?; @@ -1521,7 +1521,7 @@ impl TenantManager { *child_shard, child_location_conf, None, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .await?; @@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant( shard_identity, None, &TENANTS, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index ad3efb5837..b8e20c451f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() body["tenant_id"] = str(tenant_id) @@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 327e5abe26..cbf7059c92 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -2,6 +2,7 @@ import concurrent.futures import math import random import time +from collections import defaultdict from contextlib import closing from pathlib import Path from typing import Optional @@ -14,6 +15,7 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, VanillaPostgres, wait_for_last_flush_lsn, @@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) # Deleting a stuck tenant should prompt it to go active + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 2 + + +def delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. 
So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") + def activated_on_demand(): + assert pageserver.log_contains(log_match) is not None + def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) + pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - log.info(f"Waiting for activation message '{log_match}'") try: wait_until(10, 1, activated_on_demand) @@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Poll for deletion to complete wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) - - # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one - # we detached) - wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 2 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): @@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): client.configure_failpoints( [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, 
other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. + + gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method)
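
For reference, a minimal sketch (not part of the diff) of how the new `lazy` flag is meant to be driven end to end, using only fixtures already exercised above (`tenant_location_conf`, `attach_hook_issue`, `tenant_status`, `wait_until`, `env.endpoints.create_start`) and assuming the imports already present in test_timeline_size.py; the helper name is hypothetical:

```python
# Hypothetical helper mirroring test_lazy_attach_activation above: attach a tenant
# lazily, then rely on on-demand access to make it jump the warmup queue.
def attach_lazily_then_activate_via_endpoint(env, pageserver_http, tenant_id):
    # Issue a generation and attach with lazy=True; the fixture forwards this as
    # ?lazy=true on PUT .../location_config, so the tenant sits in Attaching,
    # queued behind concurrent_tenant_warmup, instead of activating eagerly.
    gen = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
    pageserver_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": gen,
        },
        lazy=True,
    )

    def is_active():
        assert pageserver_http.tenant_status(tenant_id)["state"]["slug"] == "Active"

    # Any access from a compute makes the lazy tenant skip the queue, the same
    # on-demand path the SpawnMode::Lazy select! arm takes at startup.
    with env.endpoints.create_start("main", tenant_id=tenant_id):
        wait_until(10, 1, is_active)
```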