From baa5fa1e77b2194ec666c9beeeeec14fc330c920 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 5 Oct 2023 09:55:10 +0100
Subject: [PATCH] pageserver: location configuration API, attachment modes,
 secondary locations (#5299)

## Problem

These changes are part of building seamless tenant migration, as described in the RFC:
- https://github.com/neondatabase/neon/pull/5029

## Summary of changes

- A new configuration type `LocationConf` supersedes `TenantConfOpt` for storing a tenant's configuration in the pageserver repo dir. It contains `TenantConfOpt`, as well as a new `mode` attribute that describes what kind of location this is (secondary or attached, and if attached, in which attachment mode). It is written to a file called `config-v1` instead of `config` -- this prepares us to make other profound changes to the format of the file neatly in future. Forward compat for existing pageserver code is achieved by writing out both old and new style files. Backward compat is achieved by checking for the old-style file if the new one isn't found.
- The `TenantMap` type changes to hold `TenantSlot` instead of just `Tenant`. The `Tenant` type continues to be used for attached tenants only. Tenants in other states (such as secondaries) are represented by a different variant of `TenantSlot`.
- Where `Tenant` & `Timeline` used to hold an `Arc<RwLock<TenantConfOpt>>`, they now hold an `AttachedTenantConf` (behind the same lock), which includes the extra information from `LocationConf`. This enables them to know the current attachment mode.
- The attachment mode is used as an advisory input to decide whether to do compaction and GC (`AttachedStale` is meant to avoid doing uploads, `AttachedMulti` is meant to avoid doing deletions).
- A new HTTP API is added at `PUT /tenants/<tenant_id>/location_config` to drive new location configuration. This provides a superset of the functionality of attach/detach/load/ignore (illustrative sketches follow this list):
  - Attaching a tenant is just configuring it in an attached state
  - Detaching a tenant is configuring it to a detached state
  - Loading a tenant is just the same as attaching it
  - Ignoring a tenant is the same as configuring it into `Secondary` with warm=false (i.e. retain the files on disk but do nothing else)
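## Examples (illustrative sketches)

To make the new API concrete, here is a sketch of request bodies for the `location_config` endpoint. The field names follow the `LocationConfig`/`TenantLocationConfigRequest` models added in this PR, but the tenant id and generation values are invented, and encoding the generation as a bare integer is an assumption about the wire format:

```rust
// Illustrative only: field names come from the new `LocationConfig` model;
// the tenant id and generation values are made up.
use serde_json::json;

fn main() {
    // PUT /v1/tenant/<tenant_id>/location_config -- attach in generation 7:
    let attach = json!({
        "tenant_id": "3aa8fcc61f1bfb0f7d6a2c9b61f1bfb0",
        "mode": "AttachedSingle",
        "generation": 7,
        "secondary_conf": null,
        "tenant_conf": {}
    });

    // The same endpoint demotes the tenant to a warm secondary location
    // (no generation: secondary locations do not need one):
    let secondary = json!({
        "tenant_id": "3aa8fcc61f1bfb0f7d6a2c9b61f1bfb0",
        "mode": "Secondary",
        "secondary_conf": { "warm": true },
        "tenant_conf": {}
    });

    println!("{attach}\n{secondary}");
}
```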
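The attachment modes are what make a migration "seamless". A plausible orchestration, based only on the mode descriptions in this PR (the `put_location` helper and the pageserver/tenant names are hypothetical stand-ins for an HTTP client driving the endpoint):

```rust
// Hypothetical driver code: `put_location` stands in for an HTTP client
// calling PUT /v1/tenant/:tenant_id/location_config on one pageserver.
#[derive(Debug)]
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
}

fn put_location(pageserver: &str, tenant: &str, mode: Mode, generation: Option<u32>) {
    println!("{pageserver}: tenant {tenant} -> {mode:?}, generation {generation:?}");
}

fn main() {
    let (origin, destination, tenant) = ("pageserver-a", "pageserver-b", "mytenant");

    // 1. The origin's generation is about to be superseded: hint it to stop
    //    uploading (AttachedStale avoids uploads).
    put_location(origin, tenant, Mode::AttachedStale, Some(7));

    // 2. The destination attaches with a fresh generation; AttachedMulti
    //    avoids deletions while the stale origin may still be reading
    //    layers that its metadata references.
    put_location(destination, tenant, Mode::AttachedMulti, Some(8));

    // 3. After compute traffic cuts over, the destination becomes the sole
    //    attached location and resumes normal GC and compaction...
    put_location(destination, tenant, Mode::AttachedSingle, Some(8));

    // 4. ...and the origin is demoted to a warm secondary (or detached).
    put_location(origin, tenant, Mode::Secondary, None);
}
```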
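The `config`/`config-v1` compatibility scheme boils down to: always write both files, prefer the new one when reading. A minimal sketch of the read side, using plain std I/O (the real code parses these files as TOML into `LocationConf`/`TenantConfOpt`):

```rust
use std::path::Path;

// Sketch of the load-order rule described above; returns the raw file text.
fn load_tenant_config_text(tenant_dir: &Path) -> std::io::Result<Option<String>> {
    let new_style = tenant_dir.join("config-v1"); // serialized LocationConf
    let legacy = tenant_dir.join("config"); // legacy TenantConfOpt-only format

    if new_style.exists() {
        // New-style config takes precedence when both files are present.
        std::fs::read_to_string(&new_style).map(Some)
    } else if legacy.exists() {
        // Upgrade path: a legacy file implies an attached (Single) location
        // with no generation.
        std::fs::read_to_string(&legacy).map(Some)
    } else {
        // Neither file exists: config is expected to arrive via an attach call.
        Ok(None)
    }
}

fn main() {
    let _ = load_tenant_config_text(Path::new("/tmp/nonexistent-tenant"));
}
```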
Caveats:
- AttachedMulti tenants don't do compaction in this PR, but they do in the follow-on #5397
- Concurrent updates to the `location_config` API are not handled elegantly in this PR; a better mechanism is added in the follow-on https://github.com/neondatabase/neon/pull/5367
- Secondary mode is just a placeholder in this PR: the code to upload heatmaps and do downloads on secondary locations will be added in a later PR (but that shouldn't change any external interfaces)

Closes: https://github.com/neondatabase/neon/issues/5379

---------

Co-authored-by: Christian Schwarz
---
 libs/pageserver_api/src/models.rs |  46 ++++
 pageserver/src/config.rs          |  12 +-
 pageserver/src/http/routes.rs     |  55 ++++-
 pageserver/src/lib.rs             |   4 +
 pageserver/src/tenant.rs          | 282 +++++++++++++++-------
 pageserver/src/tenant/config.rs   | 207 ++++++++++++++++-
 pageserver/src/tenant/delete.rs   |   1 +
 pageserver/src/tenant/mgr.rs      | 372 ++++++++++++++++++++++++++----
 pageserver/src/tenant/timeline.rs |  30 ++-
 9 files changed, 866 insertions(+), 143 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 68620787bb..b2064cb7fe 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,6 +10,7 @@ use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
     completion,
+    generation::Generation,
     history_buffer::HistoryBufferWithDropCounter,
     id::{NodeId, TenantId, TimelineId},
     lsn::Lsn,
@@ -218,6 +219,8 @@ impl std::ops::Deref for TenantCreateRequest {
     }
 }
 
+/// An alternative representation of `pageserver::tenant::TenantConf` with
+/// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
     pub checkpoint_distance: Option<u64>,
@@ -243,6 +246,39 @@ pub struct TenantConfig {
     pub gc_feedback: Option<bool>,
 }
 
+/// A flattened analog of a `pageserver::tenant::LocationMode`, which
+/// lists out all possible states (and the virtual "Detached" state)
+/// in a flat form rather than using rust-style enums.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum LocationConfigMode {
+    AttachedSingle,
+    AttachedMulti,
+    AttachedStale,
+    Secondary,
+    Detached,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfigSecondary {
+    pub warm: bool,
+}
+
+/// An alternative representation of `pageserver::tenant::LocationConf`,
+/// for use in external-facing APIs.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfig {
+    pub mode: LocationConfigMode,
+    /// If attaching, in what generation?
+    #[serde(default)]
+    pub generation: Option<Generation>,
+
+    // If requesting mode `Secondary`, configuration for that.
+    #[serde(default)]
+    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // Custom storage configuration for the tenant, if any
+    pub tenant_conf: TenantConfig,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
@@ -253,6 +289,16 @@ pub struct StatusResponse {
     pub id: NodeId,
 }
 
+#[serde_as]
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 333d12a544..1460929e3a 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
     TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
 pub mod defaults {
@@ -631,10 +631,18 @@ impl PageServerConf {
 
     /// Points to a place in pageserver's local directory,
     /// where certain tenant's tenantconf file should be located.
+    ///
+    /// Legacy: superseded by tenant_location_config_path. Eventually
+    /// remove this function.
    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
         self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
     }
 
+    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_LOCATION_CONFIG_NAME)
+    }
+
     pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
         self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 4144eef018..f09e09c5a0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,7 +10,8 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
+    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
+    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -29,7 +30,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::TenantConfOpt;
+use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
     GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
@@ -150,7 +151,10 @@ impl From<TenantMapInsertError> for ApiError {
             TenantMapInsertError::TenantAlreadyExists(id, state) => {
                 ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
             }
-            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
+            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
         }
     }
 }
@@ -1011,6 +1015,48 @@ async fn update_tenant_config_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn put_tenant_location_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    let conf = state.conf;
+
+    // The `Detached` state is special, it doesn't upsert a tenant, it removes
+    // its local disk content and drops it from memory.
+    if let LocationConfigMode::Detached = request_data.config.mode {
+        mgr::detach_tenant(conf, tenant_id, true)
+            .instrument(info_span!("tenant_detach", %tenant_id))
+            .await?;
+        return json_response(StatusCode::OK, ());
+    }
+
+    let location_conf =
+        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
     r: Request<Body>,
@@ -1464,6 +1510,9 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_id/config", |r| {
             api_handler(r, get_tenant_config_handler)
         })
+        .put("/v1/tenant/:tenant_id/location_config", |r| {
+            api_handler(r, put_tenant_location_config_handler)
+        })
         .get("/v1/tenant/:tenant_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5f62c164bb..8199cd38e6 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -112,6 +112,10 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_CONFIG_NAME: &str = "config";
 
+/// Per-tenant configuration file.
+/// Full path: `tenants/<tenant_id>/config-v1`.
+pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f962549ffe..17e17d8d20 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -44,6 +44,8 @@ use std::sync::MutexGuard;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
+use self::config::AttachedLocationConfig;
+use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
@@ -64,6 +66,7 @@ use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETI
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -160,6 +163,28 @@ pub struct TenantSharedResources {
     pub deletion_queue_client: DeletionQueueClient,
 }
 
+/// A [`Tenant`] is really an _attached_ tenant. The configuration
+/// for an attached tenant is a subset of the [`LocationConf`], represented
+/// in this struct.
+pub(super) struct AttachedTenantConf {
+    tenant_conf: TenantConfOpt,
+    location: AttachedLocationConfig,
+}
+
+impl AttachedTenantConf {
+    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
+        match &location_conf.mode {
+            LocationMode::Attached(attach_conf) => Ok(Self {
+                tenant_conf: location_conf.tenant_conf,
+                location: attach_conf.clone(),
+            }),
+            LocationMode::Secondary(_) => {
+                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
+            }
+        }
+    }
+}
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -177,12 +202,15 @@ pub struct Tenant {
     // We keep TenantConfOpt sturct here to preserve the information
     // about parameters that are not set.
     // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<TenantConfOpt>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
 
     tenant_id: TenantId,
 
     /// The remote storage generation, used to protect S3 objects from split-brain.
     /// Does not change over the lifetime of the [`Tenant`] object.
+    ///
+    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
+    /// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
     generation: Generation,
 
     timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
@@ -526,14 +554,13 @@ impl Tenant {
     pub(crate) fn spawn_attach(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
-        generation: Generation,
         resources: TenantSharedResources,
+        attached_conf: AttachedTenantConf,
         tenants: &'static tokio::sync::RwLock<TenantsMap>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
         // TODO dedup with spawn_load
-        let tenant_conf =
-            Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
+        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
 
         let TenantSharedResources {
             broker_client,
             remote_storage,
             deletion_queue_client,
         } = resources;
 
-        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         let tenant = Arc::new(Tenant::new(
             TenantState::Attaching,
             conf,
-            tenant_conf,
+            attached_conf,
             wal_redo_manager,
             tenant_id,
-            generation,
             remote_storage.clone(),
             deletion_queue_client,
         ));
@@ -859,10 +884,9 @@ impl Tenant {
                 backtrace: String::new(),
             },
             conf,
-            TenantConfOpt::default(),
+            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
             wal_redo_manager,
             tenant_id,
-            Generation::broken(),
             None,
             DeletionQueueClient::broken(),
         ))
@@ -881,7 +905,7 @@ impl Tenant {
     pub(crate) fn spawn_load(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
-        generation: Generation,
+        attached_conf: AttachedTenantConf,
         resources: TenantSharedResources,
         init_order: Option<InitializationOrder>,
         tenants: &'static tokio::sync::RwLock<TenantsMap>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
         span::debug_assert_current_span_has_tenant_id();
 
-        let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) {
-            Ok(conf) => conf,
-            Err(e) => {
-                error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
-            }
-        };
-
         let broker_client = resources.broker_client;
         let remote_storage = resources.remote_storage;
 
@@ -904,10 +920,9 @@ impl Tenant {
         let tenant = Tenant::new(
             TenantState::Loading,
             conf,
-            tenant_conf,
+            attached_conf,
             wal_redo_manager,
             tenant_id,
-            generation,
             remote_storage.clone(),
             resources.deletion_queue_client.clone(),
         );
@@ -1646,6 +1661,15 @@ impl Tenant {
             "Cannot run GC iteration on inactive tenant"
         );
 
+        {
+            let conf = self.tenant_conf.read().unwrap();
+
+            if !conf.location.may_delete_layers_hint() {
+                info!("Skipping GC in location state {:?}", conf.location);
+
return Ok(GcResult::default()); + } + } + self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1664,6 +1688,14 @@ impl Tenant { "Cannot run compaction iteration on inactive tenant" ); + { + let conf = self.tenant_conf.read().unwrap(); + if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { + info!("Skipping compaction in location state {:?}", conf.location); + return Ok(()); + } + } + // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -2089,7 +2121,7 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - *self.tenant_conf.read().unwrap() + self.tenant_conf.read().unwrap().tenant_conf } pub fn effective_config(&self) -> TenantConf { @@ -2098,84 +2130,95 @@ impl Tenant { } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn 
get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - *self.tenant_conf.write().unwrap() = new_tenant_conf; + self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; + // Don't hold self.timelines.lock() during the notifies. + // There's no risk of deadlock right now, but there could be if we consolidate + // mutexes in struct Timeline in the future. + let timelines = self.list_timelines(); + for timeline in timelines { + timeline.tenant_conf_updated(); + } + } + + pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { + *self.tenant_conf.write().unwrap() = new_conf; // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. @@ -2245,10 +2288,9 @@ impl Tenant { fn new( state: TenantState, conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, + attached_conf: AttachedTenantConf, walredo_mgr: Arc, tenant_id: TenantId, - generation: Generation, remote_storage: Option, deletion_queue_client: DeletionQueueClient, ) -> Tenant { @@ -2308,12 +2350,12 @@ impl Tenant { Tenant { tenant_id, - generation, + generation: attached_conf.location.generation, conf, // using now here is good enough approximation to catch tenants with really long // activation times. loading_started_at: Instant::now(), - tenant_conf: Arc::new(RwLock::new(tenant_conf)), + tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, @@ -2331,52 +2373,123 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_id: &TenantId, - ) -> anyhow::Result { - let target_config_path = conf.tenant_config_path(tenant_id); + ) -> anyhow::Result { + let legacy_config_path = conf.tenant_config_path(tenant_id); + let config_path = conf.tenant_location_config_path(tenant_id); - info!("loading tenantconf from {target_config_path}"); + if config_path.exists() { + // New-style config takes precedence + let deserialized = Self::read_config(&config_path)?; + Ok(toml_edit::de::from_document::(deserialized)?) + } else if legacy_config_path.exists() { + // Upgrade path: found an old-style configuration only + let deserialized = Self::read_config(&legacy_config_path)?; - // FIXME If the config file is not found, assume that we're attaching - // a detached tenant and config is passed via attach command. - // https://github.com/neondatabase/neon/issues/1555 - // OR: we're loading after incomplete deletion that managed to remove config. 
- if !target_config_path.exists() { - info!("tenant config not found in {target_config_path}"); - return Ok(TenantConfOpt::default()); + let mut tenant_conf = TenantConfOpt::default(); + for (key, item) in deserialized.iter() { + match key { + "tenant_config" => { + tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { + format!("Failed to parse config from file '{legacy_config_path}' as pageserver config") + })?; + } + _ => bail!( + "config file {legacy_config_path} has unrecognized pageserver option '{key}'" + ), + } + } + + // Legacy configs are implicitly in attached state + Ok(LocationConf::attached_single( + tenant_conf, + Generation::none(), + )) + } else { + // FIXME If the config file is not found, assume that we're attaching + // a detached tenant and config is passed via attach command. + // https://github.com/neondatabase/neon/issues/1555 + // OR: we're loading after incomplete deletion that managed to remove config. + info!( + "tenant config not found in {} or {}", + config_path, legacy_config_path + ); + Ok(LocationConf::default()) } + } + + fn read_config(path: &Utf8Path) -> anyhow::Result { + info!("loading tenant configuration from {path}"); // load and parse file - let config = fs::read_to_string(&target_config_path) - .with_context(|| format!("Failed to load config from path '{target_config_path}'"))?; + let config = fs::read_to_string(path) + .with_context(|| format!("Failed to load config from path '{path}'"))?; - let toml = config.parse::().with_context(|| { - format!("Failed to parse config from file '{target_config_path}' as toml file") - })?; - - let mut tenant_conf = TenantConfOpt::default(); - for (key, item) in toml.iter() { - match key { - "tenant_config" => { - tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { - format!("Failed to parse config from file '{target_config_path}' as pageserver config") - })?; - } - _ => bail!( - "config file {target_config_path} has unrecognized pageserver option '{key}'" - ), - } - } - - Ok(tenant_conf) + config + .parse::() + .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) } #[tracing::instrument(skip_all, fields(%tenant_id))] pub(super) async fn persist_tenant_config( + conf: &'static PageServerConf, + tenant_id: &TenantId, + location_conf: &LocationConf, + ) -> anyhow::Result<()> { + let legacy_config_path = conf.tenant_config_path(tenant_id); + let config_path = conf.tenant_location_config_path(tenant_id); + Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf) + .await + } + + #[tracing::instrument(skip_all, fields(%tenant_id))] + pub(super) async fn persist_tenant_config_at( + tenant_id: &TenantId, + config_path: &Utf8Path, + legacy_config_path: &Utf8Path, + location_conf: &LocationConf, + ) -> anyhow::Result<()> { + // Forward compat: write out an old-style configuration that old versions can read, in case we roll back + Self::persist_tenant_config_legacy( + tenant_id, + legacy_config_path, + &location_conf.tenant_conf, + ) + .await?; + + if let LocationMode::Attached(attach_conf) = &location_conf.mode { + // Once we use LocationMode, generations are mandatory. If we aren't using generations, + // then drop out after writing legacy-style config. 
+ if attach_conf.generation.is_none() { + tracing::debug!("Running without generations, not writing new-style LocationConf"); + return Ok(()); + } + } + + info!("persisting tenantconf to {config_path}"); + + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + + let conf_content = conf_content.as_bytes(); + + let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); + VirtualFile::crashsafe_overwrite(config_path, &temp_path, conf_content) + .await + .with_context(|| format!("write tenant {tenant_id} config to {config_path}"))?; + Ok(()) + } + + #[tracing::instrument(skip_all, fields(%tenant_id))] + async fn persist_tenant_config_legacy( tenant_id: &TenantId, target_config_path: &Utf8Path, - tenant_conf: TenantConfOpt, + tenant_conf: &TenantConfOpt, ) -> anyhow::Result<()> { - // imitate a try-block with a closure info!("persisting tenantconf to {target_config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -3076,7 +3189,7 @@ pub(crate) enum CreateTenantFilesMode { pub(crate) async fn create_tenant_files( conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, + location_conf: &LocationConf, tenant_id: &TenantId, mode: CreateTenantFilesMode, ) -> anyhow::Result { @@ -3099,7 +3212,7 @@ pub(crate) async fn create_tenant_files( let creation_result = try_create_target_tenant_dir( conf, - tenant_conf, + location_conf, tenant_id, mode, &temporary_tenant_dir, @@ -3125,7 +3238,7 @@ pub(crate) async fn create_tenant_files( async fn try_create_target_tenant_dir( conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, + location_conf: &LocationConf, tenant_id: &TenantId, mode: CreateTenantFilesMode, temporary_tenant_dir: &Utf8Path, @@ -3155,14 +3268,26 @@ async fn try_create_target_tenant_dir( temporary_tenant_dir, ) .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; - let temporary_tenant_config_path = rebase_directory( + let temporary_legacy_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + let temporary_tenant_config_path = rebase_directory( + &conf.tenant_location_config_path(tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; - Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?; + Tenant::persist_tenant_config_at( + tenant_id, + &temporary_tenant_config_path, + &temporary_legacy_tenant_config_path, + location_conf, + ) + .await?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -3443,10 +3568,13 @@ pub mod harness { let tenant = Arc::new(Tenant::new( TenantState::Loading, self.conf, - TenantConfOpt::from(self.tenant_conf), + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt::from(self.tenant_conf), + self.generation, + )) + .unwrap(), walredo_mgr, self.tenant_id, - self.generation, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), )); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ffe2c5eab6..5f8c7f6c59 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ 
-13,6 +13,7 @@ use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
+use utils::generation::Generation;
 
 pub mod defaults {
     // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
@@ -44,7 +45,211 @@ pub mod defaults {
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
 
-/// Per-tenant configuration options
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum AttachmentMode {
+    /// Our generation is current as far as we know, and as far as we know we are the only attached
+    /// pageserver. This is the "normal" attachment mode.
+    Single,
+    /// Our generation number is current as far as we know, but we are advised that another
+    /// pageserver is still attached, and therefore to avoid executing deletions. This is
+    /// the attachment mode of a pageserver that is the destination of a migration.
+    Multi,
+    /// Our generation number is superseded, or about to be superseded. We are advised
+    /// to avoid remote storage writes if possible, and to avoid sending billing data. This
+    /// is the attachment mode of a pageserver that is the origin of a migration.
+    Stale,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct AttachedLocationConfig {
+    pub(crate) generation: Generation,
+    pub(crate) attach_mode: AttachmentMode,
+    // TODO: add a flag to override AttachmentMode's policies under
+    // disk pressure (i.e. unblock uploads under disk pressure in Stale
+    // state, unblock deletions after timeout in Multi state)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct SecondaryLocationConfig {
+    /// If true, keep the local cache warm by polling remote storage
+    pub(crate) warm: bool,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum LocationMode {
+    Attached(AttachedLocationConfig),
+    Secondary(SecondaryLocationConfig),
+}
+
+/// Per-tenant, per-pageserver configuration. All pageservers use the same TenantConf,
+/// but have distinct LocationConf.
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) struct LocationConf {
+    /// The location-specific part of the configuration, describes the operating
+    /// mode of this pageserver for this tenant.
+    pub(crate) mode: LocationMode,
+    /// The pan-cluster tenant configuration, the same on all locations
+    pub(crate) tenant_conf: TenantConfOpt,
+}
+
+impl std::fmt::Debug for LocationConf {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match &self.mode {
+            LocationMode::Attached(conf) => {
+                write!(
+                    f,
+                    "Attached {:?}, gen={:?}",
+                    conf.attach_mode, conf.generation
+                )
+            }
+            LocationMode::Secondary(conf) => {
+                write!(f, "Secondary, warm={}", conf.warm)
+            }
+        }
+    }
+}
+
+impl AttachedLocationConfig {
+    /// Consult attachment mode to determine whether we are currently permitted
+    /// to delete layers. This is only advisory, not required for data safety.
+    /// See [`AttachmentMode`] for more context.
+    pub(crate) fn may_delete_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match &self.attach_mode {
+            AttachmentMode::Single => true,
+            AttachmentMode::Multi | AttachmentMode::Stale => {
+                // In Multi mode we avoid doing deletions because some other
+                // attached pageserver might get 404 while trying to read
+                // a layer we delete which is still referenced in their metadata.
+                //
+                // In Stale mode, we avoid doing deletions because we expect
+                // that they would ultimately fail validation in the deletion
+                // queue due to our stale generation.
+                false
+            }
+        }
+    }
+
+    /// Whether we are currently hinted that it is worthwhile to upload layers.
+    /// This is only advisory, not required for data safety.
+    /// See [`AttachmentMode`] for more context.
+    pub(crate) fn may_upload_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match &self.attach_mode {
+            AttachmentMode::Single | AttachmentMode::Multi => true,
+            AttachmentMode::Stale => {
+                // In Stale mode, we avoid doing uploads because we expect that
+                // our replacement pageserver will already have started its own
+                // IndexPart that will never reference layers we upload: it is
+                // wasteful.
+                false
+            }
+        }
+    }
+}
+
+impl LocationConf {
+    /// For use when loading from a legacy configuration: presence of a tenant
+    /// implies it is in AttachmentMode::Single, which used to be the only
+    /// possible state. This function should eventually be removed.
+    pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf,
+        }
+    }
+
+    /// For use when attaching/re-attaching: update the generation stored in this
+    /// structure. If we were in a secondary state, promote to attached (possession
+    /// of a fresh generation implies this).
+    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+        match &mut self.mode {
+            LocationMode::Attached(attach_conf) => {
+                attach_conf.generation = generation;
+            }
+            LocationMode::Secondary(_) => {
+                // We are promoted to attached by the control plane's re-attach response
+                self.mode = LocationMode::Attached(AttachedLocationConfig {
+                    generation,
+                    attach_mode: AttachmentMode::Single,
+                })
+            }
+        }
+    }
+
+    pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
+        let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?;
+
+        fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
+            conf.generation
+                .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
+        }
+
+        let mode = match &conf.mode {
+            models::LocationConfigMode::AttachedMulti => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Multi,
+                })
+            }
+            models::LocationConfigMode::AttachedSingle => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Single,
+                })
+            }
+            models::LocationConfigMode::AttachedStale => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Stale,
+                })
+            }
+            models::LocationConfigMode::Secondary => {
+                anyhow::ensure!(conf.generation.is_none());
+
+                let warm = conf
+                    .secondary_conf
+                    .as_ref()
+                    .map(|c| c.warm)
+                    .unwrap_or(false);
+                LocationMode::Secondary(SecondaryLocationConfig { warm })
+            }
+            models::LocationConfigMode::Detached => {
+                // Should not have been called: API code should translate this mode
+                // into a detach rather than trying to decode it as a LocationConf
+                return Err(anyhow::anyhow!("Cannot decode a Detached configuration"));
+            }
+        };
+
+        Ok(Self { mode, tenant_conf })
+    }
+}
+
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    fn default() -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation: Generation::none(),
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf: TenantConfOpt::default(),
+        }
+    }
+}
+
+/// A tenant's calculated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
     // Flush out an inmemory layer, if it's holding WAL older than this
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 80e3f30150..d8763b0a64 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -197,6 +197,7 @@ async fn cleanup_remaining_fs_traces(
     };
 
     rm(conf.tenant_config_path(tenant_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_id), false).await?;
 
     fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
         Err(anyhow::anyhow!(
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 3abf50cf1e..7fbcb5041a 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -24,9 +24,11 @@ use crate::control_plane_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::TenantConfOpt;
+use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
+};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
@@ -38,6 +40,39 @@ use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;
 
+/// For a tenant that appears in TenantsMap, it may either be
+/// - `Attached`: has a full Tenant object, is eligible to service
+///    reads and ingest WAL.
+/// - `Secondary`: is only keeping a local cache warm.
+///
+/// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
+/// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
+/// its lifetime, and we can preserve some important safety invariants like `Tenant` always
+/// having a properly acquired generation (Secondary doesn't need a generation)
+#[derive(Clone)]
+pub enum TenantSlot {
+    Attached(Arc<Tenant>),
+    Secondary,
+}
+
+impl TenantSlot {
+    /// Return the `Tenant` in this slot if attached, else None
+    fn get_attached(&self) -> Option<&Arc<Tenant>> {
+        match self {
+            Self::Attached(t) => Some(t),
+            Self::Secondary => None,
+        }
+    }
+
+    /// Consume self and return the `Tenant` that was in this slot if attached, else None
+    fn into_attached(self) -> Option<Arc<Tenant>> {
+        match self {
+            Self::Attached(t) => Some(t),
+            Self::Secondary => None,
+        }
+    }
+}
+
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
pub(crate) enum TenantsMap { @@ -45,14 +80,27 @@ pub(crate) enum TenantsMap { Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_insert`]. - Open(HashMap>), + Open(HashMap), /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. /// Existing tenants are still accessible, but no new tenants can be created. - ShuttingDown(HashMap>), + ShuttingDown(HashMap), } impl TenantsMap { + /// Convenience function for typical usage, where we want to get a `Tenant` object, for + /// working with attached tenants. If the TenantId is in the map but in Secondary state, + /// None is returned. pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + m.get(tenant_id).and_then(TenantSlot::get_attached) + } + } + } + + /// Get the contents of the map at this tenant ID, even if it is in secondary state. + pub(crate) fn get_slot(&self, tenant_id: &TenantId) -> Option<&TenantSlot> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), @@ -61,7 +109,9 @@ impl TenantsMap { pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option> { match self { TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + m.remove(tenant_id).and_then(TenantSlot::into_attached) + } } } } @@ -205,19 +255,59 @@ pub async fn init_tenant_mgr( } }; + // Try loading the location configuration + let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id) + .context("load tenant config") + { + Ok(c) => c, + Err(e) => { + warn!("Marking tenant broken, failed to {e:#}"); + + tenants.insert( + tenant_id, + TenantSlot::Attached(Tenant::create_broken_tenant( + conf, + tenant_id, + "error loading tenant location configuration".to_string(), + )), + ); + + continue; + } + }; + let generation = if let Some(generations) = &tenant_generations { // We have a generation map: treat it as the authority for whether // this tenant is really attached. if let Some(gen) = generations.get(&tenant_id) { *gen } else { - info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!( - "Failed to remove detached tenant directory '{}': {:?}", - tenant_dir_path, e - ); - } + match &location_conf.mode { + LocationMode::Secondary(_) => { + // We do not require the control plane's permission for secondary mode + // tenants, because they do no remote writes and hence require no + // generation number + info!("Loaded tenant {tenant_id} in secondary mode"); + tenants.insert(tenant_id, TenantSlot::Secondary); + } + LocationMode::Attached(_) => { + // TODO: augment re-attach API to enable the control plane to + // instruct us about secondary attachments. That way, instead of throwing + // away local state, we can gracefully fall back to secondary here, if the control + // plane tells us so. 
+ // (https://github.com/neondatabase/neon/issues/5377) + info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response"); + if let Err(e) = + safe_remove_tenant_dir_all(&tenant_dir_path).await + { + error!( + "Failed to remove detached tenant directory '{}': {:?}", + tenant_dir_path, e + ); + } + } + }; + continue; } } else { @@ -230,18 +320,23 @@ pub async fn init_tenant_mgr( Generation::none() }; + // Presence of a generation number implies attachment: attach the tenant + // if it wasn't already, and apply the generation number. + location_conf.attach_in_generation(generation); + Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + match schedule_local_tenant_processing( conf, tenant_id, &tenant_dir_path, - generation, + AttachedTenantConf::try_from(location_conf)?, resources.clone(), Some(init_order.clone()), &TENANTS, &ctx, ) { Ok(tenant) => { - tenants.insert(tenant.tenant_id(), tenant); + tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant)); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -273,7 +368,7 @@ pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_id: TenantId, tenant_path: &Utf8Path, - generation: Generation, + location_conf: AttachedTenantConf, resources: TenantSharedResources, init_order: Option, tenants: &'static tokio::sync::RwLock, @@ -310,7 +405,7 @@ pub(crate) fn schedule_local_tenant_processing( "attaching mark file present but no remote storage configured".to_string(), ) } else { - match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) { + match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) { Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); @@ -322,7 +417,13 @@ pub(crate) fn schedule_local_tenant_processing( info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. Tenant::spawn_load( - conf, tenant_id, generation, resources, init_order, tenants, ctx, + conf, + tenant_id, + location_conf, + resources, + init_order, + tenants, + ctx, ) }; Ok(tenant) @@ -378,7 +479,16 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock) { let res = { let (_guard, shutdown_progress) = completion::channel(); - tenant.shutdown(shutdown_progress, freeze_and_flush).await + match tenant { + TenantSlot::Attached(t) => { + t.shutdown(shutdown_progress, freeze_and_flush).await + } + TenantSlot::Secondary => { + // TODO: once secondary mode downloads are implemented, + // ensure they have all stopped before we reach this point. + Ok(()) + } + } }; if let Err(other_progress) = res { @@ -451,16 +561,19 @@ pub async fn create_tenant( ctx: &RequestContext, ) -> Result, TenantMapInsertError> { tenant_map_insert(tenant_id, || async { + + let location_conf = LocationConf::attached_single(tenant_conf, generation); + // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?; + let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?; // TODO: tenant directory remains on disk if we bail out from here on. 
// See https://github.com/neondatabase/neon/issues/4233 let created_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, - generation, resources, None, &TENANTS, ctx)?; + AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -489,14 +602,126 @@ pub async fn set_new_tenant_config( info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; - let tenant_config_path = conf.tenant_config_path(&tenant_id); - Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf) + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); + + Tenant::persist_tenant_config(conf, &tenant_id, &location_conf) .await .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } +#[instrument(skip_all, fields(tenant_id, new_location_config))] +pub(crate) async fn upsert_location( + conf: &'static PageServerConf, + tenant_id: TenantId, + new_location_config: LocationConf, + broker_client: storage_broker::BrokerClientChannel, + remote_storage: Option, + deletion_queue_client: DeletionQueueClient, + ctx: &RequestContext, +) -> Result<(), anyhow::Error> { + info!("configuring tenant location {tenant_id} to state {new_location_config:?}"); + + let mut existing_tenant = match get_tenant(tenant_id, false).await { + Ok(t) => Some(t), + Err(GetTenantError::NotFound(_)) => None, + Err(e) => anyhow::bail!(e), + }; + + // If we need to shut down a Tenant, do that first + let shutdown_tenant = match (&new_location_config.mode, &existing_tenant) { + (LocationMode::Secondary(_), Some(t)) => Some(t), + (LocationMode::Attached(attach_conf), Some(t)) => { + if attach_conf.generation != t.generation { + Some(t) + } else { + None + } + } + _ => None, + }; + + // TODO: currently we risk concurrent operations interfering with the tenant + // while we await shutdown, but we also should not hold the TenantsMap lock + // across the whole operation. Before we start using this function in production, + // a follow-on change will revise how concurrency is handled in TenantsMap. + // (https://github.com/neondatabase/neon/issues/5378) + + if let Some(tenant) = shutdown_tenant { + let (_guard, progress) = utils::completion::channel(); + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + existing_tenant = None; + } + + if let Some(tenant) = existing_tenant { + // Update the existing tenant + Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + tenant.set_new_location_config(AttachedTenantConf::try_from(new_location_config)?); + } else { + // Upsert a fresh TenantSlot into TenantsMap. Do it within the map write lock, + // and re-check that the state of anything we are replacing is as expected. + tenant_map_upsert_slot(tenant_id, |old_value| async move { + if let Some(TenantSlot::Attached(t)) = old_value { + if !matches!(t.current_state(), TenantState::Stopping { .. 
}) { + anyhow::bail!("Tenant state changed during location configuration update"); + } + } + + let new_slot = match &new_location_config.mode { + LocationMode::Secondary(_) => TenantSlot::Secondary, + LocationMode::Attached(_attach_config) => { + // Do a schedule_local_tenant_processing + // FIXME: should avoid doing this disk I/O inside the TenantsMap lock, + // we have the same problem in load_tenant/attach_tenant. Probably + // need a lock in TenantSlot to fix this. + Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + let tenant_path = conf.tenant_path(&tenant_id); + let resources = TenantSharedResources { + broker_client, + remote_storage, + deletion_queue_client, + }; + let new_tenant = schedule_local_tenant_processing( + conf, + tenant_id, + &tenant_path, + AttachedTenantConf::try_from(new_location_config)?, + resources, + None, + &TENANTS, + ctx, + ) + .with_context(|| { + format!("Failed to schedule tenant processing in path {tenant_path:?}") + })?; + + TenantSlot::Attached(new_tenant) + } + }; + + Ok(new_slot) + }) + .await?; + } + + Ok(()) +} + #[derive(Debug, thiserror::Error)] pub enum GetTenantError { #[error("Tenant {0} not found")] @@ -657,7 +882,12 @@ pub async fn load_tenant( remote_storage, deletion_queue_client }; - let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) + + let mut location_conf = Tenant::load_tenant_config(conf, &tenant_id).map_err( TenantMapInsertError::Other)?; + location_conf.attach_in_generation(generation); + Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + + let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -710,7 +940,10 @@ pub async fn list_tenants() -> Result, TenantMapLis TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, }; Ok(m.iter() - .map(|(id, tenant)| (*id, tenant.current_state())) + .filter_map(|(id, tenant)| match tenant { + TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), + TenantSlot::Secondary => None, + }) .collect()) } @@ -727,7 +960,8 @@ pub async fn attach_tenant( ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { - let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?; + let location_conf = LocationConf::attached_single(tenant_conf, generation); + let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 @@ -738,8 +972,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - - let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -762,8 +995,10 @@ pub enum TenantMapInsertError { ShuttingDown, #[error("tenant {0} already exists, state: {1:?}")] TenantAlreadyExists(TenantId, TenantState), + #[error("tenant {0} already exists in secondary state")] + TenantExistsSecondary(TenantId), #[error(transparent)] - Closure(#[from] anyhow::Error), + Other(#[from] anyhow::Error), } /// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that @@ -787,20 +1022,47 @@ where TenantsMap::Open(m) => m, }; match m.entry(tenant_id) { - hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( - tenant_id, - e.get().current_state(), - )), + hash_map::Entry::Occupied(e) => match e.get() { + TenantSlot::Attached(t) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + t.current_state(), + )), + TenantSlot::Secondary => Err(TenantMapInsertError::TenantExistsSecondary(tenant_id)), + }, hash_map::Entry::Vacant(v) => match insert_fn().await { Ok(tenant) => { - v.insert(tenant.clone()); + v.insert(TenantSlot::Attached(tenant.clone())); Ok(tenant) } - Err(e) => Err(TenantMapInsertError::Closure(e)), + Err(e) => Err(TenantMapInsertError::Other(e)), }, } } +async fn tenant_map_upsert_slot<'a, F, R>( + tenant_id: TenantId, + upsert_fn: F, +) -> Result<(), TenantMapInsertError> +where + F: FnOnce(Option) -> R, + R: std::future::Future>, +{ + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + + match upsert_fn(m.remove(&tenant_id)).await { + Ok(upsert_val) => { + m.insert(tenant_id, upsert_val); + Ok(()) + } + Err(e) => Err(TenantMapInsertError::Other(e)), + } +} + /// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise. /// Allows to remove other tenant resources manually, via `tenant_cleanup`. /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal @@ -820,28 +1082,40 @@ where // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to // avoid holding the lock for the entire process. let tenant = { - tenants + match tenants .write() .await - .get(&tenant_id) - .cloned() + .get_slot(&tenant_id) .ok_or(TenantStateError::NotFound(tenant_id))? + { + TenantSlot::Attached(t) => Some(t.clone()), + TenantSlot::Secondary => None, + } }; // allow pageserver shutdown to await for our completion let (_guard, progress) = completion::channel(); - // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + // If the tenant was attached, shut it down gracefully. For secondary + // locations this part is not necessary + match tenant { + Some(attached_tenant) => { + // whenever we remove a tenant from memory, we don't want to flush and wait for upload + let freeze_and_flush = false; - // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so - // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { - Ok(()) => {} - Err(_other) => { - // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to - // wait for it but return an error right away because these are distinct requests. 
-            return Err(TenantStateError::IsStopping(tenant_id));
+                // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
+                // that we can continue safely to cleanup.
+                match attached_tenant.shutdown(progress, freeze_and_flush).await {
+                    Ok(()) => {}
+                    Err(_other) => {
+                        // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
+                        // wait for it but return an error right away because these are distinct requests.
+                        return Err(TenantStateError::IsStopping(tenant_id));
+                    }
+                }
+            }
+            None => {
+                // Nothing to wait on when not attached, proceed.
         }
     }
@@ -932,6 +1206,8 @@ mod tests {
     use std::sync::Arc;
     use tracing::{info_span, Instrument};
 
+    use crate::tenant::mgr::TenantSlot;
+
     use super::{super::harness::TenantHarness, TenantsMap};
 
     #[tokio::test(start_paused = true)]
@@ -953,7 +1229,7 @@ mod tests {
         // tenant harness configures the logging and we cannot escape it
         let _e = info_span!("testing", tenant_id = %id).entered();
 
-        let tenants = HashMap::from([(id, t.clone())]);
+        let tenants = HashMap::from([(id, TenantSlot::Attached(t.clone()))]);
         let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
 
         let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5b670e7dc5..4f28b03c67 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -91,12 +91,12 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
 use super::config::TenantConf;
-use super::debug_assert_current_span_has_tenant_and_timeline_id;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
     AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
 };
+use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -149,7 +149,7 @@ pub struct TimelineResources {
 pub struct Timeline {
     conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<TenantConfOpt>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
 
     myself: Weak<Self>,
 
@@ -158,6 +158,9 @@
     /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
     /// Never changes for the lifetime of this [`Timeline`] object.
+    ///
+    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
+    /// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
generation: Generation, pub pg_version: u32, @@ -1378,42 +1381,42 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) @@ -1429,7 +1432,7 @@ impl Timeline { } fn get_gc_feedback(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap(); + let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .gc_feedback .unwrap_or(self.conf.default_tenant_conf.gc_feedback) @@ -1442,7 +1445,7 @@ impl Timeline { // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap(), + &self.tenant_conf.read().unwrap().tenant_conf, &self.conf.default_tenant_conf, ); let tenant_id_str = self.tenant_id.to_string(); @@ -1461,7 +1464,7 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, @@ -1484,7 +1487,7 @@ impl Timeline { let evictions_low_residence_duration_metric_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard, + &tenant_conf_guard.tenant_conf, &conf.default_tenant_conf, ); drop(tenant_conf_guard); @@ -1649,12 +1652,15 @@ impl Timeline { let tenant_conf_guard = self.tenant_conf.read().unwrap(); let wal_connect_timeout = tenant_conf_guard + .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); let lagging_wal_timeout = tenant_conf_guard + .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); let max_lsn_wal_lag = tenant_conf_guard + .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard);