diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 15c37b9453..e4df81c9ad 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -48,13 +48,33 @@ pub enum TenantState { } impl TenantState { - pub fn has_in_progress_downloads(&self) -> bool { + pub fn attachment_status(&self) -> TenantAttachmentStatus { + use TenantAttachmentStatus::*; match self { - Self::Loading => true, - Self::Attaching => true, - Self::Active => false, - Self::Stopping => false, - Self::Broken { .. } => false, + // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. + // So, technically, we can return Attached here. + // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. + // But, our attach task might still be fetching the remote timelines, etc. + // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. + Self::Attaching => Maybe, + // tenant mgr startup distinguishes attaching from loading via marker file. + // If it's loading, there is no attach marker file, i.e., attach had finished in the past. + Self::Loading => Attached, + // We only reach Active after successful load / attach. + // So, call atttachment status Attached. + Self::Active => Attached, + // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. + // However, it also becomes Broken if the regular load fails. + // We would need a separate TenantState variant to distinguish these cases. + // However, there's no practical difference from Console's perspective. + // It will run a Postgres-level health check as soon as it observes Attached. + // That will fail on Broken tenants. + // Console can then rollback the attach, or, wait for operator to fix the Broken tenant. + Self::Broken { .. } => Attached, + // Why is Stopping a Maybe case? Because, during pageserver shutdown, + // we set the Stopping state irrespective of whether the tenant + // has finished attaching or not. + Self::Stopping => Maybe, } } @@ -209,16 +229,25 @@ impl TenantConfigRequest { } } +/// See [`TenantState::attachment_status`] and the OpenAPI docs for context. +#[derive(Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantAttachmentStatus { + Maybe, + Attached, +} + #[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, + // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint - pub has_in_progress_downloads: Option, + pub attachment_status: TenantAttachmentStatus, } /// This represents the output of the "timeline_detail" and "timeline_list" API calls. @@ -691,7 +720,7 @@ mod tests { id: TenantId::generate(), state: TenantState::Active, current_physical_size: Some(42), - has_in_progress_downloads: Some(false), + attachment_status: TenantAttachmentStatus::Attached, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -699,7 +728,7 @@ mod tests { "slug": "Active", }, "current_physical_size": 42, - "has_in_progress_downloads": false, + "attachment_status": "attached", }); let original_broken = TenantInfo { @@ -709,7 +738,7 @@ mod tests { backtrace: "backtrace info".into(), }, current_physical_size: Some(42), - has_in_progress_downloads: Some(false), + attachment_status: TenantAttachmentStatus::Attached, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -721,7 +750,7 @@ mod tests { } }, "current_physical_size": 42, - "has_in_progress_downloads": false, + "attachment_status": "attached", }); assert_eq!( diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 877b367b40..330587310f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -346,23 +346,23 @@ paths: starts writing to the tenant's S3 state unless it receives one of the distinguished errors below that state otherwise. - The method to identify whether a request has arrived at the pageserver, and - whether it has succeeded, is to poll for the tenant status to reach "Active" - or "Broken" state. These values are currently not explicitly documented in - the API spec. - Polling for `has_in_progress_downloads == false` is INCORRECT because that - value can turn `false` during shutdown while the Attach operation is still - unfinished. + If a client receives a not-distinguished response, e.g., a network timeout, + it MUST retry the /attach request and poll again for the tenant's + attachment status. + + After the client has received a 202, it MUST poll the tenant's + attachment status (field `attachment_status`) to reach state `attached`. + If the `attachment_status` is missing, the client MUST retry the `/attach` + request (goto previous paragraph). This is a robustness measure in case the tenant + status endpoint is buggy, but the attach operation is ongoing. There is no way to cancel an in-flight request. - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for tenant status. - - In any case, it must - * NOT ASSUME that the /attach request has been lost in the network, - * NOT ASSUME that the request has been lost based on a subsequent - tenant status request returning 404. (The request may still be in flight!) + In any case, the client + * MUST NOT ASSUME that the /attach request has been lost in the network, + * MUST NOT ASSUME that the request has been lost, based on the observation + that a subsequent tenant status request returns 404. The request may + still be in flight. It must be retried. responses: "202": description: Tenant attaching scheduled @@ -888,13 +888,27 @@ components: type: object required: - id + - attachment_status properties: id: type: string current_physical_size: type: integer - has_in_progress_downloads: - type: boolean + attachment_status: + description: | + Status of this tenant's attachment to this pageserver. + + - `maybe` means almost nothing, don't read anything into it + except for the fact that the pageserver _might_ be already + writing to the tenant's S3 state, so, DO NOT ATTACH the + tenant to any other pageserver, or we risk split-brain. + - `attached` means that the attach operation has completed, + maybe successfully, maybe not. Perform a health check at + the Postgres level to determine healthiness of the tenant. + + See the tenant `/attach` endpoint for more information. + type: string + enum: [ "maybe", "attached" ] TenantCreateInfo: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 58ee7367ca..19d8243b37 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -467,7 +467,7 @@ async fn tenant_list_handler(request: Request) -> Result, A id: *id, state: state.clone(), current_physical_size: None, - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) .collect::>(); @@ -492,7 +492,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro id: tenant_id, state: state.clone(), current_physical_size: Some(current_physical_size), - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1c6006493e..f97afe9cc9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -593,16 +593,19 @@ impl Tenant { /// finishes. You can use wait_until_active() to wait for the task to /// complete. /// - pub fn spawn_attach( + pub(crate) fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, ctx: &RequestContext, - ) -> Arc { + ) -> anyhow::Result> { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 let tenant_conf = TenantConfOpt::default(); + Self::attach_idempotent_create_marker_file(conf, tenant_id) + .context("create attach marker file")?; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( TenantState::Attaching, @@ -635,7 +638,46 @@ impl Tenant { Ok(()) }, ); - tenant + Ok(tenant) + } + + fn attach_idempotent_create_marker_file( + conf: &'static PageServerConf, + tenant_id: TenantId, + ) -> anyhow::Result<()> { + // Create directory with marker file to indicate attaching state. + // The load_local_tenants() function in tenant::mgr relies on the marker file + // to determine whether a tenant has finished attaching. + let tenant_dir = conf.tenant_path(&tenant_id); + let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id); + debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); + // TODO: should use tokio::fs here, but + // 1. caller is not async, for good reason (it holds tenants map lock) + // 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future + // doesn't wait for the activity in the fs thread pool. + crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; + match fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&marker_file) + { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // Either this is a retry of attach or there is a concurrent task also doing attach for this tenant. + // We cannot distinguish this here. + // The caller is responsible for ensuring there's no concurrent attach for a tenant. + {} // fsync again, we don't know if that already happened + } + err => { + err.context("create tenant attaching marker file")?; + unreachable!("we covered the Ok() case above"); + } + } + crashsafe::fsync_file_and_parent(&marker_file) + .context("fsync tenant attaching marker file and parent")?; + debug_assert!(tenant_dir.is_dir()); + debug_assert!(marker_file.is_file()); + Ok(()) } /// @@ -643,26 +685,15 @@ impl Tenant { /// #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { - // Create directory with marker file to indicate attaching state. - // The load_local_tenants() function in tenant::mgr relies on the marker file - // to determine whether a tenant has finished attaching. - let tenant_dir = self.conf.tenant_path(&self.tenant_id); let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); - debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); - if tenant_dir.exists() { - if !marker_file.is_file() { - anyhow::bail!( - "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}", - tenant_dir.display(), marker_file.display()); - } - } else { - crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; - fs::File::create(&marker_file).context("create tenant attaching marker file")?; - crashsafe::fsync_file_and_parent(&marker_file) - .context("fsync tenant attaching marker file and parent")?; + if !tokio::fs::try_exists(&marker_file) + .await + .context("check for existence of marker file")? + { + anyhow::bail!( + "implementation error: marker file should exist at beginning of this function" + ); } - debug_assert!(tenant_dir.is_dir()); - debug_assert!(marker_file.is_file()); // Get list of remote timelines // download index files for every tenant timeline @@ -839,11 +870,15 @@ impl Tenant { } /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc { + pub fn create_broken_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, + reason: String, + ) -> Arc { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); Arc::new(Tenant::new( TenantState::Broken { - reason: "create_broken_tenant".into(), + reason, backtrace: String::new(), }, conf, @@ -876,7 +911,7 @@ impl Tenant { Ok(conf) => conf, Err(e) => { error!("load tenant config failed: {:?}", e); - return Tenant::create_broken_tenant(conf, tenant_id); + return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")); } }; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 754316b3cd..8191880bb5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) + match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) { + Ok(tenant) => tenant, + Err(e) => { + error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + } + } } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant(conf, tenant_id) + Tenant::create_broken_tenant( + conf, + tenant_id, + "attaching mark file present but no remote storage configured".to_string(), + ) } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -466,7 +476,8 @@ pub async fn attach_tenant( "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); + let tenant = + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?; vacant_entry.insert(tenant); Ok(()) })