From ab751abdbd52eb35e5b1e92b41ec5761c898f36b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 21 May 2025 03:02:39 +0200 Subject: [PATCH] Initial draft --- libs/pageserver_api/src/models.rs | 2 +- pageserver/src/tenant.rs | 108 +++++++++++++++++- .../src/tenant/remote_timeline_client.rs | 46 +++++++- .../tenant/remote_timeline_client/download.rs | 38 +++++- .../tenant/remote_timeline_client/index.rs | 6 + 5 files changed, 190 insertions(+), 10 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5c86b37df4..4f3d86a962 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -408,7 +408,7 @@ pub enum TimelineCreateRequestMode { import_pgdata: TimelineCreateRequestModeImportPgdata, }, Template { - template_tenant_id: TenantId, + template_tenant_id: TenantShardId, template_timeline_id: TimelineId, }, // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b2a509ed60..367cdea1cc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -49,6 +49,7 @@ use remote_timeline_client::{ }; use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; +use storage_layer::Layer; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; use timeline::import_pgdata::ImportingTimeline; use timeline::offload::{OffloadError, offload_timeline}; @@ -107,7 +108,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mg static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); use utils::crashsafe; use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; @@ -879,7 +880,7 @@ pub(crate) struct CreateTimelineParamsBootstrap { #[derive(Debug)] pub(crate) struct CreateTimelineParamsTemplate { pub(crate) new_timeline_id: TimelineId, - pub(crate) template_tenant_id: TenantId, + pub(crate) template_tenant_id: TenantShardId, pub(crate) template_timeline_id: TimelineId, } @@ -929,6 +930,7 @@ pub(crate) enum CreateTimelineIdempotency { ancestor_start_lsn: Lsn, }, ImportPgdata(CreatingTimelineIdempotencyImportPgdata), + Template(TenantShardId, TimelineId), } #[derive(Debug, Clone, PartialEq, Eq)] @@ -2750,7 +2752,107 @@ impl TenantShard { template_tenant_id, template_timeline_id, } = params; - todo!() + + // 0. create a guard to prevent parallel creation attempts. + let timeline_create_guard = match self + .start_creating_timeline( + new_timeline_id, + CreateTimelineIdempotency::Template(template_tenant_id, template_timeline_id), + ) + .await? + { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)); + } + }; + + // 1. download the index part of the template timeline + // TODO can we hardcode the generation here or should we pass it as parameter? + let template_generation = Generation::new(1); + let (template_index_part, template_generation, _) = + remote_timeline_client::download_template_index_part( + &self.remote_storage, + &template_tenant_id, + &template_timeline_id, + template_generation, + &self.cancel, + ) + .await + .unwrap(); + + tracing::info!( + "downloaded template index_part.json with generation {}", + template_generation.get_suffix() + ); + + // 2. create the timeline, initializing the index part of the new timeline with the layers from the template + let template_metadata = &template_index_part.metadata; + + let new_metadata = TimelineMetadata::new( + template_metadata.disk_consistent_lsn(), + None, + None, + Lsn(0), + template_metadata.latest_gc_cutoff_lsn(), + template_metadata.initdb_lsn(), + template_metadata.pg_version(), + ); + let (mut raw_timeline, _timeline_ctx) = self + .prepare_new_timeline( + new_timeline_id, + &new_metadata, + timeline_create_guard, + template_metadata.initdb_lsn(), + None, + None, + ctx, + ) + .await?; + + let _tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; + raw_timeline + .write(|unfinished_timeline| async move { + // TODO make this more sophisticated: maybe a variant of load_layer_map? + + let layers = template_index_part + .layer_metadata + .iter() + .map(|(layer_name, metadata)| { + // TODO support local sharing of layers + let mut metadata = metadata.clone(); + metadata.template_ttid = Some((template_tenant_id, template_timeline_id)); + Layer::for_evicted( + self.conf, + &unfinished_timeline, + layer_name.clone(), + metadata, + ) + }) + .collect(); + unfinished_timeline + .layers + .blocking_write() + .open_mut() + .map_err(|_| CreateTimelineError::ShuttingDown)? + .initialize_local_layers(layers, template_metadata.disk_consistent_lsn()); + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); + + Ok(()) + }) + .await?; + + // All done! + let timeline = raw_timeline.finish_creation().await?; + + // Callers are responsible to wait for uploads to complete and for activating the timeline. + + // 4. finish + Ok(CreateTimelineResult::Created(timeline)) } /// The returned [`Arc`] is NOT in the [`TenantShard::timelines`] map until the import diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 21d68495f7..4ec67e8a4b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -189,8 +189,9 @@ use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::{ - download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, + download_index_part, download_initdb_tar_zst, download_template_index_part, + download_tenant_manifest, is_temp_download_file, list_remote_tenant_shards, + list_remote_timelines, }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; @@ -1768,7 +1769,7 @@ impl RemoteTimelineClient { adopted_as: &Layer, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let source_remote_path = remote_layer_path( + let source_remote_path = remote_layer_path_maybe_template( &self.tenant_shard_id.tenant_id, &adopted .get_timeline_id() @@ -1776,6 +1777,7 @@ impl RemoteTimelineClient { adopted.metadata().shard, &adopted.layer_desc().layer_name(), adopted.metadata().generation, + adopted.metadata().template_ttid, ); let target_remote_path = remote_layer_path( @@ -2684,6 +2686,31 @@ pub fn remote_layer_path( RemotePath::from_string(&path).expect("Failed to construct path") } +pub fn remote_layer_path_maybe_template( + tenant_id: &TenantId, + timeline_id: &TimelineId, + shard: ShardIndex, + layer_file_name: &LayerName, + generation: Generation, + template_ttid: Option<(TenantShardId, TimelineId)>, +) -> RemotePath { + if let Some((template_tenant_shard_id, template_timeline_id)) = template_ttid { + let template_tenant_id = template_tenant_shard_id.tenant_id; + let template_shard_id = template_tenant_shard_id.to_index(); + // Generation-aware key format + let path = format!( + "templates/{template_tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{template_timeline_id}/{1}{2}", + template_shard_id.get_suffix(), + layer_file_name, + generation.get_suffix() + ); + + RemotePath::from_string(&path).expect("Failed to construct path") + } else { + remote_layer_path(tenant_id, timeline_id, shard, layer_file_name, generation) + } +} + /// Returns true if a and b have the same layer path within a tenant/timeline. This is essentially /// remote_layer_path(a) == remote_layer_path(b) without the string allocations. /// @@ -2729,6 +2756,19 @@ pub fn remote_index_path( .expect("Failed to construct path") } +pub fn remote_template_index_path( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + generation: Generation, +) -> RemotePath { + RemotePath::from_string(&format!( + "templates/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", + IndexPart::FILE_NAME, + generation.get_suffix() + )) + .expect("Failed to construct path") +} + pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&format!( "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}" diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 84989e0fb8..3027af5816 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -29,7 +29,7 @@ use super::manifest::TenantManifest; use super::{ FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_initdb_preserved_archive_path, remote_template_index_path, remote_tenant_manifest_path, remote_tenant_manifest_prefix, remote_tenant_path, }; use crate::TEMP_FILE_SUFFIX; @@ -39,7 +39,9 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; use crate::tenant::Generation; -use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::remote_timeline_client::{ + remote_layer_path_maybe_template, remote_timelines_path, +}; use crate::tenant::storage_layer::LayerName; use crate::virtual_file; use crate::virtual_file::owned_buffers_io::write::FlushTaskError; @@ -68,12 +70,13 @@ pub async fn download_layer_file<'a>( let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let remote_path = remote_layer_path( + let remote_path = remote_layer_path_maybe_template( &tenant_shard_id.tenant_id, &timeline_id, layer_metadata.shard, layer_file_name, layer_metadata.generation, + layer_metadata.template_ttid, ); let (bytes_amount, temp_file) = download_retry( @@ -559,6 +562,35 @@ pub(crate) async fn download_index_part( .await } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. +/// +/// In this function we probe for the most recent index in a generation <= our current generation. +/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +pub(crate) async fn download_template_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + let index_prefix = remote_template_index_path(tenant_shard_id, timeline_id, Generation::none()); + download_generation_object( + storage, + tenant_shard_id, + Some(timeline_id), + my_generation, + "index_part", + index_prefix, + do_download_index_part, + parse_remote_index_path, + cancel, + ) + .await +} + pub(crate) async fn download_tenant_manifest( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index a5cd8989aa..8386b7c186 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -12,6 +12,7 @@ use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::is_same_remote_layer_path; use crate::tenant::Generation; @@ -233,6 +234,10 @@ pub struct LayerFileMetadata { #[serde(default = "ShardIndex::unsharded")] #[serde(skip_serializing_if = "ShardIndex::is_unsharded")] pub shard: ShardIndex, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub template_ttid: Option<(TenantShardId, TimelineId)>, } impl LayerFileMetadata { @@ -241,6 +246,7 @@ impl LayerFileMetadata { file_size, generation, shard, + template_ttid: None, } } /// Helper to get both generation and file size in a tuple