From 77a73ff36fdb0ead58caa6180aa3262d1a27c616 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Wed, 14 Jun 2023 22:14:09 +0300
Subject: [PATCH] Fix build problems with create_timeline_replica

---
 control_plane/src/pageserver.rs              |   2 +
 libs/remote_storage/src/lib.rs               |  12 +-
 pageserver/ctl/src/main.rs                   |   3 +
 pageserver/src/tenant.rs                     | 143 ++++++++++--------
 pageserver/src/tenant/config.rs              |   8 +-
 pageserver/src/tenant/metadata.rs            |   9 +-
 pageserver/src/tenant/mgr.rs                 |   2 +-
 .../src/tenant/storage_layer/filename.rs     |  10 +-
 pageserver/src/tenant/timeline.rs            |   1 +
 9 files changed, 107 insertions(+), 83 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 2ff09021e5..71faec9f2b 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -380,6 +380,7 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_feedback' as bool")?,
+            master_region: settings.remove("master_region").map(|x| x.to_string()),
         };
 
         // If tenant ID was not specified, generate one
@@ -479,6 +480,7 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_feedback' as bool")?,
+                master_region: settings.remove("master_region").map(|x| x.to_string()),
             }
         };
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b49c01179f..b33617af6d 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -342,10 +342,14 @@ impl Debug for S3Config {
 
 impl RemoteStorageConfig {
     pub fn in_region(&self, region: String) -> anyhow::Result<Self> {
-        if let AwsS3(config) = &self.storage {
-            let mut storage = config.clone();
-            storage.region = region;
-            Ok(RemoteStorageConfig { storage, ..self })
+        let self_clone = self.clone();
+        if let RemoteStorageKind::AwsS3(config) = self_clone.storage {
+            let mut storage = config;
+            storage.bucket_region = region;
+            Ok(RemoteStorageConfig {
+                storage: RemoteStorageKind::AwsS3(storage),
+                ..self_clone
+            })
         } else {
             bail!("Only AWS3 storage can be used in other region")
         }
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 55db9eb7e7..7ef896d10d 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -142,6 +142,7 @@ fn handle_metadata(
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
+           meta.replica_lsn(),
        );
        update_meta = true;
    }
@@ -154,6 +155,7 @@ fn handle_metadata(
            meta.latest_gc_cutoff_lsn(),
            meta.initdb_lsn(),
            meta.pg_version(),
+           meta.replica_lsn(),
        );
        update_meta = true;
    }
@@ -166,6 +168,7 @@ fn handle_metadata(
            *latest_gc_cuttoff,
            meta.initdb_lsn(),
            meta.pg_version(),
+           meta.replica_lsn(),
        );
        update_meta = true;
    }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6f254713ac..a5b32d0e0e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11,6 +11,7 @@
 //! parent timeline, and the last LSN that has been written to disk.
 //!
+use crate::repository::Key;
 use anyhow::{bail, Context};
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
@@ -60,12 +61,11 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
-use crate::tenant::remote_timeline_client::index::IndexPart;
-use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
-use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
-use crate::tenant::storage_layer::DeltaLayer;
-use crate::tenant::storage_layer::ImageLayer;
-use crate::tenant::storage_layer::Layer;
+use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata};
+use crate::tenant::remote_timeline_client::{
+    MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError,
+};
+use crate::tenant::storage_layer::{range_eq, DeltaLayer, ImageLayer, Layer, LayerFileName};
 use crate::InitializationOrder;
 
 use crate::virtual_file::VirtualFile;
@@ -1294,6 +1294,7 @@ impl Tenant {
             initdb_lsn,
             initdb_lsn,
             pg_version,
+            None,
         );
         self.prepare_timeline(
             new_timeline_id,
@@ -1328,11 +1329,12 @@
     pub async fn create_timeline_replica(
         &self,
         timeline_id: TimelineId,
-        master_borker_endpoint: String,
+        master_broker_endpoint: String,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let broker_client =
-            storage_broker::connect(master_broker_endpoint, conf.broker_keepalive_interval).await?;
+            storage_broker::connect(master_broker_endpoint, self.conf.broker_keepalive_interval)?;
+
         let master_storage = self
             .master_storage
             .as_ref()
@@ -1343,6 +1345,7 @@
             self.tenant_id,
             timeline_id,
         );
+
         let remote_storage = self
             .master_storage
             .as_ref()
@@ -1363,7 +1366,6 @@
         // Download & parse index parts
         let mut part_downloads = JoinSet::new();
-        let mut remote_index_and_client = HashMap::new();
 
         for timeline_id in remote_timeline_ids {
             let client = RemoteTimelineClient::new(
@@ -1409,41 +1411,54 @@
             }
         }
 
-        let (index_part, client) = remote_index_and_client
-            .get(timeline_id)
-            .expect("timeline found at master")?;
+        let (index_part, _client) = remote_index_and_client
+            .get(&timeline_id)
+            .expect("timeline found at master");
         let mut timeline_metadata = index_part.parse_metadata().context("parse_metadata")?;
-        let layers: Vec<(LayerFileName, IndexLayerMetadata)> = index_part
+
+        // Skip L0 layers because the LSN range of L0 and L1 layers is the same and it is not possible
+        // to distinguish master and replica layers by LSN.
+        let mut layer_metadata: HashMap<LayerFileName, LayerFileMetadata> = index_part
            .layer_metadata
            .iter()
-            .filter(|(fname, meta)| range_eq(&fname.get_key_range(), &(Key::MIN..Key::MAX)))
-            .sort_by(|(fname, meta)| fname.get_lsn_range().start)
+            .filter(|(fname, _meta)| !range_eq(&fname.get_key_range(), &(Key::MIN..Key::MAX)))
+            .map(|(fname, meta)| (fname.clone(), LayerFileMetadata::from(meta)))
            .collect();
-        let replica_lsn = layers
-            .last()
-            .map_or(Lsn(0), |(fname, meta)| fname.get_lsn_range().last);
-        let mut layer_metadata: HashMap = layers.iter().collect();
+
+        // Let replica_lsn be the largest layer start LSN
+        let replica_lsn = layer_metadata
+            .keys()
+            .map(|fname| fname.get_lsn_range().start)
+            .max()
+            .unwrap_or(timeline_metadata.ancestor_lsn());
+        let old_metadata = timeline_metadata.clone();
+
+        // Now collect layers of ancestor branches. We do not want to reconstruct the exact branch
+        // hierarchy at the replica, because in that case we would need to maintain several timelines.
+        // Instead, we just add all layers with start LSN < replica_lsn to the current timeline.
         while let Some(ancestor_id) = timeline_metadata.ancestor_timeline() {
-            let (index_part, client) = remote_index_and_client
-                .get(ancestor_id)
-                .expect("timeline found at master")?;
+            let (index_part, _client) = remote_index_and_client
+                .get(&ancestor_id)
+                .expect("timeline found at master");
             for (fname, meta) in &index_part.layer_metadata {
-                if fname.get_lasn_range().start < replica_lsn {
-                    layer_metadata.insert(fname, meta);
+                if !range_eq(&fname.get_key_range(), &(Key::MIN..Key::MAX))
+                    && fname.get_lsn_range().start < replica_lsn
+                {
+                    layer_metadata.insert(fname.clone(), LayerFileMetadata::from(meta));
                 }
             }
             timeline_metadata = index_part.parse_metadata().context("parse_metadata")?;
         }
 
         let new_metadata = TimelineMetadata::new(
-            old_metadata.disk_consistent_lsn,
-            old_metadata.prev_record_lsn,
+            old_metadata.disk_consistent_lsn(),
+            old_metadata.prev_record_lsn(),
             None,
             Lsn::INVALID,
-            old_metadata.latest_gc_cutoff_lsn,
-            old_metadata.initdb_lsn,
-            old_metadata.pg_version,
-            replica_lsn,
+            old_metadata.latest_gc_cutoff_lsn(),
+            old_metadata.initdb_lsn(),
+            old_metadata.pg_version(),
+            Some(replica_lsn),
         );
 
         // Initialize data directories for new timeline
@@ -1452,54 +1467,49 @@
             .context("Failed to create new timeline directory")?;
 
         // Save timeline metadata
-        save_metadata(self.conf, timeline_id, self.tenant_id, new_metadata, true)
+        save_metadata(self.conf, timeline_id, self.tenant_id, &new_metadata, true)
             .context("Failed to create timeline metadata")?;
 
+        // construct new index_part.json with combined list of layers
+        let index_part = IndexPart::new(
+            layer_metadata,
+            old_metadata.disk_consistent_lsn(),
+            new_metadata.to_bytes()?,
+        );
+
+        remote_client.init_upload_queue(&index_part)?;
+
         let timeline = Timeline::new(
             self.conf,
             Arc::clone(&self.tenant_conf),
-            new_metadata,
+            &new_metadata,
             None, // we do not need to restore branches hierarhy at replica
             timeline_id,
             self.tenant_id,
             Arc::clone(&self.walredo_mgr),
-            remote_client,
-            master_client,
+            Some(remote_client),
+            Some(master_client),
             Some(replica_lsn),
-            old_metadata.pg_version,
+            old_metadata.pg_version(),
             None, // no need to calcuate logical size at replica
             None,
         );
-        // construct new index_part.json with combined list of layers
-        let index_part = IndexPart::new(
-            layer_metadata,
-            old_metadata.disk_consistent_lsn,
-            new_metadata.to_bytes()?,
-        );
+
+        /* Do we need to perform explicit upload?
         // Upload this index_part.json to S3 bucket
-        remote_client.upload_index_part(
+        upload_index_part(
             self.conf,
-            &self.remote_storage,
-            self.telnant_id,
+            &remote_storage,
             self.tenant_id,
             timeline_id,
             &index_part,
-        )?;
+        )
+        .await?;
+        */
 
-        // Start background works for this timelinw
-        timeline.activate(broker_client, true);
+        // Start background work for this timeline
+        timeline.activate(broker_client, None, ctx);
 
-        // Wait for uploads to complete, so that when we return Ok, the timeline
-        // is known to be durable on remote storage. Just like we do at the end of
-        // this function, after we have created the timeline ourselves.
-        //
-        // We only really care that the initial version of `index_part.json` has
-        // been uploaded. That's enough to remember that the timeline
-        // exists. However, there is no function to wait specifically for that so
-        // we just wait for all in-progress uploads to finish.
-        remote_client
-            .wait_completion()
-            .await
-            .context("wait for timeline uploads to complete")?;
 
         Ok(())
     }
@@ -2333,12 +2343,12 @@ fn tree_sort_timelines(
 impl Tenant {
     pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        *self.tenant_conf.read().unwrap()
+        self.tenant_conf.read().unwrap().clone()
     }
 
     pub fn effective_config(&self) -> TenantConf {
         self.tenant_specific_overrides()
-            .merge(self.conf.default_tenant_conf)
+            .merge(self.conf.default_tenant_conf.clone())
     }
 
     pub fn get_checkpoint_distance(&self) -> u64 {
@@ -2468,7 +2478,7 @@ impl Tenant {
             Arc::clone(&self.walredo_mgr),
             remote_client,
             master_client,
-            new_metadata.replica_lsn,
+            new_metadata.replica_lsn(),
             pg_version,
             initial_logical_size_can_start.cloned(),
             initial_logical_size_attempt.cloned(),
@@ -2485,10 +2495,11 @@
     ) -> Tenant {
         let (state, mut rx) = watch::channel(state);
 
-        let master_storage = if let Some(remote_storage_config) = conf.remote_storage_config {
+        let master_storage = if let Some(remote_storage_config) = &conf.remote_storage_config {
             if let Some(region) = &tenant_conf.master_region {
-                let master_storage_config = remote_storage_config.in_region(region.clone());
-                Some(GenericRemoteStorage::from_config(master_storage_config)?)
+                let master_storage_config =
+                    remote_storage_config.in_region(region.clone()).unwrap();
+                Some(GenericRemoteStorage::from_config(&master_storage_config).unwrap())
             } else {
                 None
             }
@@ -2956,6 +2967,7 @@ impl Tenant {
             *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
             src_timeline.initdb_lsn,
             src_timeline.pg_version,
+            None, // no branches at replica
         );
 
         let new_timeline = {
@@ -3043,6 +3055,7 @@
             pgdata_lsn,
             pgdata_lsn,
             pg_version,
+            None,
         );
         let raw_timeline =
             self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 2b18606b85..be7d427fb9 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -45,7 +45,7 @@ pub mod defaults {
 }
 
 /// Per-tenant configuration options
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
     // Flush out an inmemory layer, if it's holding WAL older than this
     // This puts a backstop on how much WAL needs to be re-digested if the
@@ -106,7 +106,7 @@ pub struct TenantConf {
 
 /// Same as TenantConf, but this struct preserves the information about
 /// which parameters are set and which are not.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
@@ -254,7 +254,7 @@ impl TenantConfOpt {
                 .evictions_low_residence_duration_metric_threshold
                 .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
             gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
-            master_region: self.master_region,
+            master_region: self.master_region.clone(),
         }
     }
 }
@@ -388,7 +388,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
             );
         }
         tenant_conf.gc_feedback = request_data.gc_feedback;
-        tenant_conf.master_region = request_data.master_region;
+        tenant_conf.master_region = request_data.master_region.clone();
 
         Ok(tenant_conf)
     }
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 198b87ffc6..e3b663a8f8 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -25,9 +25,6 @@ use crate::virtual_file::VirtualFile;
 /// Use special format number to enable backward compatibility.
 const METADATA_FORMAT_VERSION: u16 = 5;
 
-/// Previous supported format versions.
-const METADATA_OLD_FORMAT_VERSION: u16 = 3;
-
 /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
@@ -263,6 +260,10 @@ impl TimelineMetadata {
     pub fn pg_version(&self) -> u32 {
         self.body.pg_version
     }
+
+    pub fn replica_lsn(&self) -> Option<Lsn> {
+        self.body.replica_lsn
+    }
 }
 
 /// Save timeline metadata to file
@@ -366,7 +367,7 @@ mod tests {
             hdr: TimelineMetadataHeader {
                 checksum: 0,
                 size: 0,
-                format_version: METADATA_OLD_FORMAT_VERSION,
+                format_version: 3,
             },
             body: TimelineMetadataBodyV1 {
                 disk_consistent_lsn: Lsn(0x200),
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7e123c3fbd..5b35184da3 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -348,7 +348,7 @@ pub async fn set_new_tenant_config(
     Tenant::persist_tenant_config(
         &tenant.tenant_id(),
         &tenant_config_path,
-        new_tenant_conf,
+        new_tenant_conf.clone(),
         false,
     )
     .map_err(SetNewTenantConfigError::Persist)?;
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs
index 7f48378e16..85211c81a9 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -215,16 +215,16 @@ impl LayerFileName {
             Self::Delta(fname) => fname.to_string(),
         }
     }
-    pub fn lsn_range(&self) -> Range<Lsn> {
+    pub fn get_lsn_range(&self) -> Range<Lsn> {
         match self {
             Self::Image(fname) => fname.lsn..fname.lsn + 1,
-            Self::Delta(fname) => fname.lsn_range,
+            Self::Delta(fname) => fname.lsn_range.clone(),
         }
     }
-    pub fn key_range(&self) -> Range<Key> {
+    pub fn get_key_range(&self) -> Range<Key> {
         match self {
-            Self::Image(fname) => fname.key_range,
-            Self::Delta(fname) => fname.key_range,
+            Self::Image(fname) => fname.key_range.clone(),
+            Self::Delta(fname) => fname.key_range.clone(),
         }
     }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index db2637d9db..4859abe700 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2954,6 +2954,7 @@ impl Timeline {
             *self.latest_gc_cutoff_lsn.read(),
             self.initdb_lsn,
             self.pg_version,
+            self.replica_lsn,
         );
 
         fail_point!("checkpoint-before-saving-metadata", |x| bail!(
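
The replica bootstrap in create_timeline_replica above boils down to a layer-selection rule: drop L0 layers (the ones covering the full key range), keep the remaining L1 layers, use the largest layer start LSN as replica_lsn (falling back to the ancestor LSN when no such layer exists), and then merge in ancestor layers whose start LSN is below replica_lsn. A minimal, self-contained sketch of that selection rule, written against simplified stand-in types (integer Key/Lsn, a toy LayerName) rather than the pageserver's real LayerFileName/LayerFileMetadata API, could look like this:

    use std::collections::HashMap;
    use std::ops::Range;

    // Simplified stand-ins: only the fields needed for the selection rule.
    type Key = u64;
    type Lsn = u64;
    const KEY_MIN: Key = 0;
    const KEY_MAX: Key = u64::MAX;

    #[derive(Clone, Debug, PartialEq, Eq, Hash)]
    struct LayerName {
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
    }

    // Keep only layers that do not span the whole key space (i.e. skip L0),
    // and compute replica_lsn as the largest layer start LSN, falling back
    // to ancestor_lsn when nothing is left after filtering.
    fn select_replica_layers(
        layers: &HashMap<LayerName, u64>, // value = file size, standing in for metadata
        ancestor_lsn: Lsn,
    ) -> (HashMap<LayerName, u64>, Lsn) {
        let selected: HashMap<LayerName, u64> = layers
            .iter()
            .filter(|(name, _)| name.key_range != (KEY_MIN..KEY_MAX))
            .map(|(name, size)| (name.clone(), *size))
            .collect();
        let replica_lsn = selected
            .keys()
            .map(|name| name.lsn_range.start)
            .max()
            .unwrap_or(ancestor_lsn);
        (selected, replica_lsn)
    }

    fn main() {
        let mut layers = HashMap::new();
        // An L0 layer covering the whole key space: skipped.
        layers.insert(LayerName { key_range: KEY_MIN..KEY_MAX, lsn_range: 100..200 }, 1);
        // Two L1 layers: kept; the larger start LSN (300) becomes replica_lsn.
        layers.insert(LayerName { key_range: 0..1000, lsn_range: 150..250 }, 2);
        layers.insert(LayerName { key_range: 1000..2000, lsn_range: 300..400 }, 3);

        let (selected, replica_lsn) = select_replica_layers(&layers, 0);
        assert_eq!(selected.len(), 2);
        assert_eq!(replica_lsn, 300);
        println!("replica_lsn = {replica_lsn}");
    }

The exclusion of L0 layers follows the rationale stated in the patch comment: L0 layers cover the full key range and their LSN ranges overlap with L1 layers, so master and replica layers cannot be told apart by LSN alone, while L1 layers partition the key space and can be adopted safely.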