From e6c82c960958159df0641e2d5d504860ed61bcec Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 17 Jan 2022 18:59:34 +0300 Subject: [PATCH] Add max_delta_layers and image_layer_generation_threshold parameters to config and rewrite criteria of image layer generation --- pageserver/src/config.rs | 35 +++++++++++++++++++ pageserver/src/layered_repository.rs | 25 ++++++++----- .../src/layered_repository/delta_layer.rs | 14 ++++++++ .../src/layered_repository/image_layer.rs | 5 +++ .../src/layered_repository/inmemory_layer.rs | 25 ++++++------- .../src/layered_repository/layer_map.rs | 10 +++--- .../src/layered_repository/storage_layer.rs | 5 ++- 7 files changed, 91 insertions(+), 28 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e0d907f6b..ea0059a42f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -43,6 +43,9 @@ pub mod defaults { pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + pub const DEFAULT_MAX_DELTA_LAYERS: usize = 10; + pub const DEFAULT_IMAGE_LAYER_GENERATION_THRESHOLD: usize = 50; + /// /// Default built-in configuration file. /// @@ -90,6 +93,21 @@ pub struct PageServerConf { pub page_cache_size: usize, pub max_file_descriptors: usize, + // + // Minimal total size of delta layers which triggers generation of image layer by checkpointer. + // It is specified as percent of maximal segment size (RELISH_SEG_SIZE). + // I.e. it means that checkpoint will create image layer in addition to delta layer only when total size + // of delta layers since last image layer exceeds specified percent of segment size. + // + pub image_layer_generation_threshold: usize, + + // + // Maximal number of delta layers which can be stored before image layer should be generated. + // The garbage collector needs image layers in order to delete files. + // If this number is too large it can result in too many small files on disk. 
+ // + pub max_delta_layers: usize, + // Repository directory, relative to current working directory. // Normally, the page server changes the current working directory // to the repository, and 'workdir' is always '.'. But we don't do @@ -228,6 +246,9 @@ impl PageServerConf { page_cache_size: DEFAULT_PAGE_CACHE_SIZE, max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS, + max_delta_layers: DEFAULT_MAX_DELTA_LAYERS, + image_layer_generation_threshold: DEFAULT_IMAGE_LAYER_GENERATION_THRESHOLD, + pg_distrib_dir: PathBuf::new(), auth_validation_public_key_path: None, auth_type: AuthType::Trust, @@ -250,6 +271,10 @@ impl PageServerConf { "max_file_descriptors" => { conf.max_file_descriptors = parse_toml_u64(key, item)? as usize } + "max_delta_layers" => conf.max_delta_layers = parse_toml_u64(key, item)? as usize, + "image_layer_generation_threshold" => { + conf.image_layer_generation_threshold = parse_toml_u64(key, item)? as usize + } "pg_distrib_dir" => { conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?) 
} @@ -379,6 +404,8 @@ impl PageServerConf { gc_period: Duration::from_secs(10), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, + max_delta_layers: defaults::DEFAULT_MAX_DELTA_LAYERS, + image_layer_generation_threshold: defaults::DEFAULT_IMAGE_LAYER_GENERATION_THRESHOLD, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), superuser: "zenith_admin".to_string(), @@ -450,6 +477,9 @@ gc_horizon = 222 page_cache_size = 444 max_file_descriptors = 333 +max_delta_layers = 10 +image_layer_generation_threshold = 50 + # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' @@ -480,6 +510,9 @@ initial_superuser_name = 'zzzz' superuser: defaults::DEFAULT_SUPERUSER.to_string(), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, + max_delta_layers: defaults::DEFAULT_MAX_DELTA_LAYERS, + image_layer_generation_threshold: + defaults::DEFAULT_IMAGE_LAYER_GENERATION_THRESHOLD, workdir, pg_distrib_dir, auth_type: AuthType::Trust, @@ -521,6 +554,8 @@ initial_superuser_name = 'zzzz' superuser: "zzzz".to_string(), page_cache_size: 444, max_file_descriptors: 333, + max_delta_layers: 10, + image_layer_generation_threshold: 50, workdir, pg_distrib_dir, auth_type: AuthType::Trust, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 3adfbc7bf7..9e05af4d5e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1598,7 +1598,7 @@ impl LayeredTimeline { Ok(()) } - fn evict_layer(&self, layer_id: LayerId, reconstruct_pages: bool) -> Result> { + fn evict_layer(&self, layer_id: LayerId, mut reconstruct_pages: bool) -> Result> { // Mark the layer as no longer accepting writes and record the end_lsn. // This happens in-place, no new layers are created now. 
// We call `get_last_record_lsn` again, which may be different from the @@ -1612,12 +1612,20 @@ impl LayeredTimeline { let global_layer_map = GLOBAL_LAYER_MAP.read().unwrap(); if let Some(oldest_layer) = global_layer_map.get(&layer_id) { let last_lsn = self.get_last_record_lsn(); - // Count number of layers only if we nned this information: when creation of image layer was not prohibited - let n_delta_layers = if reconstruct_pages { - layers.count_delta_layers(oldest_layer.get_seg_tag(), last_lsn) - } else { - 0 - }; + // Avoid creation of image layers if there are not many deltas + if reconstruct_pages && oldest_layer.get_seg_tag().rel.is_blocky() { + let (n_delta_layers, total_delta_size) = + layers.count_delta_layers(oldest_layer.get_seg_tag(), last_lsn)?; + let logical_segment_size = + oldest_layer.get_seg_size(last_lsn)? as u64 * BLCKSZ as u64; + let physical_deltas_size = total_delta_size + oldest_layer.get_physical_size()?; + if logical_segment_size * self.conf.image_layer_generation_threshold as u64 + > physical_deltas_size * 100 + && n_delta_layers < self.conf.max_delta_layers + { + reconstruct_pages = false; + } + } drop(global_layer_map); oldest_layer.freeze(last_lsn); @@ -1631,8 +1639,7 @@ impl LayeredTimeline { drop(layers); drop(write_guard); - let new_historics = - oldest_layer.write_to_disk(self, reconstruct_pages, n_delta_layers)?; + let new_historics = oldest_layer.write_to_disk(self, reconstruct_pages)?; write_guard = self.write_lock.lock().unwrap(); layers = self.layers.lock().unwrap(); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 41692fcac6..92a5742f58 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -161,6 +161,14 @@ pub struct DeltaLayerInner { } impl DeltaLayerInner { + fn get_physical_size(&self) -> Result { + Ok(if let Some(book) = &self.book { + 
book.chapter_reader(PAGE_VERSIONS_CHAPTER)?.len() + } else { + 0 + }) + } + fn get_seg_size(&self, lsn: Lsn) -> Result { // Scan the VecMap backwards, starting from the given entry. let slice = self @@ -289,6 +297,12 @@ impl Layer for DeltaLayer { } } + // Get physical size of the layer + fn get_physical_size(&self) -> Result { + // TODO: is it actually necessary to load layer to get its size? + self.load()?.get_physical_size() + } + /// Get size of the relation at given LSN fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index ecfb8c73b0..d4aa7e8723 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -201,6 +201,11 @@ impl Layer for ImageLayer { } } + // Get physical size of the layer + fn get_physical_size(&self) -> Result { + Ok(self.get_seg_size(Lsn(0))? as u64 * BLOCK_SIZE as u64) + } + /// Does this segment exist at given LSN? fn get_seg_exists(&self, _lsn: Lsn) -> Result { Ok(true) diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index be5f17c316..ea7cd80b84 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -20,7 +20,6 @@ use crate::{ZTenantId, ZTimelineId}; use anyhow::{ensure, Result}; use bytes::Bytes; use log::*; -use postgres_ffi::pg_constants::BLCKSZ; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use zenith_utils::lsn::Lsn; @@ -28,10 +27,6 @@ use zenith_utils::vec_map::VecMap; use super::page_versions::PageVersions; -// The garbage collector needs image layers in order to delete files. -// If this number is too large it can result in too many small files on disk. 
-const MAX_DELTA_LAYERS: usize = 10; - pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -85,6 +80,10 @@ impl InMemoryLayerInner { assert!(self.end_lsn.is_none()); } + fn get_physical_size(&self) -> u64 { + self.page_versions.size() + } + fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { // Scan the BTreeMap backwards, starting from the given entry. let slice = self.seg_sizes.slice_range(..=lsn); @@ -226,7 +225,12 @@ impl Layer for InMemoryLayer { } } - /// Get size of the relation at given LSN + // Get physical size of the layer + fn get_physical_size(&self) -> Result { + Ok(self.inner.read().unwrap().get_physical_size() as u64) + } + + /// Get logical size of the relation at given LSN fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); ensure!( @@ -594,7 +598,6 @@ impl InMemoryLayer { &self, timeline: &LayeredTimeline, reconstruct_pages: bool, - n_delta_layers: usize, ) -> Result { trace!( "write_to_disk {} get_end_lsn is {}", @@ -621,13 +624,7 @@ impl InMemoryLayer { // Figure out if we should create a delta layer, image layer, or both. let image_lsn: Option; let delta_end_lsn: Option; - if self.is_dropped() - || !reconstruct_pages - || (self.seg.rel.is_blocky() - && self.get_seg_size(end_lsn_inclusive)? as u64 * BLCKSZ as u64 - > inner.page_versions.size() * 2 - && n_delta_layers < MAX_DELTA_LAYERS) - { + if self.is_dropped() || !reconstruct_pages { // Create just a delta layer containing all the // changes up to and including the drop. 
delta_end_lsn = Some(end_lsn_exclusive); diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 7f1bb9528f..382ac12ca7 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -199,11 +199,11 @@ impl LayerMap { } } - pub fn count_delta_layers(&self, seg: SegmentTag, lsn: Lsn) -> usize { + pub fn count_delta_layers(&self, seg: SegmentTag, lsn: Lsn) -> Result<(usize, u64)> { if let Some(segentry) = self.segs.get(&seg) { segentry.count_delta_layers(lsn) } else { - 0 + Ok((0, 0)) } } @@ -330,16 +330,18 @@ impl SegEntry { // Count number of delta layers preceeding specified `lsn`. // Perform backward iteration from exclusive upper bound until image layer is reached. - pub fn count_delta_layers(&self, lsn: Lsn) -> usize { + pub fn count_delta_layers(&self, lsn: Lsn) -> Result<(usize, u64)> { let mut count: usize = 0; + let mut total_size: u64 = 0; let mut iter = self.historic.iter_older(lsn); while let Some(layer) = iter.next_back() { if !layer.is_incremental() { break; } count += 1; + total_size += layer.get_physical_size()?; } - count + Ok((count, total_size)) } // Set new open layer for a SegEntry. diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 99fdaa6845..de9c63e330 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -154,12 +154,15 @@ pub trait Layer: Send + Sync { reconstruct_data: &mut PageReconstructData, ) -> Result; - /// Return size of the segment at given LSN. (Only for blocky relations.) + /// Return logical size of the segment at given LSN. (Only for blocky relations.) fn get_seg_size(&self, lsn: Lsn) -> Result; /// Does the segment exist at given LSN? Or was it dropped before it. 
fn get_seg_exists(&self, lsn: Lsn) -> Result; + // Get physical size of the layer + fn get_physical_size(&self) -> Result; + /// Does this layer only contain some data for the segment (incremental), /// or does it contain a version of every page? This is important to know /// for garbage collecting old layers: an incremental layer depends on