From 87dfa997345cc5a825aba4acc581edbf4806b4f7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 10 May 2022 09:55:14 -0400 Subject: [PATCH 1/3] Update layered_repository REAMDE (#1659) --- pageserver/src/layered_repository/README.md | 43 +++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 519478e417..70c571a507 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or there could be a long stream of updates to a single relation when data is bulk loaded, for example, or something in between. +``` Cloud Storage Page Server Safekeeper L1 L0 Memory WAL @@ -37,6 +38,7 @@ Cloud Storage Page Server Safekeeper +----+----+ +----+----+ | | | |EEEE| |EEEE|EEEE| +---+-----+ +----+ +----+----+ +``` In this illustration, WAL is received as a stream from the Safekeeper, from the right. It is immediately captured by the page server and stored quickly in @@ -47,7 +49,7 @@ the same page and relation close to each other. From the page server memory, whenever enough WAL has been accumulated, it is flushed to disk into a new L0 layer file, and the memory is released. -When enough L0 files have been accumulated, they are merged together rand sliced +When enough L0 files have been accumulated, they are merged together and sliced per key-space, producing a new set of files where each file contains a more narrow key range, but larger LSN range. @@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -.zenith/tenants//timelines. +`.zenith/tenants//timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -130,8 +132,11 @@ range of LSN. image file: +``` 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 start key end key LSN +``` + The first parts define the key range that the layer covers. See pgdatadir_mapping.rs for how the key space is used. The last part is the LSN. @@ -140,8 +145,10 @@ delta file: Delta files are named similarly, but they cover a range of LSNs: +``` 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 start key end key start LSN end LSN +``` A delta file contains all the key-values in the key-range that were updated in the LSN range. If a key has not been modified, there is no trace of it in the @@ -151,7 +158,9 @@ delta layer. A delta layer file can cover a part of the overall key space, as in the previous example, or the whole key range like this: +``` 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051 +``` A file that covers the whole key range is called a L0 file (Level 0), while a file that covers only part of the key range is called a L1 file. The "level" of @@ -168,7 +177,9 @@ version, and how branching and GC works is still valid. The full path of a delta file looks like this: +``` .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +``` For simplicity, the examples below use a simplified notation for the paths. The tenant ID is left out, the timeline ID is replaced with @@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For example, a base image file at LSN 100 and a delta file between 100-200 for 'orders' table on 'main' branch is represented like this: +``` main/orders_100 main/orders_100_200 +``` # Creating layer files @@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end of WAL is currently at LSN 250. In this starting situation, you would have these files on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 main/customers_100 main/customers_100_200 main/customers_200 +``` In addition to those files, the recent changes between LSN 200 and the end of WAL at 250 are kept in memory. If the page server crashes, the @@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it at the next checkpoint. The new file will cover the "gap" from the last layer file, so the LSN ranges are always contiguous: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous: main/customers_200 main/customers_200_500 main/customers_500 +``` ## Reading page versions @@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs Imagine that a child branch is created at LSN 250: +``` @250 ----main--+--------------------------> \ +---child--------------> +``` Then, the 'orders' table is updated differently on the 'main' and 'child' branches. You now have this situation on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and child/orders_300 child/orders_300_400 child/orders_400 +``` Because the 'customers' table hasn't been modified on the child branch, there is no file for it there. If you request a page for it on @@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file you need to look at. For example, the history for the 'orders' table on the 'main' branch consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -301,10 +323,12 @@ on the 'main' branch consists of these files: main/orders_300 main/orders_300_400 main/orders_400 +``` And from the 'child' branch's point of view, it consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -313,6 +337,7 @@ files: child/orders_300 child/orders_300_400 child/orders_400 +``` The branch metadata includes the point where the child branch was created, LSN 250. If a page request comes with LSN 275, we read the @@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end of the branch is LSN 525, so that the GC horizon is currently at 525-150 = 375 +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at main/customers_100 main/customers_100_200 main/customers_200 +``` We can remove the following files because the end LSNs of those files are older than GC horizon 375, and there are more recent layer files for the table: +``` main/orders_100 DELETE main/orders_100_200 DELETE main/orders_200 DELETE @@ -374,8 +402,9 @@ table: main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION +``` -'main/customers_100_200' is old enough, but it cannot be +'main/customers_200' is old enough, but it cannot be removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of @@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches. For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: +``` main/orders_100 KEEP, NEEDED BY child BRANCH main/orders_100_200 KEEP, NEEDED BY child BRANCH main/orders_200 DELETE @@ -398,6 +428,7 @@ table is updated on the branch, you would have these files: main/customers_200 KEEP, NO NEWER VERSION child/customers_150_300 DELETE child/customers_300 KEEP, NO NEWER VERSION +``` In this situation, 'main/orders_100' and 'main/orders_100_200' cannot be removed, even though they are older than the GC horizon, because @@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed. If 'orders' is modified later on the 'child' branch, we will create a new base image and delta file for it on the child: +``` main/orders_100 main/orders_100_200 @@ -419,6 +451,7 @@ new base image and delta file for it on the child: child/customers_300 child/orders_150_400 child/orders_400 +``` After this, the 'main/orders_100' and 'main/orders_100_200' file could be removed. It is no longer needed by the child branch, because there @@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings. In principle, each relation can be checkpointed separately, i.e. the LSN ranges of the files don't need to line up. So this would be legal: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal: main/customers_250 main/customers_250_500 main/customers_500 +``` However, the code currently always checkpoints all relations together. So that situation doesn't arise in practice. @@ -468,11 +503,13 @@ does that. It could be useful, however, as a transient state when garbage collecting around branch points, or explicit recovery points. For example, if we start with this: +``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 +``` And there is a branch or explicit recovery point at LSN 150, we could replace 'main/orders_100_200' with 'main/orders_150' to keep a From 6cb14b4200429bc2eb50b5f9879918188965b156 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 10 May 2022 20:44:56 +0400 Subject: [PATCH 2/3] Optionally remove WAL on safekeepers without s3 offloading. And do that on staging, until offloading is merged. --- .circleci/ansible/production.hosts | 1 + .circleci/ansible/staging.hosts | 1 + .circleci/ansible/systemd/safekeeper.service | 2 +- safekeeper/src/bin/safekeeper.rs | 15 +++++++++++++++ safekeeper/src/lib.rs | 2 ++ safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 9 +++++++-- safekeeper/src/timeline.rs | 4 ++-- 8 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index f32b57154c..2ed8f517f7 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -15,3 +15,4 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 +safekeeper_enable_s3_offload = true diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 71166c531e..3ea815b907 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -16,3 +16,4 @@ console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 +safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index cac38d8756..55088db859 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 7e979840c2..d0df7093ff 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -115,6 +115,14 @@ fn main() -> Result<()> { .takes_value(true) .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) + .arg( + Arg::new("enable-s3-offload") + .long("enable-s3-offload") + .takes_value(true) + .default_value("true") + .default_missing_value("true") + .help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -172,6 +180,13 @@ fn main() -> Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + // Seems like there is no better way to accept bool values explicitly in clap. + conf.s3_offload_enabled = arg_matches + .value_of("enable-s3-offload") + .unwrap() + .parse() + .context("failed to parse bool enable-s3-offload bool")?; + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f74e5be992..c848de9e71 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -53,6 +53,7 @@ pub struct SafeKeeperConf { pub my_id: ZNodeId, pub broker_endpoints: Option>, pub broker_etcd_prefix: String, + pub s3_offload_enabled: bool, } impl SafeKeeperConf { @@ -79,6 +80,7 @@ impl Default for SafeKeeperConf { my_id: ZNodeId(0), broker_endpoints: None, broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), + s3_offload_enabled: true, } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 9474f65e5f..3278d51bd3 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal() { + if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index b9264565dc..fff1c269b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -930,13 +930,18 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - pub fn get_horizon_segno(&self) -> XLogSegNo { + pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { + let s3_offload_horizon = if s3_offload_enabled { + self.state.s3_wal_lsn + } else { + Lsn(u64::MAX) + }; let horizon_lsn = min( min( self.state.remote_consistent_lsn, self.state.peer_horizon_lsn, ), - self.state.s3_wal_lsn, + s3_offload_horizon, ); horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 140d6660ac..8b1072a54b 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -479,7 +479,7 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self) -> Result<()> { + pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { @@ -488,7 +488,7 @@ impl Timeline { if shared_state.sk.state.server.wal_seg_size == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(); + horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); From d710dff9756ca006ffb2bc7362f8137f5ca06f48 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 May 2022 16:28:00 +0300 Subject: [PATCH 3/3] Remove unnecessary Serialize/Deserialize traits from VecMap. It's never stored on disk. Let's be tidy. --- libs/utils/src/vec_map.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 558721c724..9953b447c8 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,11 +1,9 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; -use serde::{Deserialize, Serialize}; - /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug)] pub struct VecMap(Vec<(K, V)>); impl Default for VecMap {