From 87dfa997345cc5a825aba4acc581edbf4806b4f7 Mon Sep 17 00:00:00 2001
From: Thang Pham <thang@neon.tech>
Date: Tue, 10 May 2022 09:55:14 -0400
Subject: [PATCH 1/3] Update layered_repository REAMDE (#1659)

---
 pageserver/src/layered_repository/README.md | 43 +++++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)
diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md
index 519478e417..70c571a507 100644
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or
 there could be a long stream of updates to a single relation when data is bulk
 loaded, for example, or something in between.
 
+```
 Cloud Storage                   Page Server                           Safekeeper
                         L1               L0             Memory            WAL
 
@@ -37,6 +38,7 @@ Cloud Storage                   Page Server                           Safekeeper
 +----+----+          +----+----+      |   |     |
 |EEEE|               |EEEE|EEEE|      +---+-----+
 +----+               +----+----+
+```
 
 In this illustration, WAL is received as a stream from the Safekeeper, from the
 right.  It is immediately captured by the page server and stored quickly in
@@ -47,7 +49,7 @@ the same page and relation close to each other.
 From the page server memory, whenever enough WAL has been accumulated, it is flushed
 to disk into a new L0 layer file, and the memory is released.
 
-When enough L0 files have been accumulated, they are merged together rand sliced
+When enough L0 files have been accumulated, they are merged together and sliced
 per key-space, producing a new set of files where each file contains a more
 narrow key range, but larger LSN range.
 
@@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
 a range of LSNs (or a single LSN, in case of image layers). You can think of it
 as a rectangle in the two-dimensional key-LSN space. The layer files for each
 timeline are stored in the timeline's subdirectory under
-.zenith/tenants/<tenantid>/timelines.
+`.zenith/tenants/<tenantid>/timelines`.
 
 There are two kind of layer files: images, and delta layers. An image file
 contains a snapshot of all keys at a particular LSN, whereas a delta file
@@ -130,8 +132,11 @@ range of LSN.
 
 image file:
 
+```
     000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
               start key                          end key                           LSN
+```
+
 
 The first parts define the key range that the layer covers. See
 pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
@@ -140,8 +145,10 @@ delta file:
 
 Delta files are named similarly, but they cover a range of LSNs:
 
+```
     000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
               start key                          end key                          start LSN     end LSN
+```
 
 A delta file contains all the key-values in the key-range that were updated in
 the LSN range. If a key has not been modified, there is no trace of it in the
@@ -151,7 +158,9 @@ delta layer.
 A delta layer file can cover a part of the overall key space, as in the previous
 example, or the whole key range like this:
 
+```
     000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
+```
 
 A file that covers the whole key range is called a L0 file (Level 0), while a
 file that covers only part of the key range is called a L1 file. The "level" of
@@ -168,7 +177,9 @@ version, and how branching and GC works is still valid.
 
 The full path of a delta file looks like this:
 
+```
     .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
+```
 
 For simplicity, the examples below use a simplified notation for the
 paths.  The tenant ID is left out, the timeline ID is replaced with
@@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For
 example, a base image file at LSN 100 and a delta file between 100-200
 for 'orders' table on 'main' branch is represented like this:
 
+```
     main/orders_100
     main/orders_100_200
+```
 
 
 # Creating layer files
@@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
 of WAL is currently at LSN 250. In this starting situation, you would
 have these files on disk:
 
+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```
 
 In addition to those files, the recent changes between LSN 200 and the
 end of WAL at 250 are kept in memory. If the page server crashes, the
@@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it
 at the next checkpoint. The new file will cover the "gap" from the
 last layer file, so the LSN ranges are always contiguous:
 
+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous:
 	main/customers_200
 	main/customers_200_500
 	main/customers_500
+```
 
 ## Reading page versions
 
@@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs
 
 Imagine that a child branch is created at LSN 250:
 
+```
             @250
     ----main--+-------------------------->
                \
                 +---child-------------->
+```
 
 
 Then, the 'orders' table is updated differently on the 'main' and
 'child' branches. You now have this situation on disk:
 
+```
     main/orders_100
     main/orders_100_200
     main/orders_200
@@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and
     child/orders_300
     child/orders_300_400
     child/orders_400
+```
 
 Because the 'customers' table hasn't been modified on the child
 branch, there is no file for it there. If you request a page for it on
@@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file
 you need to look at. For example, the history for the 'orders' table
 on the 'main' branch consists of these files:
 
+```
     main/orders_100
     main/orders_100_200
     main/orders_200
@@ -301,10 +323,12 @@ on the 'main' branch consists of these files:
     main/orders_300
     main/orders_300_400
     main/orders_400
+```
 
 And from the 'child' branch's point of view, it consists of these
 files:
 
+```
     main/orders_100
     main/orders_100_200
     main/orders_200
@@ -313,6 +337,7 @@ files:
     child/orders_300
     child/orders_300_400
     child/orders_400
+```
 
 The branch metadata includes the point where the child branch was
 created, LSN 250. If a page request comes with LSN 275, we read the
@@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end
 of the branch is LSN 525, so that the GC horizon is currently at
 525-150 = 375
 
+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```
 
 We can remove the following files because the end LSNs of those files are
 older than GC horizon 375, and there are more recent layer files for the
 table:
 
+```
 	main/orders_100       DELETE
 	main/orders_100_200   DELETE
 	main/orders_200       DELETE
@@ -374,8 +402,9 @@ table:
 	main/customers_100      DELETE
 	main/customers_100_200  DELETE
 	main/customers_200      KEEP, NO NEWER VERSION
+```
 
-'main/customers_100_200' is old enough, but it cannot be
+'main/customers_200' is old enough, but it cannot be
 removed because there is no newer layer file for the table.
 
 Things get slightly more complicated with multiple branches. All of
@@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches.
 For example, if child branch is created at LSN 150, and the 'customers'
 table is updated on the branch, you would have these files:
 
+```
 	main/orders_100        KEEP, NEEDED BY child BRANCH
 	main/orders_100_200    KEEP, NEEDED BY child BRANCH
 	main/orders_200        DELETE
@@ -398,6 +428,7 @@ table is updated on the branch, you would have these files:
 	main/customers_200       KEEP, NO NEWER VERSION
 	child/customers_150_300  DELETE
 	child/customers_300      KEEP, NO NEWER VERSION
+```
 
 In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
 be removed, even though they are older than the GC horizon, because
@@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed.
 If 'orders' is modified later on the 'child' branch, we will create a
 new base image and delta file for it on the child:
 
+```
 	main/orders_100
 	main/orders_100_200
 
@@ -419,6 +451,7 @@ new base image and delta file for it on the child:
 	child/customers_300
 	child/orders_150_400
 	child/orders_400
+```
 
 After this, the 'main/orders_100' and 'main/orders_100_200' file could
 be removed. It is no longer needed by the child branch, because there
@@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings.
 In principle, each relation can be checkpointed separately, i.e. the
 LSN ranges of the files don't need to line up. So this would be legal:
 
+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal:
 	main/customers_250
 	main/customers_250_500
 	main/customers_500
+```
 
 However, the code currently always checkpoints all relations together.
 So that situation doesn't arise in practice.
@@ -468,11 +503,13 @@ does that.  It could be useful, however, as a transient state when
 garbage collecting around branch points, or explicit recovery
 points. For example, if we start with this:
 
+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/orders_200_300
 	main/orders_300
+```
 
 And there is a branch or explicit recovery point at LSN 150, we could
 replace 'main/orders_100_200' with 'main/orders_150' to keep a

From 6cb14b4200429bc2eb50b5f9879918188965b156 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 10 May 2022 20:44:56 +0400
Subject: [PATCH 2/3] Optionally remove WAL on safekeepers without s3
 offloading.

And do that on staging, until offloading is merged.
---
 .circleci/ansible/production.hosts           |  1 +
 .circleci/ansible/staging.hosts              |  1 +
 .circleci/ansible/systemd/safekeeper.service |  2 +-
 safekeeper/src/bin/safekeeper.rs             | 15 +++++++++++++++
 safekeeper/src/lib.rs                        |  2 ++
 safekeeper/src/remove_wal.rs                 |  2 +-
 safekeeper/src/safekeeper.rs                 |  9 +++++++--
 safekeeper/src/timeline.rs                   |  4 ++--
 8 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts
index f32b57154c..2ed8f517f7 100644
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -15,3 +15,4 @@ console_mgmt_base_url = http://console-release.local
 bucket_name           = zenith-storage-oregon
 bucket_region         = us-west-2
 etcd_endpoints        = etcd-release.local:2379
+safekeeper_enable_s3_offload = true
diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts
index 71166c531e..3ea815b907 100644
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -16,3 +16,4 @@ console_mgmt_base_url = http://console-staging.local
 bucket_name           = zenith-staging-storage-us-east-1
 bucket_region         = us-east-1
 etcd_endpoints        = etcd-staging.local:2379
+safekeeper_enable_s3_offload = false
diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service
index cac38d8756..55088db859 100644
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 7e979840c2..d0df7093ff 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -115,6 +115,14 @@ fn main() -> Result<()> {
             .takes_value(true)
             .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
         )
+        .arg(
+            Arg::new("enable-s3-offload")
+                .long("enable-s3-offload")
+                .takes_value(true)
+                .default_value("true")
+                .default_missing_value("true")
+                .help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."),
+        )
         .get_matches();
 
     if let Some(addr) = arg_matches.value_of("dump-control-file") {
@@ -172,6 +180,13 @@ fn main() -> Result<()> {
         conf.broker_etcd_prefix = prefix.to_string();
     }
 
+    // Seems like there is no better way to accept bool values explicitly in clap.
+    conf.s3_offload_enabled = arg_matches
+        .value_of("enable-s3-offload")
+        .unwrap()
+        .parse()
+        .context("failed to parse bool enable-s3-offload bool")?;
+
     start_safekeeper(conf, given_id, arg_matches.is_present("init"))
 }
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index f74e5be992..c848de9e71 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -53,6 +53,7 @@ pub struct SafeKeeperConf {
     pub my_id: ZNodeId,
     pub broker_endpoints: Option<Vec<Url>>,
     pub broker_etcd_prefix: String,
+    pub s3_offload_enabled: bool,
 }
 
 impl SafeKeeperConf {
@@ -79,6 +80,7 @@ impl Default for SafeKeeperConf {
             my_id: ZNodeId(0),
             broker_endpoints: None,
             broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(),
+            s3_offload_enabled: true,
         }
     }
 }
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 9474f65e5f..3278d51bd3 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
         let active_tlis = GlobalTimelines::get_active_timelines();
         for zttid in &active_tlis {
             if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
-                if let Err(e) = tli.remove_old_wal() {
+                if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) {
                     warn!(
                         "failed to remove WAL for tenant {} timeline {}: {}",
                         tli.zttid.tenant_id, tli.zttid.timeline_id, e
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index b9264565dc..fff1c269b6 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -930,13 +930,18 @@ where
     /// offloading.
     /// While it is safe to use inmem values for determining horizon,
     /// we use persistent to make possible normal states less surprising.
-    pub fn get_horizon_segno(&self) -> XLogSegNo {
+    pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo {
+        let s3_offload_horizon = if s3_offload_enabled {
+            self.state.s3_wal_lsn
+        } else {
+            Lsn(u64::MAX)
+        };
         let horizon_lsn = min(
             min(
                 self.state.remote_consistent_lsn,
                 self.state.peer_horizon_lsn,
             ),
-            self.state.s3_wal_lsn,
+            s3_offload_horizon,
         );
         horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
     }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 140d6660ac..8b1072a54b 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -479,7 +479,7 @@ impl Timeline {
         shared_state.sk.wal_store.flush_lsn()
     }
 
-    pub fn remove_old_wal(&self) -> Result<()> {
+    pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> {
         let horizon_segno: XLogSegNo;
         let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
         {
@@ -488,7 +488,7 @@ impl Timeline {
             if shared_state.sk.state.server.wal_seg_size == 0 {
                 return Ok(());
             }
-            horizon_segno = shared_state.sk.get_horizon_segno();
+            horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled);
             remover = shared_state.sk.wal_store.remove_up_to();
             if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                 return Ok(());

From d710dff9756ca006ffb2bc7362f8137f5ca06f48 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 10 May 2022 16:28:00 +0300
Subject: [PATCH 3/3] Remove unnecessary Serialize/Deserialize traits from
 VecMap.

It's never stored on disk. Let's be tidy.
---
 libs/utils/src/vec_map.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 558721c724..9953b447c8 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,11 +1,9 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
 
-use serde::{Deserialize, Serialize};
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
-#[derive(Clone, Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug)]
 pub struct VecMap<K, V>(Vec<(K, V)>);
 
 impl<K, V> Default for VecMap<K, V> {