From c262390214109d46be2c230f1d52948e7134f76d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 15 Dec 2022 12:39:50 +0200
Subject: [PATCH] Don't upload index file when GC doesn't remove anything.

I saw an excessive number of index file upload operations in production,
even when nothing on the timeline changes. It was because our GC schedules
index file upload if the GC cutoff LSN is advanced, even if the GC had
nothing else to do. The GC cutoff LSN marches steadily forwards, even when
there is no user activity on the timeline, when the cutoff is determined
by the time-based PITR interval setting.

To dial that down, only schedule index file upload when GC is about to
actually remove something.
---
 pageserver/src/tenant/timeline.rs         | 51 +++++++-------
 test_runner/regress/test_gc_aggressive.py | 86 ++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a746fd9bf8..cc6583dcf6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2487,9 +2487,6 @@ impl Timeline {
             );
             write_guard.store_and_unlock(new_gc_cutoff).wait();
         }
-        // Persist the new GC cutoff value in the metadata file, before
-        // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
 
         info!("GC starting");
 
@@ -2600,19 +2597,33 @@
             layers_to_remove.push(Arc::clone(&l));
         }
 
-        // Actually delete the layers from disk and remove them from the map.
-        // (couldn't do this in the loop above, because you cannot modify a collection
-        // while iterating it. BTreeMap::retain() would be another option)
-        let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
-        for doomed_layer in layers_to_remove {
-            let path = doomed_layer.local_path();
-            self.metrics
-                .current_physical_size_gauge
-                .sub(path.metadata()?.len());
-            layer_names_to_delete.push(doomed_layer.filename());
-            doomed_layer.delete()?;
-            layers.remove_historic(doomed_layer);
-            result.layers_removed += 1;
+        if !layers_to_remove.is_empty() {
+            // Persist the new GC cutoff value in the metadata file, before
+            // we actually remove anything.
+            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+
+            // Actually delete the layers from disk and remove them from the map.
+            // (couldn't do this in the loop above, because you cannot modify a collection
+            // while iterating it. BTreeMap::retain() would be another option)
+            let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
+            for doomed_layer in layers_to_remove {
+                let path = doomed_layer.local_path();
+                self.metrics
+                    .current_physical_size_gauge
+                    .sub(path.metadata()?.len());
+                layer_names_to_delete.push(doomed_layer.filename());
+                doomed_layer.delete()?;
+                layers.remove_historic(doomed_layer);
+                result.layers_removed += 1;
+            }
+
+            if result.layers_removed != 0 {
+                fail_point!("after-timeline-gc-removed-layers");
+            }
+
+            if let Some(remote_client) = &self.remote_client {
+                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            }
         }
 
         info!(
@@ -2620,14 +2631,6 @@
             result.layers_removed, new_gc_cutoff
         );
 
-        if result.layers_removed != 0 {
-            fail_point!("after-timeline-gc-removed-layers");
-        }
-
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
-        }
-
         result.elapsed = now.elapsed()?;
         Ok(result)
     }
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 92855899f0..b9d012fa36 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -2,9 +2,17 @@
 import asyncio
 import concurrent.futures
 import random
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
-from fixtures.types import TimelineId
+from fixtures.metrics import parse_metrics
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    Postgres,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.types import TenantId, TimelineId
 from fixtures.utils import query_scalar
 
 # Test configuration
@@ -43,6 +51,7 @@ async def gc(env: NeonEnv, timeline: TimelineId):
     while updates_performed < updates_to_perform:
         await loop.run_in_executor(pool, do_gc)
 
+
 # At the same time, run UPDATEs and GC
 async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
     workers = []
@@ -88,3 +97,76 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
             r = cur.fetchone()
             assert r is not None
             assert r == (num_rows, updates_to_perform)
+
+
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
+
+    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
+    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_gc_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_gc_index_upload", "main")
+    pg = env.postgres.create_start("test_gc_index_upload")
+
+    pageserver_http = env.pageserver.http_client()
+
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
+    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
+
+    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+            SELECT g, 0, 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Helper function that gets the number of given kind of remote ops from the metrics
+    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
+        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        total = 0.0
+        for sample in ps_metrics.query_all(
+            name="pageserver_remote_operation_seconds_count",
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        ):
+            total += sample[2]
+        return int(total)
+
+    # Sanity check that the metric works
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    before = get_num_remote_ops("index", "upload")
+    assert before > 0
+
+    # Run many cycles of GC. Then check that the number of index files
+    # uploads didn't grow much. In particular we don't want to re-upload the
+    # index file on every GC iteration, when it has no work to do.
+    #
+    # On each iteration, we use a slightly smaller GC horizon, so that the GC
+    # at least needs to check if it has work to do.
+    for i in range(100):
+        cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
+        num_index_uploads = get_num_remote_ops("index", "upload")
+        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
+
+    after = num_index_uploads
+    log.info(f"{after-before} new index uploads during test")
+    assert after - before < 5