Don't upload index file when GC doesn't remove anything.

I saw an excessive number of index file upload operations in
production, even when nothing on the timeline changes. It was because
our GC schedules index file upload if the GC cutoff LSN is advanced,
even if the GC had nothing else to do. The GC cutoff LSN marches
steadily forwards, even when there is no user activity on the
timeline, when the cutoff is determined by the time-based PITR
interval setting. To dial that down, only schedule index file upload
when GC is about to actually remove something.
This commit is contained in:
Heikki Linnakangas
2022-12-15 12:39:50 +02:00
committed by Heikki Linnakangas
parent 6dec85b19d
commit c262390214
2 changed files with 111 additions and 26 deletions

View File

@@ -2487,9 +2487,6 @@ impl Timeline {
);
write_guard.store_and_unlock(new_gc_cutoff).wait();
}
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
info!("GC starting");
@@ -2600,19 +2597,33 @@ impl Timeline {
layers_to_remove.push(Arc::clone(&l));
}
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
// while iterating it. BTreeMap::retain() would be another option)
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
for doomed_layer in layers_to_remove {
let path = doomed_layer.local_path();
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
layer_names_to_delete.push(doomed_layer.filename());
doomed_layer.delete()?;
layers.remove_historic(doomed_layer);
result.layers_removed += 1;
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
// while iterating it. BTreeMap::retain() would be another option)
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
for doomed_layer in layers_to_remove {
let path = doomed_layer.local_path();
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
layer_names_to_delete.push(doomed_layer.filename());
doomed_layer.delete()?;
layers.remove_historic(doomed_layer);
result.layers_removed += 1;
}
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
}
info!(
@@ -2620,14 +2631,6 @@ impl Timeline {
result.layers_removed, new_gc_cutoff
);
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
result.elapsed = now.elapsed()?;
Ok(result)
}

View File

@@ -2,9 +2,17 @@ import asyncio
import concurrent.futures
import random
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
from fixtures.types import TimelineId
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
Postgres,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
# Test configuration
@@ -43,6 +51,7 @@ async def gc(env: NeonEnv, timeline: TimelineId):
while updates_performed < updates_to_perform:
await loop.run_in_executor(pool, do_gc)
# At the same time, run UPDATEs and GC
async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
workers = []
@@ -88,3 +97,76 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
r = cur.fetchone()
assert r is not None
assert r == (num_rows, updates_to_perform)
#
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
    """
    Check that repeated GC runs with nothing to remove do not keep re-uploading
    the index file to remote storage.

    The pageserver schedules an index-file upload when GC deletes layers; a GC
    pass that removes nothing should not cause an upload, so the upload counter
    must stay (nearly) flat across many no-op GC iterations.
    """
    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"

    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_gc_index_upload",
    )

    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_gc_index_upload", "main")
    pg = env.postgres.create_start("test_gc_index_upload")

    pageserver_http = env.pageserver.http_client()

    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))

    # Create some layer files to give GC something it could, in principle, work on.
    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
    cur.execute(
        """
        INSERT INTO foo
        SELECT g, 0, 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )

    # Helper function that gets the number of given kind of remote ops from the metrics
    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
        total = 0.0
        for sample in ps_metrics.query_all(
            name="pageserver_remote_operation_seconds_count",
            filter={
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
                "file_kind": str(file_kind),
                "op_kind": str(op_kind),
            },
        ):
            # sample is (name, labels, value); the value is a cumulative count
            total += sample[2]
        return int(total)

    # Sanity check that the metric works
    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
    before = get_num_remote_ops("index", "upload")
    assert before > 0

    # Run many cycles of GC. Then check that the number of index files
    # uploads didn't grow much. In particular we don't want to re-upload the
    # index file on every GC iteration, when it has no work to do.
    #
    # On each iteration, we use a slightly smaller GC horizon, so that the GC
    # at least needs to check if it has work to do.
    for i in range(100):
        cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
        num_index_uploads = get_num_remote_ops("index", "upload")
        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")

    after = num_index_uploads
    log.info(f"{after-before} new index uploads during test")
    # A few uploads are acceptable (GC occasionally does real work as the
    # horizon shrinks), but nowhere near one per iteration.
    assert after - before < 5