pageserver: refine how we delete timelines after shard split (#8436)

## Problem

Previously, when deleting a timeline, shards would delete layers that
belong to an ancestor shard. That is not a correctness issue: a timeline
is always deleted from all shards, so destroying data for that timeline
is clearly fine.

However, there is a race where one shard might start this deletion while
another shard has not yet received the deletion request and might still
try to access an ancestral layer. This creates ambiguity around the
"all layers referenced by my index should always exist" invariant, which
is important for detecting and reporting corruption.

Now that we have a GC mode for cleaning up ancestral layers, we can rely
on it to remove such layers instead of deleting them right away. This
makes the system easier to reason about: a shard now never deletes a
layer that belongs to a ShardIndex other than its own.
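
Concretely, the new rule during timeline deletion is: a shard deletes only layer objects whose recorded shard identity matches its own, and leaves ancestor-owned layers to scrubber GC. Below is a minimal sketch of that filter using simplified stand-in types; the real code, in `RemoteTimelineClient::delete_all`, appears in the diff further down.

```rust
/// Simplified stand-ins for the pageserver's shard identity types (illustrative only).
#[derive(PartialEq, Eq, Clone, Copy)]
struct ShardIndex {
    shard_number: u8,
    shard_count: u8,
}

struct LayerMetadata {
    /// Which shard originally wrote (and therefore owns) this layer in remote storage.
    shard: ShardIndex,
}

/// Given all layers referenced by this shard's index, return only those the shard
/// itself owns and may delete; ancestor-owned layers are left for scrubber GC.
fn layers_owned_by(
    own_shard: ShardIndex,
    layers: impl Iterator<Item = (String, LayerMetadata)>,
) -> Vec<String> {
    layers
        .filter(|(_name, meta)| meta.shard == own_shard)
        .map(|(name, _meta)| name)
        .collect()
}

fn main() {
    let me = ShardIndex { shard_number: 1, shard_count: 4 };
    let ancestor = ShardIndex { shard_number: 0, shard_count: 2 };
    let layers = vec![
        ("layer-a".to_string(), LayerMetadata { shard: me }),
        ("layer-b".to_string(), LayerMetadata { shard: ancestor }),
    ];
    // Only "layer-a" is deletable by this shard; "layer-b" stays for the scrubber.
    assert_eq!(layers_owned_by(me, layers.into_iter()), vec!["layer-a"]);
}
```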

## Summary of changes

- Modify the behavior of `RemoteTimelineClient::delete_all`
- Add `test_scrubber_physical_gc_timeline_deletion` to exercise this case
- Tweak the AWS SDK config in the scrubber to enable retries. This was
  motivated by seeing the test for this feature hit transient "service
  error" S3 failures (which probably have nothing to do with the changes
  in this PR)
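
For context on why deferring to the scrubber is safe (this PR does not change the scrubber's GC itself): an ancestor-owned layer only becomes garbage once no child shard's index references it, and the new test drives exactly that transition. A conceptual sketch of that decision, with made-up types rather than the scrubber's real ones:

```rust
use std::collections::HashSet;

/// Conceptual sketch only (not the scrubber's real code): a layer stored under an
/// ancestor shard's prefix may be deleted once no current child shard's index still
/// references it. The real scrubber also applies a minimum age (`min_age_secs` in the
/// test below) so it does not race with in-flight uploads.
fn ancestor_layer_is_garbage(layer_name: &str, child_shard_indices: &[HashSet<String>]) -> bool {
    child_shard_indices
        .iter()
        .all(|index| !index.contains(layer_name))
}

fn main() {
    let child_a: HashSet<String> = ["delta-1".to_string()].into_iter().collect();
    let child_b: HashSet<String> = ["image-2".to_string()].into_iter().collect();
    let children = vec![child_a, child_b];

    // Still referenced by a child index: keep it.
    assert!(!ancestor_layer_is_garbage("delta-1", &children));
    // Referenced by nobody: the scrubber may delete it.
    assert!(ancestor_layer_is_garbage("orphan-delta", &children));
}
```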
Author: John Spray
Date: 2024-08-02 08:00:46 +01:00
Committed by: GitHub
Parent: e7477855b7
Commit: c53799044d
3 changed files with 97 additions and 1 deletion


@@ -1378,6 +1378,18 @@ impl RemoteTimelineClient {
            .dirty
            .layer_metadata
            .drain()
            .filter(|(_file_name, meta)| {
                // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
                // all shards anyway, we _could_ delete these, but
                // - it creates a potential race if other shards are still
                //   using the layers while this shard deletes them.
                // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
                //   these timelines are present but corrupt (their index exists but some layers don't)
                //
                // These layers will eventually be cleaned up by the scrubber when it does physical GC.
                meta.shard.shard_number == self.tenant_shard_id.shard_number
                    && meta.shard.shard_count == self.tenant_shard_id.shard_count
            })
            .map(|(file_name, meta)| {
                remote_layer_path(
                    &self.tenant_shard_id.tenant_id,
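
The `ShardIndex` comparison is sufficient because a layer's remote path is derived from the shard recorded in its metadata (presumably what `remote_layer_path` is fed just below the cut-off), so ancestor-owned layers live under the ancestor's prefix and remain readable by every child shard. A rough illustration; the path layout here is hypothetical, not the pageserver's real format:

```rust
/// Hypothetical path construction, for illustration only; the pageserver's real
/// `remote_layer_path` encodes more (for example, generation numbers) and may use a
/// different shard-slug format.
fn illustrative_layer_path(
    tenant_id: &str,
    owning_shard: (u8, u8), // (shard_number, shard_count) of the shard that wrote the layer
    timeline_id: &str,
    layer_file_name: &str,
) -> String {
    let (number, count) = owning_shard;
    // Keying the prefix on the *owning* shard means a child shard's index can point at
    // objects that sit under an ancestor's prefix.
    format!("tenants/{tenant_id}-{number:02x}{count:02x}/timelines/{timeline_id}/{layer_file_name}")
}

fn main() {
    // An ancestor (shard 0 of 2) wrote this layer; a child (shard 1 of 4) may still reference it.
    let path = illustrative_layer_path("mytenant", (0, 2), "mytimeline", "some-delta-layer");
    assert_eq!(path, "tenants/mytenant-0002/timelines/mytimeline/some-delta-layer");
}
```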


@@ -16,6 +16,7 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use aws_config::retry::{RetryConfigBuilder, RetryMode};
use aws_sdk_s3::config::Region;
use aws_sdk_s3::error::DisplayErrorContext;
use aws_sdk_s3::Client;
@@ -314,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
}
async fn init_s3_client(bucket_region: Region) -> Client {
    let mut retry_config_builder = RetryConfigBuilder::new();

    retry_config_builder
        .set_max_attempts(Some(3))
        .set_mode(Some(RetryMode::Adaptive));

    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
        .retry_config(retry_config_builder.build())
        .load()
        .await;

    Client::new(&config)
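
A side note on the retry settings: `RetryMode::Adaptive` makes the SDK rate-limit its own requests when it observes throttling or transient failures, which suits a bulk scanner like the scrubber, and three attempts bounds how long any single call can stall. Assuming the same SDK version also exposes `RetryConfig::adaptive()` and `with_max_attempts`, an equivalent construction could look like this (a sketch, not the code in this PR):

```rust
use aws_config::retry::RetryConfig;
use aws_sdk_s3::{config::Region, Client};

// Sketch of an equivalent client setup: adaptive retries, at most three attempts per request.
async fn init_s3_client_sketch(bucket_region: Region) -> Client {
    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
        .load()
        .await;
    Client::new(&config)
}
```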


@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors(
    # attach it, to drop any local state, then check it's still readable.
    workload.stop()
    drop_local_state(env, tenant_id)

    workload.validate()


def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
    """
    When we delete a timeline after a shard split, the child shards do not directly delete the
    layers in the ancestor shards. They rely on the scrubber to clean up.
    """
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()
    env.neon_cli.create_tenant(
        tenant_id,
        timeline_id,
        shard_count=None,
        conf={
            # Small layers and low compaction thresholds, so that when we split we can expect some to
            # be dropped by child shards
            "checkpoint_distance": f"{1024 * 1024}",
            "compaction_threshold": "1",
            "compaction_target_size": f"{1024 * 1024}",
            "image_creation_threshold": "2",
            "image_layer_creation_check_threshold": "0",
            # Disable background compaction, we will do it explicitly
            "compaction_period": "0s",
            # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
            # and makes them GC'able
            "pitr_interval": "0s",
        },
    )

    # Make sure the original shard has some layers
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(100)

    new_shard_count = 4
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)

    # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
    #
    # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber
    # doesn't know about it, and won't perceive its ancestors as ancestors.
    other_timeline_id = TimelineId.generate()
    env.storage_controller.pageserver_api().timeline_create(
        PgVersion.NOT_SET, tenant_id, other_timeline_id
    )

    # Write after split so that child shards have some indices in S3
    workload.write_rows(100, upload=False)
    for shard in shards:
        ps = env.get_tenant_pageserver(shard)
        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
        ps.http_client().timeline_checkpoint(
            shard, timeline_id, compact=False, wait_until_uploaded=True
        )

    # The timeline still exists in child shards and they reference its layers, so scrubbing
    # now shouldn't delete anything.
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] == 0
    assert gc_summary["ancestor_layers_deleted"] == 0

    # Delete the timeline
    env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)

    # Subsequently doing physical GC should clean up the ancestor layers
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] == 0
    assert gc_summary["ancestor_layers_deleted"] > 0


def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
    """
    Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards