pageserver: refine how we delete timelines after shard split (#8436)

## Problem

Previously, when deleting a timeline, shards would delete layers that
belong to an ancestor shard. That is not a correctness issue: a timeline
is always deleted from all shards, so destroying data for that timeline
is clearly fine.

However, there is a race where one shard might start this deletion while
another shard has not yet received the deletion request and might still
try to access an ancestral layer. This creates ambiguity around the
"all layers referenced by my index should always exist" invariant, which
is important for detecting and reporting corruption.

Now that we have a GC mode for cleaning up ancestral layers, we can rely
on it to remove such layers instead of deleting them right away. This
makes the system easier to reason about: a shard now never deletes a
layer that belongs to a ShardIndex other than its own.
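
Concretely, the new rule during timeline deletion is: a shard deletes only layer objects whose recorded shard identity matches its own, and leaves ancestor-owned layers to scrubber GC. Below is a minimal sketch of that filter using simplified stand-in types; the real code, in `RemoteTimelineClient::delete_all`, appears in the diff further down.

```rust
/// Simplified stand-ins for the pageserver's shard identity types (illustrative only).
#[derive(PartialEq, Eq, Clone, Copy)]
struct ShardIndex {
    shard_number: u8,
    shard_count: u8,
}

struct LayerMetadata {
    /// Which shard originally wrote (and therefore owns) this layer in remote storage.
    shard: ShardIndex,
}

/// Given all layers referenced by this shard's index, return only those the shard
/// itself owns and may delete; ancestor-owned layers are left for scrubber GC.
fn layers_owned_by(
    own_shard: ShardIndex,
    layers: impl Iterator<Item = (String, LayerMetadata)>,
) -> Vec<String> {
    layers
        .filter(|(_name, meta)| meta.shard == own_shard)
        .map(|(name, _meta)| name)
        .collect()
}

fn main() {
    let me = ShardIndex { shard_number: 1, shard_count: 4 };
    let ancestor = ShardIndex { shard_number: 0, shard_count: 2 };
    let layers = vec![
        ("layer-a".to_string(), LayerMetadata { shard: me }),
        ("layer-b".to_string(), LayerMetadata { shard: ancestor }),
    ];
    // Only "layer-a" is deletable by this shard; "layer-b" stays for the scrubber.
    assert_eq!(layers_owned_by(me, layers.into_iter()), vec!["layer-a"]);
}
```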

## Summary of changes

- Modify the behavior of `RemoteTimelineClient::delete_all`
- Add `test_scrubber_physical_gc_timeline_deletion` to exercise this case
- Tweak the AWS SDK config in the scrubber to enable retries. This was
  motivated by seeing the test for this feature hit transient "service
  error" S3 failures (which probably have nothing to do with the changes
  in this PR)
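
For context on why deferring to the scrubber is safe (this PR does not change the scrubber's GC itself): an ancestor-owned layer only becomes garbage once no child shard's index references it, and the new test drives exactly that transition. A conceptual sketch of that decision, with made-up types rather than the scrubber's real ones:

```rust
use std::collections::HashSet;

/// Conceptual sketch only (not the scrubber's real code): a layer stored under an
/// ancestor shard's prefix may be deleted once no current child shard's index still
/// references it. The real scrubber also applies a minimum age (`min_age_secs` in the
/// test below) so it does not race with in-flight uploads.
fn ancestor_layer_is_garbage(layer_name: &str, child_shard_indices: &[HashSet<String>]) -> bool {
    child_shard_indices
        .iter()
        .all(|index| !index.contains(layer_name))
}

fn main() {
    let child_a: HashSet<String> = ["delta-1".to_string()].into_iter().collect();
    let child_b: HashSet<String> = ["image-2".to_string()].into_iter().collect();
    let children = vec![child_a, child_b];

    // Still referenced by a child index: keep it.
    assert!(!ancestor_layer_is_garbage("delta-1", &children));
    // Referenced by nobody: the scrubber may delete it.
    assert!(ancestor_layer_is_garbage("orphan-delta", &children));
}
```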
Author: John Spray
Date: 2024-08-02 08:00:46 +01:00
Committed by: GitHub
Parent: e7477855b7
Commit: c53799044d
3 changed files with 97 additions and 1 deletion


@@ -1378,6 +1378,18 @@ impl RemoteTimelineClient {
            .dirty
            .layer_metadata
            .drain()
            .filter(|(_file_name, meta)| {
                // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
                // all shards anyway, we _could_ delete these, but
                // - it creates a potential race if other shards are still
                //   using the layers while this shard deletes them.
                // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
                //   these timelines are present but corrupt (their index exists but some layers don't)
                //
                // These layers will eventually be cleaned up by the scrubber when it does physical GC.
                meta.shard.shard_number == self.tenant_shard_id.shard_number
                    && meta.shard.shard_count == self.tenant_shard_id.shard_count
            })
            .map(|(file_name, meta)| {
                remote_layer_path(
                    &self.tenant_shard_id.tenant_id,
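
The `ShardIndex` comparison is sufficient because a layer's remote path is derived from the shard recorded in its metadata (presumably what `remote_layer_path` is fed just below the cut-off), so ancestor-owned layers live under the ancestor's prefix and remain readable by every child shard. A rough illustration; the path layout here is hypothetical, not the pageserver's real format:

```rust
/// Hypothetical path construction, for illustration only; the pageserver's real
/// `remote_layer_path` encodes more (for example, generation numbers) and may use a
/// different shard-slug format.
fn illustrative_layer_path(
    tenant_id: &str,
    owning_shard: (u8, u8), // (shard_number, shard_count) of the shard that wrote the layer
    timeline_id: &str,
    layer_file_name: &str,
) -> String {
    let (number, count) = owning_shard;
    // Keying the prefix on the *owning* shard means a child shard's index can point at
    // objects that sit under an ancestor's prefix.
    format!("tenants/{tenant_id}-{number:02x}{count:02x}/timelines/{timeline_id}/{layer_file_name}")
}

fn main() {
    // An ancestor (shard 0 of 2) wrote this layer; a child (shard 1 of 4) may still reference it.
    let path = illustrative_layer_path("mytenant", (0, 2), "mytimeline", "some-delta-layer");
    assert_eq!(path, "tenants/mytenant-0002/timelines/mytimeline/some-delta-layer");
}
```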


@@ -16,6 +16,7 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use aws_config::retry::{RetryConfigBuilder, RetryMode};
use aws_sdk_s3::config::Region;
use aws_sdk_s3::error::DisplayErrorContext;
use aws_sdk_s3::Client;
@@ -314,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
}
async fn init_s3_client(bucket_region: Region) -> Client {
    let mut retry_config_builder = RetryConfigBuilder::new();

    retry_config_builder
        .set_max_attempts(Some(3))
        .set_mode(Some(RetryMode::Adaptive));

    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
        .retry_config(retry_config_builder.build())
        .load()
        .await;

    Client::new(&config)
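
A side note on the retry settings: `RetryMode::Adaptive` makes the SDK rate-limit its own requests when it observes throttling or transient failures, which suits a bulk scanner like the scrubber, and three attempts bounds how long any single call can stall. Assuming the same SDK version also exposes `RetryConfig::adaptive()` and `with_max_attempts`, an equivalent construction could look like this (a sketch, not the code in this PR):

```rust
use aws_config::retry::RetryConfig;
use aws_sdk_s3::{config::Region, Client};

// Sketch of an equivalent client setup: adaptive retries, at most three attempts per request.
async fn init_s3_client_sketch(bucket_region: Region) -> Client {
    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
        .load()
        .await;
    Client::new(&config)
}
```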


@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors(
    # attach it, to drop any local state, then check it's still readable.
    workload.stop()
    drop_local_state(env, tenant_id)

    workload.validate()


def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
    """
    When we delete a timeline after a shard split, the child shards do not directly delete the
    layers in the ancestor shards. They rely on the scrubber to clean up.
    """
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()
    env.neon_cli.create_tenant(
        tenant_id,
        timeline_id,
        shard_count=None,
        conf={
            # Small layers and low compaction thresholds, so that when we split we can expect some to
            # be dropped by child shards
            "checkpoint_distance": f"{1024 * 1024}",
            "compaction_threshold": "1",
            "compaction_target_size": f"{1024 * 1024}",
            "image_creation_threshold": "2",
            "image_layer_creation_check_threshold": "0",
            # Disable background compaction, we will do it explicitly
            "compaction_period": "0s",
            # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
            # and makes them GC'able
            "pitr_interval": "0s",
        },
    )

    # Make sure the original shard has some layers
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(100)

    new_shard_count = 4
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)

    # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
    #
    # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber
    # doesn't know about it, and won't perceive its ancestors as ancestors.
    other_timeline_id = TimelineId.generate()
    env.storage_controller.pageserver_api().timeline_create(
        PgVersion.NOT_SET, tenant_id, other_timeline_id
    )

    # Write after split so that child shards have some indices in S3
    workload.write_rows(100, upload=False)
    for shard in shards:
        ps = env.get_tenant_pageserver(shard)
        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
        ps.http_client().timeline_checkpoint(
            shard, timeline_id, compact=False, wait_until_uploaded=True
        )

    # The timeline still exists in child shards and they reference its layers, so scrubbing
    # now shouldn't delete anything.
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] == 0
    assert gc_summary["ancestor_layers_deleted"] == 0

    # Delete the timeline
    env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)

    # Subsequently doing physical GC should clean up the ancestor layers
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] == 0
    assert gc_summary["ancestor_layers_deleted"] > 0


def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
    """
    Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards