storage scrubber: GC ancestor shard layers (#8196)

## Problem

After a shard split, the pageserver leaves the ancestor shard's content
in place. It may be referenced by child shards, but eventually child
shards will de-reference most ancestor layers as they write their own
data and do GC. We would like to eventually clean up those ancestor
layers to reclaim space.

## Summary of changes

- Extend the physical GC command with `--mode=full`, which also cleans up
unreferenced ancestor shard layers (see the usage sketch after this list)
- Add test `test_scrubber_physical_gc_ancestors`
- Remove colored log output: in testing it is irritating ANSI-code spam in the
logs, and in interactive use it doesn't add much.
- Refactor storage controller API client code out of storcon_client into
a `storage_controller/client` crate
- During physical GC of ancestors, call into the storage controller to check
that the latest shards seen in S3 reflect the latest state of the tenant and
that no shard split is in progress.
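
As a reference, here is a minimal sketch (not part of this commit) of driving the new mode through the test fixture this PR introduces. The `env.storage_scrubber` fixture, the `mode="full"` argument, and the summary keys are taken from the diff below; the test body itself is purely illustrative:

```python
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import s3_storage


def test_full_physical_gc_sketch(neon_env_builder: NeonEnvBuilder):
    # The scrubber fixture only works against S3-style remote storage.
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start()

    # mode="full" adds ancestor-layer cleanup on top of the existing index GC.
    # The fixture also passes --controller-api, so the scrubber can confirm with
    # the storage controller that no shard split is in progress before deleting.
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
    assert gc_summary["remote_storage_errors"] == 0
    assert gc_summary["indices_deleted"] == 0
    assert gc_summary["ancestor_layers_deleted"] == 0  # nothing to clean before any split
```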
Author: John Spray, 2024-07-19 17:07:59 +01:00
Committed by: GitHub
Commit: 44781518d0 (parent: 16071e57c6)
24 changed files with 905 additions and 191 deletions

View File

@@ -997,7 +997,7 @@ class NeonEnvBuilder:
if self.scrub_on_exit:
try:
StorageScrubber(self).scan_metadata()
self.env.storage_scrubber.scan_metadata()
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -1225,6 +1225,9 @@ class NeonEnv:
)
cfg["safekeepers"].append(sk_cfg)
# Scrubber instance for tests that use it, and for use during teardown checks
self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
log.info(f"Config: {cfg}")
self.neon_cli.init(
cfg,
@@ -4265,9 +4268,9 @@ class Safekeeper(LogUtils):
class StorageScrubber:
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
def __init__(self, env: NeonEnv, log_dir: Path):
self.env = env
self.log_dir = log_dir or env.test_output_dir
self.log_dir = log_dir
def scrubber_cli(self, args: list[str], timeout) -> str:
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -4284,11 +4287,14 @@ class StorageScrubber:
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
base_args = [
str(self.env.neon_binpath / "storage_scrubber"),
f"--controller-api={self.env.storage_controller_api}",
]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(
self.env.test_output_dir,
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
@@ -4327,7 +4333,10 @@ class StorageScrubber:
log.info(f"tenant-snapshot output: {stdout}")
def pageserver_physical_gc(
self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
self,
min_age_secs: int,
tenant_ids: Optional[list[TenantId]] = None,
mode: Optional[str] = None,
):
args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
@@ -4337,6 +4346,9 @@ class StorageScrubber:
for tenant_id in tenant_ids:
args.extend(["--tenant-id", str(tenant_id)])
if mode is not None:
args.extend(["--mode", mode])
stdout = self.scrubber_cli(
args,
timeout=30,

View File

@@ -22,7 +22,6 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
StorageScrubber,
generate_uploads_and_deletions,
)
from fixtures.pageserver.common_types import parse_layer_file_name
@@ -215,7 +214,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = StorageScrubber(neon_env_builder).scan_metadata()
metadata_summary = env.storage_scrubber.scan_metadata()
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict, Optional
import pytest
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_prefix_empty,
@@ -234,7 +234,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
# Having done a bunch of attach/detach cycles, we will have generated some index garbage: check
# that the scrubber sees it and cleans it up. We do this before the final attach+validate pass,
# to also validate that the scrubber isn't breaking anything.
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] > 0
@@ -555,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Scrub the remote storage
# ========================
# This confirms that the scrubber isn't upset by the presence of the heatmap
StorageScrubber(neon_env_builder).scan_metadata()
env.storage_scrubber.scan_metadata()
# Detach secondary and delete tenant
# ===================================

View File

@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
StorageControllerApiException,
StorageScrubber,
last_flush_lsn_upload,
tenant_get_shards,
wait_for_last_flush_lsn,
@@ -128,7 +127,7 @@ def test_sharding_smoke(
# Check the scrubber isn't confused by sharded content, then disable
# it during teardown because we'll have deleted by then
StorageScrubber(neon_env_builder).scan_metadata()
env.storage_scrubber.scan_metadata()
neon_env_builder.scrub_on_exit = False
env.storage_controller.pageserver_api().tenant_delete(tenant_id)

View File

@@ -1,14 +1,19 @@
import os
import shutil
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
StorageScrubber,
)
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -60,8 +65,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
output_path = neon_env_builder.test_output_dir / "snapshot"
os.makedirs(output_path)
scrubber = StorageScrubber(neon_env_builder)
scrubber.tenant_snapshot(tenant_id, output_path)
env.storage_scrubber.tenant_snapshot(tenant_id, output_path)
assert len(os.listdir(output_path)) > 0
@@ -111,6 +115,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
workload.validate()
def drop_local_state(env: NeonEnv, tenant_id: TenantId):
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.reconcile_until_idle()
env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
env.storage_controller.reconcile_until_idle()
@pytest.mark.parametrize("shard_count", [None, 4])
def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
@@ -133,28 +145,231 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
# For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads
for _i in range(0, n_cycles):
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.reconcile_until_idle()
env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
env.storage_controller.reconcile_until_idle()
drop_local_state(env, tenant_id)
# This write includes remote upload, will generate an index in this generation
workload.write_rows(1)
# With a high min_age, the scrubber should decline to delete anything
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
# If targeting a different tenant, the scrubber shouldn't do anything
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(
gc_summary = env.storage_scrubber.pageserver_physical_gc(
min_age_secs=1, tenant_ids=[TenantId.generate()]
)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
# With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count
@pytest.mark.parametrize("shard_count", [None, 2])
def test_scrubber_physical_gc_ancestors(
neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
):
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id,
timeline_id,
shard_count=shard_count,
conf={
# Small layers and low compaction thresholds, so that when we split we can expect some to
# be dropped by child shards
"checkpoint_distance": f"{1024 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{1024 * 1024}",
"image_creation_threshold": "2",
"image_layer_creation_check_threshold": "0",
# Disable background compaction, we will do it explicitly
"compaction_period": "0s",
# No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
# and makes them GC'able
"pitr_interval": "0s",
},
)
# Make sure the original shard has some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
new_shard_count = 4
assert shard_count is None or new_shard_count > shard_count
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
# Make sure child shards have some layers
workload.write_rows(100)
# Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once
# a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even
# if they were logically deleted before the shard split, just not physically deleted yet because of the queue.
for ps in env.pageservers:
ps.http_client().deletion_queue_flush(execute=True)
# Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber
# should not erase any ancestor layers
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["ancestor_layers_deleted"] == 0
# Write some data and compact: compacting, some ancestor layers should no longer be needed by children
# (the compaction is part of the checkpoint that Workload does for us)
workload.churn_rows(100)
workload.churn_rows(100)
workload.churn_rows(100)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
ps.http_client().timeline_compact(shard, timeline_id)
ps.http_client().timeline_gc(shard, timeline_id, 0)
# We will use a min_age_secs=1 threshold for deletion, let it pass
time.sleep(2)
# Our time threshold should be respected: check that with a high threshold we delete nothing
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["ancestor_layers_deleted"] == 0
# Now run with a low time threshold: deletions of ancestor layers should be executed
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["ancestor_layers_deleted"] > 0
# We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and
# attach it, to drop any local state, then check it's still readable.
workload.stop()
drop_local_state(env, tenant_id)
workload.validate()
def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
"""
Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards
which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly
GC any ancestor layers.
"""
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
initial_shard_count = 2
env.neon_cli.create_tenant(
tenant_id,
timeline_id,
shard_count=initial_shard_count,
conf={
# Small layers and low compaction thresholds, so that when we split we can expect some to
# be dropped by child shards
"checkpoint_distance": f"{1024 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{1024 * 1024}",
"image_creation_threshold": "2",
"image_layer_creation_check_threshold": "0",
# Disable background compaction, we will do it explicitly
"compaction_period": "0s",
# No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
# and makes them GC'able
"pitr_interval": "0s",
},
)
unstuck = threading.Event()
def stuck_split():
# Pause our shard split after the first shard but before the second, such that when we run
the scrub, the S3 bucket contains shards 0002, 0102, 0004, 0204 (but not 0104, 0304).
env.storage_controller.configure_failpoints(
("shard-split-post-remote-sleep", "return(3600000)")
)
try:
split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
except Exception as e:
log.info(f"Split failed with {e}")
else:
if not unstuck.is_set():
raise RuntimeError(f"Split succeeded unexpectedly ({split_response})")
with ThreadPoolExecutor(max_workers=1) as threads:
log.info("Starting hung shard split")
stuck_split_fut = threads.submit(stuck_split)
# Let the controller reach the failpoint
wait_until(
10,
1,
lambda: env.storage_controller.assert_log_contains(
'failpoint "shard-split-post-remote-sleep": sleeping'
),
)
# Run compaction on the new child shards, so that they drop some refs to their parent
child_shards = [
TenantShardId(tenant_id, 0, 4),
TenantShardId(tenant_id, 2, 4),
]
log.info("Compacting first two children")
for child in child_shards:
env.get_tenant_pageserver(
TenantShardId(tenant_id, 0, initial_shard_count)
).http_client().timeline_compact(child, timeline_id)
# Check that the other child shards weren't created
assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None
assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None
# Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all
# ancestor layers as a reason to GC them, because it should realize that a split is in progress.
# (GC requires that controller does not indicate split in progress, and that if we see the highest
# shard count N, then there are N shards present with that shard count).
gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
log.info(f"Ran physical GC partway through split: {gc_output}")
assert gc_output["ancestor_layers_deleted"] == 0
assert gc_output["remote_storage_errors"] == 0
assert gc_output["controller_api_errors"] == 0
# Storage controller shutdown lets our split request client complete
log.info("Stopping storage controller")
unstuck.set()
env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*")
env.storage_controller.stop()
stuck_split_fut.result()
# Restart the controller and retry the split with the failpoint disabled, this should
# complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers
log.info("Starting & retrying split")
env.storage_controller.start()
env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
# The other child shards exist now, we can compact them to drop refs to ancestor
log.info("Compacting second two children")
for child in [
TenantShardId(tenant_id, 1, 4),
TenantShardId(tenant_id, 3, 4),
]:
env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id)
gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
log.info(f"Ran physical GC after split completed: {gc_output}")
assert gc_output["ancestor_layers_deleted"] > 0
assert gc_output["remote_storage_errors"] == 0
assert gc_output["controller_api_errors"] == 0
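
As an aside (not part of the diff), the safety rule the comments above describe can be summarized as: ancestor layers are only GC candidates when the controller reports no split in progress and every child of the highest shard count seen in S3 is present. Below is a small illustrative Python sketch of that precondition, with hypothetical names; it is not the scrubber's actual Rust implementation:

```python
from typing import Iterable, Tuple


def ancestor_gc_is_safe(
    shards_in_s3: Iterable[Tuple[int, int]],  # (shard_number, shard_count) pairs listed in S3
    split_in_progress: bool,  # as reported by the storage controller
) -> bool:
    # Hypothetical sketch of the scrubber's precondition, not its actual code.
    if split_in_progress:
        return False
    shards = set(shards_in_s3)
    highest_count = max(count for _, count in shards)
    # Every child of the highest shard count must already exist in S3; otherwise a
    # not-yet-created child might still need layers that look unreferenced now.
    present = {number for number, count in shards if count == highest_count}
    return present == set(range(highest_count))


# Mid-split state from the test above: shards 0002, 0102, 0004, 0204 but not 0104/0304.
# Even if the controller reported no split in progress, GC must still refuse.
assert not ancestor_gc_is_safe({(0, 2), (1, 2), (0, 4), (2, 4)}, split_in_progress=False)
```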

View File

@@ -5,7 +5,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
StorageScrubber,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
@@ -325,7 +324,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
remote_storage_kind = RemoteStorageKind.MOCK_S3
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
scrubber = StorageScrubber(neon_env_builder)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
ps_http = env.pageserver.http_client()
@@ -340,7 +338,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
env.stop()
result = scrubber.scan_metadata()
result = env.storage_scrubber.scan_metadata()
assert result["with_warnings"] == []
env.start()
@@ -348,5 +346,5 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
ps_http.tenant_delete(tenant_id)
env.stop()
scrubber.scan_metadata()
env.storage_scrubber.scan_metadata()
assert result["with_warnings"] == []