Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-15 09:22:55 +00:00)
storage scrubber: GC ancestor shard layers (#8196)
## Problem

After a shard split, the pageserver leaves the ancestor shard's content in place. It may still be referenced by child shards, but eventually the child shards de-reference most ancestor layers as they write their own data and run GC. We would like to eventually clean up those ancestor layers to reclaim space.

## Summary of changes

- Extend the physical GC command with `--mode=full`, which includes cleaning up unreferenced ancestor shard layers (a usage sketch follows the commit metadata below).
- Add the test `test_scrubber_physical_gc_ancestors`.
- Remove colored log output: in testing it is irritating ANSI code spam in logs, and in interactive use it doesn't add much.
- Refactor the storage controller API client code out of `storcon_client` into a `storage_controller/client` crate.
- During physical GC of ancestors, call into the storage controller to check that the latest shards seen in S3 reflect the latest state of the tenant and that no shard split is in progress.
Committed by Christian Schwarz · parent 9b883e4651 · commit affe408433
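
Before the diff, a minimal usage sketch of the new `mode` parameter, based only on the fixture signature visible in the change below. Only `env.storage_scrubber`, `pageserver_physical_gc`, and `mode="full"` come from this commit; the helper name, tenant setup, and `min_age_secs` value are illustrative assumptions.

```python
# Hypothetical sketch (not the test added by this commit): run the scrubber's
# physical GC in full mode against a tenant that has already been shard-split.
def run_full_physical_gc(env, tenant_id, min_age_secs=3600):
    # mode="full" additionally removes ancestor shard layers that no child
    # shard references any more; without it, ancestor layers are left alone.
    env.storage_scrubber.pageserver_physical_gc(
        min_age_secs=min_age_secs,
        tenant_ids=[tenant_id],
        mode="full",
    )
```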
@@ -997,7 +997,7 @@ class NeonEnvBuilder:
         if self.scrub_on_exit:
             try:
-                StorageScrubber(self).scan_metadata()
+                self.env.storage_scrubber.scan_metadata()
             except Exception as e:
                 log.error(f"Error during remote storage scrub: {e}")
                 cleanup_error = e
@@ -1225,6 +1225,9 @@ class NeonEnv:
             )
             cfg["safekeepers"].append(sk_cfg)
 
+        # Scrubber instance for tests that use it, and for use during teardown checks
+        self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
+
         log.info(f"Config: {cfg}")
         self.neon_cli.init(
             cfg,
@@ -4265,9 +4268,9 @@ class Safekeeper(LogUtils):
 
 
 class StorageScrubber:
-    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
+    def __init__(self, env: NeonEnv, log_dir: Path):
         self.env = env
-        self.log_dir = log_dir or env.test_output_dir
+        self.log_dir = log_dir
 
     def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -4284,11 +4287,14 @@ class StorageScrubber:
         if s3_storage.endpoint is not None:
             env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
 
-        base_args = [str(self.env.neon_binpath / "storage_scrubber")]
+        base_args = [
+            str(self.env.neon_binpath / "storage_scrubber"),
+            f"--controller-api={self.env.storage_controller_api}",
+        ]
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
-            self.env.test_output_dir,
+            self.log_dir,
             args,
             echo_stderr=True,
             echo_stdout=True,
@@ -4327,7 +4333,10 @@ class StorageScrubber:
         log.info(f"tenant-snapshot output: {stdout}")
 
     def pageserver_physical_gc(
-        self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
+        self,
+        min_age_secs: int,
+        tenant_ids: Optional[list[TenantId]] = None,
+        mode: Optional[str] = None,
     ):
         args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
 
@@ -4337,6 +4346,9 @@ class StorageScrubber:
             for tenant_id in tenant_ids:
                 args.extend(["--tenant-id", str(tenant_id)])
 
+        if mode is not None:
+            args.extend(["--mode", mode])
+
         stdout = self.scrubber_cli(
             args,
             timeout=30,
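
Taken together, the `scrubber_cli` and `pageserver_physical_gc` changes above mean that a call such as `pageserver_physical_gc(min_age_secs=3600, mode="full")` assembles a command line roughly like the sketch below; the binary path and controller URL are illustrative placeholders for `self.env.neon_binpath` and `self.env.storage_controller_api`.

```python
# Illustrative argv only; the path and URL are placeholders, not values from
# the repository. base_args (binary + --controller-api) come first, followed
# by the subcommand arguments built by pageserver_physical_gc.
argv = [
    "/path/to/neon/bin/storage_scrubber",
    "--controller-api=http://127.0.0.1:1234",
    "pageserver-physical-gc",
    "--min-age", "3600s",
    "--mode", "full",
]
```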