pageserver: post-shard-split layer rewrites (2/2) (#7531)

## Problem

- After a shard split of a large existing tenant, child tenants can end
up with oversized historic layers indefinitely, if those layers are
prevented from being GC'd by branchpoints.

This PR follows https://github.com/neondatabase/neon/pull/7531, and adds
rewriting of layers that contain a mixture of needed & un-needed
contents, in addition to dropping un-needed layers.

Closes: https://github.com/neondatabase/neon/issues/7504

## Summary of changes

- Add methods to ImageLayer for reading back existing layers
- Extend `compact_shard_ancestors` to rewrite layer files that contain a
mixture of keys that we want and keys we do not, if unwanted keys are
the majority of those in the file.
- Amend initialization code to handle multiple layers with the same
LayerName properly
- Get rid of renaming bad layer files to `.old` since that's now
expected on restarts during rewrites.
This commit is contained in:
John Spray
2024-05-24 09:33:19 +01:00
committed by GitHub
parent c1f4028fc0
commit 3860bc9c6c
8 changed files with 545 additions and 190 deletions

View File

@@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils):
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
)
def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
def list_layers(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
) -> list[Path]:
"""
Inspect local storage on a pageserver to discover which layer files are present.

View File

@@ -177,7 +177,16 @@ def test_sharding_split_unsharded(
env.storage_controller.consistency_check()
def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"failpoint",
[
None,
"compact-shard-ancestors-localonly",
"compact-shard-ancestors-enqueued",
"compact-shard-ancestors-persistent",
],
)
def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
"""
Test that after a split, we clean up parent layer data in the child shards via compaction.
"""
@@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
"image_layer_creation_check_threshold": "0",
}
neon_env_builder.storage_controller_config = {
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
"max_unavailable": "300s"
}
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
# Split one shard into two
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
# Let all shards move into their stable locations, so that during subsequent steps we
# don't have reconciles in progress (simpler to reason about what messages we expect in logs)
env.storage_controller.reconcile_until_idle()
# Check we got the shard IDs we expected
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
@@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
# Compaction shouldn't make anything unreadable
workload.validate()
# Force a generation increase: layer rewrites are a long-term thing and only happen after
# the generation has increased.
env.pageserver.stop()
env.pageserver.start()
# Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant
env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"})
env.storage_controller.reconcile_until_idle()
for shard in shards:
ps = env.get_tenant_pageserver(shard)
# Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior
# across restarts, as we will have local layer files that temporarily disagree with the remote metadata
# for the same local layer file name.
if failpoint is not None:
ps.http_client().configure_failpoints((failpoint, "exit"))
# Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten)
# Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively.
ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
# We will compare stats before + after compaction
detail_before = ps.http_client().timeline_detail(shard, timeline_id)
# Invoke compaction: this should rewrite layers that are behind the pitr horizon
try:
ps.http_client().timeline_compact(shard, timeline_id)
except requests.ConnectionError as e:
if failpoint is None:
raise e
else:
log.info(f"Compaction failed (failpoint={failpoint}): {e}")
if failpoint in (
"compact-shard-ancestors-localonly",
"compact-shard-ancestors-enqueued",
):
# If we left local files that don't match remote metadata, we expect warnings on next startup
env.pageserver.allowed_errors.append(
".*removing local file .+ because it has unexpected length.*"
)
# Post-failpoint: we check that the pageserver comes back online happily.
env.pageserver.running = False
env.pageserver.start()
else:
assert failpoint is None # We shouldn't reach success path if a failpoint was set
detail_after = ps.http_client().timeline_detail(shard, timeline_id)
# Physical size should shrink because layers are smaller
assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
# Validate size statistics
for shard in shards:
ps = env.get_tenant_pageserver(shard)
timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
reported_size = timeline_info["current_physical_size"]
layer_paths = ps.list_layers(shard, timeline_id)
measured_size = 0
for p in layer_paths:
abs_path = ps.timeline_dir(shard, timeline_id) / p
measured_size += os.stat(abs_path).st_size
log.info(
f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
)
if failpoint in (
"compact-shard-ancestors-localonly",
"compact-shard-ancestors-enqueued",
):
# If we injected a failure between local rewrite and remote upload, then after
# restart we may end up with neither version of the file on local disk (the new file
# is cleaned up because it doesn't match remote metadata). So local size isn't
# necessarily going to match remote physical size.
continue
assert measured_size == reported_size
# Compaction shouldn't make anything unreadable
workload.validate()
def test_sharding_split_smoke(
neon_env_builder: NeonEnvBuilder,