From 38022ff11c1307d36b83070612d705f07f5f3437 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 1 Mar 2023 17:54:59 +0100
Subject: [PATCH] gc: only decrement resident size if GC'd layer is resident

Before this patch, GC would call PersistentLayer::delete() on every
GC'd layer. RemoteLayer::delete() returned Ok(()) unconditionally, so
GC would then proceed to decrement the resident size metric, even
though the layer is a RemoteLayer.

This patch makes the following changes:

- Rename PersistentLayer::delete() to delete_resident_layer_file().
  That name is unambiguous.
- Make RemoteLayer::delete_resident_layer_file() return an Err().
  We would have uncovered this bug if we had done that from the start.
- Change GC / Timeline::delete_historic_layer to check whether the
  layer is remote, and call delete_resident_layer_file() only if it
  is not. This brings us in line with how eviction does it.
- Add a regression test.

fixes https://github.com/neondatabase/neon/issues/3722
---
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 .../src/tenant/storage_layer/delta_layer.rs   |   2 +-
 .../src/tenant/storage_layer/image_layer.rs   |   2 +-
 .../src/tenant/storage_layer/remote_layer.rs  |   4 +-
 pageserver/src/tenant/timeline.rs             |  15 +-
 test_runner/fixtures/neon_fixtures.py         |  16 +-
 test_runner/regress/test_layer_eviction.py    | 166 ++++++++++++++++++
 7 files changed, 195 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9198cfd1df..52ce2cab42 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -364,7 +364,7 @@ pub trait PersistentLayer: Layer {
     }
 
     /// Permanently remove this layer from disk.
-    fn delete(&self) -> Result<()>;
+    fn delete_resident_layer_file(&self) -> Result<()>;
 
     fn downcast_remote_layer(self: Arc<Self>) -> Option<Arc<RemoteLayer>> {
         None
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 4d1e08322d..37719dfce5 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -438,7 +438,7 @@ impl PersistentLayer for DeltaLayer {
         ))
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index e48abd38dd..e37e001eda 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -252,7 +252,7 @@ impl PersistentLayer for ImageLayer {
         unimplemented!();
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index 8465a99339..dbce2e7888 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -155,8 +155,8 @@ impl PersistentLayer for RemoteLayer {
         bail!("cannot iterate a remote layer");
     }
 
-    fn delete(&self) -> Result<()> {
-        Ok(())
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
     }
 
     fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<Arc<RemoteLayer>> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ca0cd04bd0..101b27bb97 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1076,7 +1076,7 @@ impl Timeline {
 
         let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
             Replacement::Replaced { .. } => {
-                if let Err(e) = local_layer.delete() {
+                if let Err(e) = local_layer.delete_resident_layer_file() {
                     error!("failed to remove layer file on evict after replacement: {e:#?}");
                 }
                 // Always decrement the physical size gauge, even if we failed to delete the file.
@@ -1950,11 +1950,14 @@ impl Timeline {
         layer: Arc<dyn PersistentLayer>,
         updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
     ) -> anyhow::Result<()> {
-        let layer_size = layer.file_size();
-
-        layer.delete()?;
-        if let Some(layer_size) = layer_size {
-            self.metrics.resident_physical_size_gauge.sub(layer_size);
+        if !layer.is_remote_layer() {
+            layer.delete_resident_layer_file()?;
+            let layer_file_size = layer
+                .file_size()
+                .expect("Local layer should have a file size");
+            self.metrics
+                .resident_physical_size_gauge
+                .sub(layer_file_size);
         }
 
         // TODO Removing from the bottom of the layer map is expensive.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 56b56b8578..49218f3c98 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import tempfile
 import textwrap
 import time
 import uuid
+from collections import defaultdict
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
@@ -1516,6 +1517,11 @@ class PageserverHttpClient(requests.Session):
 
         assert res.status_code == 200
 
+    def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
+        info = self.layer_map_info(tenant_id, timeline_id)
+        for layer in info.historic_layers:
+            self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
 
 @dataclass
 class TenantConfig:
@@ -1551,6 +1557,14 @@ class LayerMapInfo:
 
         return info
 
+    def kind_count(self) -> Dict[str, int]:
+        counts: Dict[str, int] = defaultdict(int)
+        for inmem_layer in self.in_memory_layers:
+            counts[inmem_layer.kind] += 1
+        for hist_layer in self.historic_layers:
+            counts[hist_layer.kind] += 1
+        return counts
+
 
 @dataclass
 class InMemoryLayerInfo:
@@ -1567,7 +1581,7 @@ class InMemoryLayerInfo:
     )
 
 
-@dataclass
+@dataclass(frozen=True)
 class HistoricLayerInfo:
     kind: str
     layer_file_name: str
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index a03dd88c41..404bd67050 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -1,8 +1,13 @@
+import time
+
 import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
+    wait_for_last_flush_lsn,
     wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -138,3 +143,164 @@ def test_basic_eviction(
     assert (
         redownloaded_layer_map_info == initial_layer_map_info
     ), "Should have the same layer map after redownloading the evicted layers"
+
+
+def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_gc_of_remote_layers",
+    )
+
+    env = neon_env_builder.init_start()
+
+    tenant_config = {
+        "pitr_interval": "1s",  # set to non-zero, so GC actually does something
+        "gc_period": "0s",  # we want to control when GC runs
+        "compaction_period": "0s",  # we want to control when compaction runs
+        "checkpoint_timeout": "24h",  # something we won't reach
+        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
+        "compaction_threshold": "3",
+        # "image_creation_threshold": set at runtime
+        "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
+    }
+
+    def tenant_update_config(changes):
+        tenant_config.update(changes)
+        env.neon_cli.config_tenant(tenant_id, tenant_config)
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
+    log.info("tenant id is %s", tenant_id)
+    env.initial_tenant = tenant_id  # update_and_gc relies on this
+    ps_http = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main")
+
+    log.info("fill with data, creating delta & image layers, some of which are GC'able afterwards")
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with pg.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # Create delta layers, then turn them into image layers.
+    # Do it multiple times so that there's something to GC.
+    for k in range(0, 2):
+        # produce delta layers => disable image layer creation by setting a high threshold
+        tenant_update_config({"image_creation_threshold": "100"})
+        for i in range(0, 2):
+            for j in range(0, 3):
+                # create a minimal amount of "delta difficulty" for this table
+                with pg.cursor() as cur:
+                    cur.execute("update a set some_value = -some_value + %s", (j,))
+
+                with pg.cursor() as cur:
+                    # vacuuming should help reuse keys, though it's not really important
+                    # with image_creation_threshold=1, which we will use on the last compaction
+                    cur.execute("vacuum")
+
+                wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+                if i == 1 and j == 2 and k == 1:
+                    # last iteration; stop before checkpoint to avoid leaving an in-memory layer
+                    pg.stop_and_destroy()
+
+                ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+            # image layers should not be created yet, because the threshold is too high,
+            # but compaction will reshuffle these deltas into L1 layers
+            ps_http.timeline_compact(tenant_id, timeline_id)
+
+    for _ in range(0, 20):
+        # loop in case flushing is still in progress
+        layers = ps_http.layer_map_info(tenant_id, timeline_id)
+        if not layers.in_memory_layers:
+            break
+        time.sleep(0.2)
+
+    # now that we've grown some delta layers, turn them into image layers
+    tenant_update_config({"image_creation_threshold": "1"})
+    ps_http.timeline_compact(tenant_id, timeline_id)
+
+    # wait for all uploads to finish
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    # shut down safekeepers to avoid on-demand downloads from walreceiver
+    for sk in env.safekeepers:
+        sk.stop()
+
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    log.info("ensure the code above produced image and delta layers")
+    pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", pre_evict_info)
+    by_kind = pre_evict_info.kind_count()
+    log.info("by kind: %s", by_kind)
+    assert by_kind["Image"] > 0
+    assert by_kind["Delta"] > 0
+    assert by_kind["InMemory"] == 0
+
+    resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+    log.info("resident layers count before eviction: %s", len(resident_layers))
+
+    log.info("evict all layers")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    def ensure_resident_and_remote_size_metrics():
+        log.info("ensure that all the layer files are gone")
+        resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+        # we have disabled all background loops, so this should hold
+        assert len(resident_layers) == 0
+
+        info = ps_http.layer_map_info(tenant_id, timeline_id)
+        log.info("layer map dump: %s", info)
+
+        log.info("ensure that resident_physical_size metric is zero")
+        resident_physical_size_metric = ps_http.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+        assert resident_physical_size_metric == 0
+        log.info("ensure that resident_physical_size metric corresponds to layer map dump")
+        assert resident_physical_size_metric == sum(
+            [layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
+        )
+
+        # TODO: the following would be nice to assert, but for some reason, the commented-out
+        # assert below fails with 113401856.0 != 140427264
+        # => https://github.com/neondatabase/neon/issues/3738
+        #
+        # log.info("ensure that remote_physical_size metric matches layer map")
+        # remote_physical_size_metric = ps_http.get_timeline_metric(
+        #     tenant_id, timeline_id, "pageserver_remote_physical_size"
+        # )
+        # log.info("ensure that remote_physical_size metric corresponds to layer map dump")
+        # assert remote_physical_size_metric == sum(
+        #     [layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote]
+        # )
+
+    log.info("before running GC, ensure that resident size is zero")
+    ensure_resident_and_remote_size_metrics()
+
+    log.info("run GC")
+    time.sleep(2)  # let pitr_interval + 1 second pass
+    ps_http.timeline_gc(tenant_id, timeline_id, 0)
+    time.sleep(1)
+    assert not env.pageserver.log_contains("Nothing to GC")
+
+    log.info("ensure GC deleted some layers, otherwise this test is pointless")
+    post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", post_gc_info)
+    log.info("by kind: %s", post_gc_info.kind_count())
+    pre_evict_layers = set([layer.layer_file_name for layer in pre_evict_info.historic_layers])
+    post_gc_layers = set([layer.layer_file_name for layer in post_gc_info.historic_layers])
+    assert post_gc_layers.issubset(pre_evict_layers)
+    assert len(post_gc_layers) < len(pre_evict_layers)
+
+    log.info("update_gc_info might download some layers. Evict them again.")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    log.info("after running GC, ensure that resident size is still zero")
+    ensure_resident_and_remote_size_metrics()
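
Illustrative note (not part of the patch): the program below is a minimal,
self-contained Rust sketch of the invariant Timeline::delete_historic_layer
enforces after this change. The Gauge type, the reduced trait, and the free
function are simplified stand-ins for the pageserver's metrics gauge and
PersistentLayer trait, not its actual code.

    use std::sync::atomic::{AtomicU64, Ordering};

    // Stand-in for the pageserver_resident_physical_size prometheus gauge.
    struct Gauge(AtomicU64);

    impl Gauge {
        fn sub(&self, v: u64) {
            self.0.fetch_sub(v, Ordering::Relaxed);
        }
        fn get(&self) -> u64 {
            self.0.load(Ordering::Relaxed)
        }
    }

    // Reduced PersistentLayer: only the methods the GC deletion path uses.
    trait PersistentLayer {
        fn is_remote_layer(&self) -> bool;
        fn file_size(&self) -> Option<u64>;
        fn delete_resident_layer_file(&self) -> Result<(), String>;
    }

    // The fixed GC deletion path: delete the layer file and decrement the
    // resident size gauge only if the layer is actually resident. Before
    // this patch, the gauge was decremented for RemoteLayers too, skewing
    // the metric.
    fn delete_historic_layer(
        layer: &dyn PersistentLayer,
        resident_physical_size_gauge: &Gauge,
    ) -> Result<(), String> {
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
            let layer_file_size = layer
                .file_size()
                .expect("local layer should have a file size");
            resident_physical_size_gauge.sub(layer_file_size);
        }
        // Removal from the layer map would follow here, for local and
        // remote layers alike.
        Ok(())
    }

    // A layer whose file exists only in remote storage.
    struct RemoteLayer;

    impl PersistentLayer for RemoteLayer {
        fn is_remote_layer(&self) -> bool {
            true
        }
        fn file_size(&self) -> Option<u64> {
            None
        }
        fn delete_resident_layer_file(&self) -> Result<(), String> {
            Err("remote layer has no layer file".to_string())
        }
    }

    fn main() {
        let gauge = Gauge(AtomicU64::new(0));
        // GC'ing a remote layer must leave the resident size gauge untouched.
        delete_historic_layer(&RemoteLayer, &gauge).unwrap();
        assert_eq!(gauge.get(), 0);
    }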