Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-07 13:32:57 +00:00
gc: only decrement resident size if GC'd layer is resident
Before this patch, GC would call PersistentLayer::delete() on every GC'ed layer. RemoteLayer::delete() returned Ok(()) unconditionally, so GC would then proceed to decrement the resident size metric even though the layer is a RemoteLayer.

This patch makes the following changes:

- Rename PersistentLayer::delete() to delete_resident_layer_file(). That name is unambiguous.
- Make RemoteLayer::delete_resident_layer_file() return an Err(). We would have uncovered this bug if we had done that from the start.
- Change GC / Timeline::delete_historic_layer to check whether the layer is remote, and call delete_resident_layer_file() only if it is not. This brings us in line with how eviction does it.
- Add a regression test.

fixes https://github.com/neondatabase/neon/issues/3722
committed by Christian Schwarz
parent 1b9b9d60d4
commit 38022ff11c
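Condensed, the fix is the residency guard that GC now applies before touching disk or metrics. The following is a minimal sketch with simplified stand-in types (the trait, `Gauge`, and the free function are illustrations, not the actual pageserver code; only the method names mirror the renamed API, and the `anyhow` crate is assumed as in the real code):

    use anyhow::Result;

    /// Simplified stand-in for the pageserver's layer trait.
    trait PersistentLayer {
        fn is_remote_layer(&self) -> bool;
        fn file_size(&self) -> Option<u64>;
        /// Renamed from `delete()`: only valid for layers resident on local disk.
        fn delete_resident_layer_file(&self) -> Result<()>;
    }

    /// Stand-in for the `resident_physical_size` gauge.
    struct Gauge(u64);
    impl Gauge {
        fn sub(&mut self, v: u64) {
            self.0 -= v;
        }
    }

    /// The fixed GC path: delete the file and decrement the gauge only for
    /// layers that are actually resident. Before the patch, the remote
    /// layer's delete() returned Ok(()) unconditionally, so the gauge was
    /// decremented for layers that were never resident.
    fn delete_historic_layer(layer: &dyn PersistentLayer, gauge: &mut Gauge) -> Result<()> {
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
            let size = layer
                .file_size()
                .expect("local layer should have a file size");
            gauge.sub(size);
        }
        Ok(())
    }

Guarding on `is_remote_layer()` rather than relying on delete() to silently no-op keeps the filesystem operation and the gauge accounting in one branch, matching how the eviction path already works.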
@@ -364,7 +364,7 @@ pub trait PersistentLayer: Layer {
     }
 
     /// Permanently remove this layer from disk.
-    fn delete(&self) -> Result<()>;
+    fn delete_resident_layer_file(&self) -> Result<()>;
 
     fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
         None
@@ -438,7 +438,7 @@ impl PersistentLayer for DeltaLayer {
         ))
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
@@ -252,7 +252,7 @@ impl PersistentLayer for ImageLayer {
         unimplemented!();
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
@@ -155,8 +155,8 @@ impl PersistentLayer for RemoteLayer {
         bail!("cannot iterate a remote layer");
     }
 
-    fn delete(&self) -> Result<()> {
-        Ok(())
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
     }
 
     fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
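The switch from Ok(()) to bail!() above is the fail-loud half of the fix. A hypothetical illustration (stub type, not the real RemoteLayer) of what that buys:

    use anyhow::{bail, Result};

    /// Hypothetical stub standing in for RemoteLayer.
    struct RemoteLayerStub;

    impl RemoteLayerStub {
        /// Mirrors the new behavior: there is no local file to delete.
        fn delete_resident_layer_file(&self) -> Result<()> {
            bail!("remote layer has no layer file");
        }
    }

    fn main() {
        // A caller that skips the is_remote_layer() guard now sees an Err
        // instead of proceeding to decrement the resident size gauge.
        let layer = RemoteLayerStub;
        assert!(layer.delete_resident_layer_file().is_err());
    }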
@@ -1076,7 +1076,7 @@ impl Timeline {
 
         let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
             Replacement::Replaced { .. } => {
-                if let Err(e) = local_layer.delete() {
+                if let Err(e) = local_layer.delete_resident_layer_file() {
                     error!("failed to remove layer file on evict after replacement: {e:#?}");
                 }
                 // Always decrement the physical size gauge, even if we failed to delete the file.
@@ -1950,11 +1950,14 @@ impl Timeline {
         layer: Arc<dyn PersistentLayer>,
         updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
     ) -> anyhow::Result<()> {
-        let layer_size = layer.file_size();
-
-        layer.delete()?;
-        if let Some(layer_size) = layer_size {
-            self.metrics.resident_physical_size_gauge.sub(layer_size);
+        if !layer.is_remote_layer() {
+            layer.delete_resident_layer_file()?;
+            let layer_file_size = layer
+                .file_size()
+                .expect("Local layer should have a file size");
+            self.metrics
+                .resident_physical_size_gauge
+                .sub(layer_file_size);
         }
 
         // TODO Removing from the bottom of the layer map is expensive.
@@ -14,6 +14,7 @@ import tempfile
 import textwrap
 import time
 import uuid
+from collections import defaultdict
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
@@ -1516,6 +1517,11 @@ class PageserverHttpClient(requests.Session):
 
         assert res.status_code == 200
 
+    def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
+        info = self.layer_map_info(tenant_id, timeline_id)
+        for layer in info.historic_layers:
+            self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
 
 @dataclass
 class TenantConfig:
@@ -1551,6 +1557,14 @@ class LayerMapInfo:
 
         return info
 
+    def kind_count(self) -> Dict[str, int]:
+        counts: Dict[str, int] = defaultdict(int)
+        for inmem_layer in self.in_memory_layers:
+            counts[inmem_layer.kind] += 1
+        for hist_layer in self.historic_layers:
+            counts[hist_layer.kind] += 1
+        return counts
+
 
 @dataclass
 class InMemoryLayerInfo:
@@ -1567,7 +1581,7 @@ class InMemoryLayerInfo:
     )
 
 
-@dataclass
+@dataclass(frozen=True)
 class HistoricLayerInfo:
     kind: str
     layer_file_name: str
@@ -1,8 +1,13 @@
+import time
+
 import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
+    wait_for_last_flush_lsn,
     wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -138,3 +143,164 @@ def test_basic_eviction(
     assert (
         redownloaded_layer_map_info == initial_layer_map_info
     ), "Should have the same layer map after redownloading the evicted layers"
+
+
+def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_gc_of_remote_layers",
+    )
+
+    env = neon_env_builder.init_start()
+
+    tenant_config = {
+        "pitr_interval": "1s",  # set to non-zero, so GC actually does something
+        "gc_period": "0s",  # we want to control when GC runs
+        "compaction_period": "0s",  # we want to control when compaction runs
+        "checkpoint_timeout": "24h",  # something we won't reach
+        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
+        "compaction_threshold": "3",
+        # "image_creation_threshold": set at runtime
+        "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
+    }
+
+    def tenant_update_config(changes):
+        tenant_config.update(changes)
+        env.neon_cli.config_tenant(tenant_id, tenant_config)
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
+    log.info("tenant id is %s", tenant_id)
+    env.initial_tenant = tenant_id  # update_and_gc relies on this
+    ps_http = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main")
+
+    log.info("fill with data, creating delta & image layers, some of which are GC'able after")
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with pg.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # Create delta layers, then turn them into image layers.
+    # Do it multiple times so that there's something to GC.
+    for k in range(0, 2):
+        # produce delta layers => disable image layer creation by setting a high threshold
+        tenant_update_config({"image_creation_threshold": "100"})
+        for i in range(0, 2):
+            for j in range(0, 3):
+                # create a minimal amount of "delta difficulty" for this table
+                with pg.cursor() as cur:
+                    cur.execute("update a set some_value = -some_value + %s", (j,))
+
+                with pg.cursor() as cur:
+                    # vacuuming should help to reuse keys, though it's not really important
+                    # with image_creation_threshold=1 which we will use on the last compaction
+                    cur.execute("vacuum")
+
+                wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+                if i == 1 and j == 2 and k == 1:
+                    # last iteration; stop before checkpoint to avoid leaving an in-memory layer
+                    pg.stop_and_destroy()
+
+                ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+            # images should not yet be created, because the threshold is too high,
+            # but these will be reshuffled to L1 layers
+            ps_http.timeline_compact(tenant_id, timeline_id)
+
+        for _ in range(0, 20):
+            # loop in case flushing is still in progress
+            layers = ps_http.layer_map_info(tenant_id, timeline_id)
+            if not layers.in_memory_layers:
+                break
+            time.sleep(0.2)
+
+        # now that we've grown some delta layers, turn them into image layers
+        tenant_update_config({"image_creation_threshold": "1"})
+        ps_http.timeline_compact(tenant_id, timeline_id)
+
+    # wait for all uploads to finish
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    # shut down safekeepers to avoid on-demand downloads from walreceiver
+    for sk in env.safekeepers:
+        sk.stop()
+
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    log.info("ensure the code above produced image and delta layers")
+    pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", pre_evict_info)
+    by_kind = pre_evict_info.kind_count()
+    log.info("by kind: %s", by_kind)
+    assert by_kind["Image"] > 0
+    assert by_kind["Delta"] > 0
+    assert by_kind["InMemory"] == 0
+    resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+    log.info("resident layers count before eviction: %s", len(resident_layers))
+
+    log.info("evict all layers")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    def ensure_resident_and_remote_size_metrics():
+        log.info("ensure that all the layers are gone")
+        resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+        # we have disabled all background loops, so this should hold
+        assert len(resident_layers) == 0
+
+        info = ps_http.layer_map_info(tenant_id, timeline_id)
+        log.info("layer map dump: %s", info)
+
+        log.info("ensure that resident_physical_size metric is zero")
+        resident_physical_size_metric = ps_http.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+        assert resident_physical_size_metric == 0
+        log.info("ensure that resident_physical_size metric corresponds to layer map dump")
+        assert resident_physical_size_metric == sum(
+            [layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
+        )
+
+        # TODO: the following would be nice to assert, but for some reason, the commented-out
+        # assert below fails with 113401856.0 != 140427264
+        # => https://github.com/neondatabase/neon/issues/3738
+        #
+        # log.info("ensure that remote_physical_size metric matches layer map")
+        # remote_physical_size_metric = ps_http.get_timeline_metric(
+        #     tenant_id, timeline_id, "pageserver_remote_physical_size"
+        # )
+        # log.info("ensure that remote_physical_size metric corresponds to layer map dump")
+        # assert remote_physical_size_metric == sum(
+        #     [layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote]
+        # )
+
+    log.info("before running GC, ensure that resident_physical_size is zero")
+    ensure_resident_and_remote_size_metrics()
+
+    log.info("run GC")
+    time.sleep(2)  # let pitr_interval + 1 second pass
+    ps_http.timeline_gc(tenant_id, timeline_id, 0)
+    time.sleep(1)
+    assert not env.pageserver.log_contains("Nothing to GC")
+
+    log.info("ensure GC deleted some layers, otherwise this test is pointless")
+    post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", post_gc_info)
+    log.info("by kind: %s", post_gc_info.kind_count())
+    pre_evict_layers = set([layer.layer_file_name for layer in pre_evict_info.historic_layers])
+    post_gc_layers = set([layer.layer_file_name for layer in post_gc_info.historic_layers])
+    assert post_gc_layers.issubset(pre_evict_layers)
+    assert len(post_gc_layers) < len(pre_evict_layers)
+
+    log.info("update_gc_info might download some layers. Evict them again.")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    log.info("after running GC, ensure that resident size is still zero")
+    ensure_resident_and_remote_size_metrics()