Files
neon/test_runner/regress/test_layer_eviction.py
Heikki Linnakangas 74d09b78c7 Keep walproposer alive until shutdown checkpoint is safe on safekepeers
The walproposer pretends to be a walsender in many ways. It has a
WalSnd slot, it claims to be a walsender by calling
MarkPostmasterChildWalSender() etc. But one different to real
walsenders was that the postmaster still treated it as a bgworker
rather than a walsender. The difference is that at shutdown,
walsenders are not killed until the very end, after the checkpointer
process has written the shutdown checkpoint and exited.

As a result, the walproposer always got killed before the shutdown
checkpoint was written, so the shutdown checkpoint never made it to
safekeepers. That's fine in principle, we don't require a clean
shutdown after all. But it also feels a bit silly not to stream the
shutdown checkpoint. It could be useful for initializing hot standby
mode in a read replica, for example.

Change postmaster to treat background workers that have called
MarkPostmasterChildWalSender() as walsenders. That unfortunately
requires another small change in postgres core.

After doing that, walproposers stay alive longer. However, it also
means that the checkpointer will wait for the walproposer to switch to
WALSNDSTATE_STOPPING state, when the checkpointer sends the
PROCSIG_WALSND_INIT_STOPPING signal. We don't have the machinery in
walproposer to receive and handle that signal reliably. Instead, we
mark walproposer as being in WALSNDSTATE_STOPPING always.

In commit 568f91420a, I assumed that shutdown will wait for all the
remaining WAL to be streamed to safekeepers, but before this commit
that was not true, and the test became flaky. This should make it
stable again.

Some tests wrongly assumed that no WAL could have been written between
pg_current_wal_flush_lsn and quick pg stop after it. Fix them by introducing
flush_ep_to_pageserver which first stops the endpoint and then waits till all
committed WAL reaches the pageserver.

In passing extract safekeeper http client to its own module.
2024-03-11 23:29:32 +04:00

303 lines
13 KiB
Python

import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
flush_ep_to_pageserver,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
# Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
# and then download them back.
def test_basic_eviction(
neon_env_builder: NeonEnvBuilder,
build_type: str,
):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(
initial_tenant_conf={
# disable gc and compaction background loops because they perform on-demand downloads
"gc_period": "0s",
"compaction_period": "0s",
}
)
client = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Create a number of layers in the tenant
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 5000000) g
"""
)
# stops the endpoint
current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
client.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
# stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
# has already been stopped by flush_ep_to_pageserver
for sk in env.safekeepers:
sk.stop()
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
assert (
len(initial_local_layers) > 1
), f"Should create multiple layers for timeline, but got {initial_local_layers}"
# Compare layer map dump with the local layers, ensure everything's present locally and matches
initial_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
assert (
not initial_layer_map_info.in_memory_layers
), "Should have no in memory layers after flushing"
assert len(initial_local_layers) == len(
initial_layer_map_info.historic_layers
), "Should have the same layers in memory and on disk"
for returned_layer in initial_layer_map_info.historic_layers:
assert (
returned_layer.kind == "Delta"
), f"Did not create and expect image layers, but got {returned_layer}"
assert (
not returned_layer.remote
), f"All created layers should be present locally, but got {returned_layer}"
local_layers = list(
filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers)
)
assert (
len(local_layers) == 1
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"
local_layer = local_layers[0]
assert (
returned_layer.layer_file_size == local_layer.stat().st_size
), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}"
# Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map
for local_layer in initial_local_layers:
client.evict_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
)
assert not any(
new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*")
), f"Did not expect to find {local_layer} layer after evicting"
empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
assert (
not evicted_layer_map_info.in_memory_layers
), "Should have no in memory layers after flushing and evicting"
assert len(initial_local_layers) == len(
evicted_layer_map_info.historic_layers
), "Should have the same layers in memory and on disk initially"
for returned_layer in evicted_layer_map_info.historic_layers:
assert (
returned_layer.kind == "Delta"
), f"Did not create and expect image layers, but got {returned_layer}"
assert (
returned_layer.remote
), f"All layers should be evicted and not present locally, but got {returned_layer}"
assert any(
local_layer.name == returned_layer.layer_file_name
for local_layer in initial_local_layers
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"
# redownload all evicted layers and ensure the initial state is restored
for local_layer in initial_local_layers:
client.download_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
)
client.timeline_download_remote_layers(
tenant_id,
timeline_id,
# allow some concurrency to unveil potential concurrency bugs
max_concurrent_downloads=10,
errors_ok=False,
at_least_one_download=False,
)
redownloaded_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
assert (
redownloaded_layers == initial_local_layers
), "Should have the same layers locally after redownloading the evicted layers"
redownloaded_layer_map_info = client.layer_map_info(
tenant_id=tenant_id, timeline_id=timeline_id
)
assert (
redownloaded_layer_map_info == initial_layer_map_info
), "Should have the same layer map after redownloading the evicted layers"
def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
tenant_config = {
"pitr_interval": "1s", # set to non-zero, so GC actually does something
"gc_period": "0s", # we want to control when GC runs
"compaction_period": "0s", # we want to control when compaction runs
"checkpoint_timeout": "24h", # something we won't reach
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually
"compaction_threshold": "3",
# "image_creation_threshold": set at runtime
"compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
}
def tenant_update_config(changes):
tenant_config.update(changes)
env.neon_cli.config_tenant(tenant_id, tenant_config)
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
log.info("tenant id is %s", tenant_id)
env.initial_tenant = tenant_id # update_and_gc relies on this
ps_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
log.info("fill with data, creating delta & image layers, some of which are GC'able after")
# no particular reason to create the layers like this, but we are sure
# not to hit the image_creation_threshold here.
with endpoint.cursor() as cur:
cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Create delta layers, then turn them into image layers.
# Do it multiple times so that there's something to GC.
for k in range(0, 2):
# produce delta layers => disable image layer creation by setting high threshold
tenant_update_config({"image_creation_threshold": "100"})
for i in range(0, 2):
for j in range(0, 3):
# create a minimal amount of "delta difficulty" for this table
with endpoint.cursor() as cur:
cur.execute("update a set some_value = -some_value + %s", (j,))
with endpoint.cursor() as cur:
# vacuuming should aid to reuse keys, though it's not really important
# with image_creation_threshold=1 which we will use on the last compaction
cur.execute("vacuum")
last_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
if i == 1 and j == 2 and k == 1:
# last iteration; stop before checkpoint to avoid leaving an inmemory layer
endpoint.stop_and_destroy()
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# images should not yet be created, because threshold is too high,
# but these will be reshuffled to L1 layers
ps_http.timeline_compact(tenant_id, timeline_id)
for _ in range(0, 20):
# loop in case flushing is still in progress
layers = ps_http.layer_map_info(tenant_id, timeline_id)
if not layers.in_memory_layers:
break
time.sleep(0.2)
# now that we've grown some delta layers, turn them into image layers
tenant_update_config({"image_creation_threshold": "1"})
ps_http.timeline_compact(tenant_id, timeline_id)
# wait for all uploads to finish (checkpoint has been done above)
wait_for_upload(ps_http, tenant_id, timeline_id, last_lsn)
# shutdown safekeepers to avoid on-demand downloads from walreceiver
for sk in env.safekeepers:
sk.stop()
ps_http.timeline_checkpoint(tenant_id, timeline_id)
log.info("ensure the code above produced image and delta layers")
pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", pre_evict_info)
by_kind = pre_evict_info.kind_count()
log.info("by kind: %s", by_kind)
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))
log.info("evict all layers")
ps_http.evict_all_layers(tenant_id, timeline_id)
def ensure_resident_and_remote_size_metrics():
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0, "ensure that all the layers are gone"
info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", info)
resident_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)
assert (
resident_physical_size_metric == 0
), "ensure that resident_physical_size metric is zero"
assert resident_physical_size_metric == sum(
layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
), "ensure that resident_physical_size metric corresponds to layer map dump"
remote_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_remote_physical_size"
)
assert remote_physical_size_metric == sum(
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
), "ensure that remote_physical_size metric corresponds to layer map dump"
log.info("before runnning GC, ensure that remote_physical size is zero")
# leaving index_part.json upload from successful compaction out will show
# up here as a mismatch between remove_physical_size and summed up layermap
# size
ensure_resident_and_remote_size_metrics()
log.info("run GC")
time.sleep(2) # let pitr_interval + 1 second pass
ps_http.timeline_gc(tenant_id, timeline_id, 0)
time.sleep(1)
assert not env.pageserver.log_contains("Nothing to GC")
log.info("ensure GC deleted some layers, otherwise this test is pointless")
post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", post_gc_info)
log.info("by kind: %s", post_gc_info.kind_count())
pre_evict_layers = set([layer.layer_file_name for layer in pre_evict_info.historic_layers])
post_gc_layers = set([layer.layer_file_name for layer in post_gc_info.historic_layers])
assert post_gc_layers.issubset(pre_evict_layers)
assert len(post_gc_layers) < len(pre_evict_layers)
log.info("update_gc_info might download some layers. Evict them again.")
ps_http.evict_all_layers(tenant_id, timeline_id)
log.info("after running GC, ensure that resident size is still zero")
ensure_resident_and_remote_size_metrics()