The walproposer pretends to be a walsender in many ways. It has a
WalSnd slot, and it claims to be a walsender by calling
MarkPostmasterChildWalSender(), etc. But one difference from real
walsenders was that the postmaster still treated it as a bgworker
rather than a walsender. The difference is that at shutdown,
walsenders are not killed until the very end, after the checkpointer
process has written the shutdown checkpoint and exited.
As a result, the walproposer always got killed before the shutdown
checkpoint was written, so the shutdown checkpoint never made it to
safekeepers. That's fine in principle; we don't require a clean
shutdown, after all. But it also feels a bit silly not to stream the
shutdown checkpoint. It could be useful for initializing hot standby
mode in a read replica, for example.
Change postmaster to treat background workers that have called
MarkPostmasterChildWalSender() as walsenders. That unfortunately
requires another small change in postgres core.
After doing that, walproposers stay alive longer. However, it also
means that at shutdown, the checkpointer sends the walproposer the
PROCSIG_WALSND_INIT_STOPPING signal and then waits for it to switch to
WALSNDSTATE_STOPPING state. We don't have the machinery in the
walproposer to receive and handle that signal reliably. Instead, we
always mark the walproposer as being in WALSNDSTATE_STOPPING.
In commit 568f91420a, I assumed that shutdown would wait for all the
remaining WAL to be streamed to safekeepers, but before this commit
that was not true, and the test became flaky. This should make it
stable again.
Some tests wrongly assumed that no WAL could have been written between
pg_current_wal_flush_lsn and a quick pg stop after it. Fix them by
introducing flush_ep_to_pageserver, which first stops the endpoint and
then waits until all committed WAL reaches the pageserver.
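As a minimal sketch of the before/after shape (using the fixture names
from the test below; the "before" half is a reconstruction, not a quote
from any particular test):

    # Before (racy): WAL can still be written between reading the flush
    # LSN and stopping the endpoint (e.g. the shutdown checkpoint that
    # is now streamed), so waiting for this LSN can miss committed WAL.
    lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
    endpoint.stop()
    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)

    # After: stop the endpoint first, then wait until all WAL committed
    # by the safekeepers has reached the pageserver.
    lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)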
In passing, extract the safekeeper http client into its own module.
import time

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
from fixtures.pageserver.types import (
    DeltaLayerFileName,
    ImageLayerFileName,
    is_future_layer,
)
from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
    wait_for_upload_queue_empty,
    wait_until_tenant_active,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import Lsn
from fixtures.utils import query_scalar, wait_until


def test_issue_5878(neon_env_builder: NeonEnvBuilder):
    """
    Regression test for issue https://github.com/neondatabase/neon/issues/5878 .

    Create a situation where IndexPart contains an image layer from the future
    (i.e., image layer LSN > IndexPart::disk_consistent_lsn).
    Detach.
    Attach.
    Wait for the tenant to finish load_layer_map (by waiting for it to become active).
    Wait for any remote timeline client ops that the attach started to finish.
    Integrity-check the index part.

    Before fixing the issue, load_layer_map would schedule removal of the future
    image layer. A compaction run could later re-create the image layer with
    the same file name, scheduling a PUT.
    Due to the lack of an upload queue barrier, the PUT and DELETE could be
    re-ordered. The result was an IndexPart referencing a non-existent object.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
    )

    ps_http = env.pageserver.http_client()

    l0_l1_threshold = 3
    image_creation_threshold = 1

    tenant_config = {
        "gc_period": "0s",  # disable GC (shouldn't matter for this test but still)
        "compaction_period": "0s",  # we want to control when compaction runs
        "checkpoint_timeout": "24h",  # something we won't reach
        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
        "image_creation_threshold": "100",  # we want to control when image is created
        "compaction_threshold": f"{l0_l1_threshold}",
        "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
    }

    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

    def get_index_part():
        assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
        ip_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id)
        return env.pagectl.dump_index_part(ip_path)

    def get_future_layers():
        ip = get_index_part()
        future_layers = [
            layer_file_name
            for layer_file_name in ip.layer_metadata.keys()
            if is_future_layer(layer_file_name, ip.disk_consistent_lsn)
        ]
        return future_layers

    assert len(get_future_layers()) == 0

    current = get_index_part()
    assert len(set(current.layer_metadata.keys())) == 1
    layer_file_name = list(current.layer_metadata.keys())[0]
    assert isinstance(layer_file_name, DeltaLayerFileName)
    assert layer_file_name.is_l0(), f"{layer_file_name}"

log.info("force image layer creation in the future by writing some data into in-memory layer")
|
|
|
|
# Create a number of layers in the tenant
|
|
with endpoint.cursor() as cur:
|
|
cur.execute("CREATE TABLE foo (t text)")
|
|
iters = l0_l1_threshold * image_creation_threshold
|
|
for i in range(0, iters):
|
|
cur.execute(
|
|
f"""
|
|
INSERT INTO foo
|
|
SELECT '{i}' || g
|
|
FROM generate_series(1, 10000) g
|
|
"""
|
|
)
|
|
last_record_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
|
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)
|
|
# 0..iters-1: create a stack of delta layers
|
|
# iters: leave a non-empty in-memory layer which we'll use for image layer generation
|
|
if i < iters - 1:
|
|
ps_http.timeline_checkpoint(tenant_id, timeline_id, force_repartition=True)
|
|
assert (
|
|
len(
|
|
[
|
|
layer
|
|
for layer in ps_http.layer_map_info(
|
|
tenant_id, timeline_id
|
|
).historic_layers
|
|
if layer.kind == "Image"
|
|
]
|
|
)
|
|
== 0
|
|
)
|
|
last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
|
|
|
|
    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)

    ip = get_index_part()
    assert len(ip.layer_metadata.keys())
    assert (
        ip.disk_consistent_lsn < last_record_lsn
    ), "sanity check for what the above loop is supposed to do"

    # create the image layer from the future
    ps_http.patch_tenant_config_client_side(
        tenant_id, {"image_creation_threshold": image_creation_threshold}, None
    )
    assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
    assert (
        len(
            [
                layer
                for layer in ps_http.layer_map_info(tenant_id, timeline_id).historic_layers
                if layer.kind == "Image"
            ]
        )
        == 1
    )
    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
    future_layers = get_future_layers()
    assert len(future_layers) == 1
    future_layer = future_layers[0]
    assert isinstance(future_layer, ImageLayerFileName)
    assert future_layer.lsn == last_record_lsn
    log.info(
        f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
    )
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str()
    )
    log.info(f"future layer path: {future_layer_path}")
    pre_stat = future_layer_path.stat()
    time.sleep(1.1)  # so that we can use a change in pre_stat.st_mtime to detect overwrites

    def get_generation_number():
        attachment = env.attachment_service.inspect(tenant_id)
        assert attachment is not None
        return attachment[0]

    # force removal of layers from the future
    tenant_conf = ps_http.tenant_config(tenant_id)
    generation_before_detach = get_generation_number()
    env.pageserver.tenant_detach(tenant_id)
    failpoint_name = "before-delete-layer-pausable"

    # Pause the DELETE that the re-attach will schedule for the future layer,
    # so that we can later race a PUT against it.
    ps_http.configure_failpoints((failpoint_name, "pause"))
    env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
    generation_after_reattach = get_generation_number()
    wait_until_tenant_active(ps_http, tenant_id)

    # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
    def future_layer_is_gone_from_index_part():
        future_layers = set(get_future_layers())
        assert future_layer not in future_layers

    wait_until(10, 0.5, future_layer_is_gone_from_index_part)

    # NB: the layer file is unlinked from the index part now, but, because we
    # paused the delete operation, the layer file itself is still in the remote storage
    wait_until(
        10,
        0.5,
        lambda: env.pageserver.assert_log_contains(
            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
        ),
    )
    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
    )
    log.info(f"future layer path: {future_layer_path}")
    assert future_layer_path.exists()

    # wait for re-ingestion of the WAL from safekeepers into the in-memory layer
    # (this happens in parallel to the above)
    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)

    # re-do image layer generation
    # This will produce the same image layer and queue an upload.
    # However, we still have the deletion for the layer queued, stuck on the failpoint.
    # An incorrect implementation would let the PUT execute before the DELETE.
    # The later code in this test asserts that this doesn't happen.
    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)

    # Let things sit for some time; a good implementation makes no progress because
    # we can't execute the PUT before the DELETE. A bad implementation would do that.
    max_race_opportunity_window = 4
    start = time.monotonic()
    while True:
        post_stat = future_layer_path.stat()
        assert (
            pre_stat.st_mtime == post_stat.st_mtime
        ), "observed PUT overtake the stuck DELETE => bug isn't fixed yet"
        if time.monotonic() - start > max_race_opportunity_window:
            log.info(
                "a correct implementation would never let the later PUT overtake the earlier DELETE"
            )
            break
        time.sleep(1)

    # The window has passed; unstick the delete and let the upload queue drain.
    log.info("unstick the DELETE")
    ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))

    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)

    # Examine the resulting S3 state.
    log.info("integrity-check the remote storage")
    ip = get_index_part()
    for layer_file_name, layer_metadata in ip.layer_metadata.items():
        log.info(f"Layer metadata {layer_file_name.to_str()}: {layer_metadata}")
        layer_path = env.pageserver_remote_storage.remote_layer_path(
            tenant_id, timeline_id, layer_file_name.to_str(), layer_metadata.generation
        )
        assert layer_path.exists(), f"{layer_file_name.to_str()}"

    log.info("assert that the overwritten layer won")
    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach
    )
    final_stat = future_layer_path.stat()
    log.info(f"future layer path: {future_layer_path}")
    assert final_stat.st_mtime != pre_stat.st_mtime