Problem
-------

Tests that directly call the Pageserver Management API to set tenant config
are flaky if the Pageserver is managed by Storcon: Storcon is the source of
truth and may (theoretically) reconcile a tenant at any time.

Solution
--------

Switch all users of `set_tenant_config`/`patch_tenant_config_client_side`
to use `env.storage_controller.pageserver_api()`.

Future Work
-----------

Prevent regressions from creeping in, and generally clean up tenant
configuration. Maybe we can avoid the Pageserver having a default tenant
config at all and put the default into Storcon instead?

* => https://github.com/neondatabase/neon/issues/9621

Refs
----

fixes https://github.com/neondatabase/neon/issues/9522
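For illustration, a minimal sketch of the pattern this change applies across
the tests (fixture names as used in the Neon test suite; the "before" call
site is a hypothetical example, actual call sites vary):

    # before: patch the pageserver directly; Storcon may reconcile the
    # tenant at any time and silently undo this change
    env.pageserver.http_client().patch_tenant_config_client_side(
        tenant_id, {"compaction_threshold": 3}
    )

    # after: go through Storcon, the source of truth for tenant config
    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
        tenant_id, {"compaction_threshold": 3}
    )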
113 lines
4.0 KiB
Python
from __future__ import annotations

import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.pageserver.common_types import ImageLayerName, parse_layer_file_name
from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
    wait_until_tenant_active,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from requests.exceptions import ConnectionError


def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md.

    Simulate a crash after compaction has written layers to disk
    but before they have been uploaded/linked into the remote index_part.json.

    Startup handles this situation by deleting the not-yet-uploaded L1 layer files.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

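    # The huge compaction_threshold disables automatic compaction during the
    # pgbench data load below; the test later lowers it to 3 so that a manual
    # checkpoint triggers exactly the L0 -> L1 compaction it wants to crash.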
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "checkpoint_distance": f"{10 * 1024**2}",
            "compaction_period": "0 s",
            "compaction_threshold": "999999",
        }
    )
    pageserver_http = env.pageserver.http_client()

    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline

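    # This failpoint makes the pageserver process exit as soon as compaction
    # has written its first L1 layer to local disk, i.e. before the layer is
    # uploaded or linked into the remote index_part.json.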
    pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit"))

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    connstr = endpoint.connstr(options="-csynchronous_commit=off")
    pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])

    lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    # Make sure we receive no new WAL after this, so that the post-restart
    # compaction writes over the same L1 file.
    endpoint.stop()
    for sk in env.safekeepers:
        sk.stop()

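    # Route the tenant config change through the storage controller, the
    # source of truth for tenant config; patching the pageserver directly
    # could be undone by a Storcon reconciliation at any time.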
    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
        tenant_id, {"compaction_threshold": 3}
    )
    # The checkpoint triggers compaction, which hits the exit failpoint.
    with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
    env.pageserver.stop()

    # now the duplicate L1 has been created, but is not yet uploaded
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

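    # Find the single L1 delta layer that the crashed compaction left behind.
    # Layer file names encode "{key_start}-{key_end}__{lsn_range}": image
    # layers carry a single LSN (no "-" in the LSN part), and L0 deltas cover
    # the full key range 00..00 to FF..FF, so everything else is an L1.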
    l1_found = None
    for path in env.pageserver.list_layers(tenant_id, timeline_id):
        [key_range, lsn_range] = path.name.split("__", maxsplit=1)

        if "-" not in lsn_range:
            # image layer
            continue

        [key_start, key_end] = key_range.split("-", maxsplit=1)

        if key_start == "0" * 36 and key_end == "F" * 36:
            # L0
            continue

        candidate = parse_layer_file_name(path.name)

        if isinstance(candidate, ImageLayerName):
            continue

        if l1_found is not None:
            raise RuntimeError(f"found multiple L1: {l1_found.to_str()} and {path.name}")

        l1_found = candidate

    assert l1_found is not None, "failed to find L1 locally"

    uploaded = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, l1_found.to_str()
    )
    assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"

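    # On startup, the pageserver reconciles local layer files against the
    # remote index_part.json and deletes local-only files (see the RFC in the
    # docstring), so the crashed compaction's L1 must be gone once the tenant
    # is active again.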
    env.pageserver.start()
    wait_until_tenant_active(pageserver_http, tenant_id)

    assert not env.pageserver.layer_exists(
        tenant_id, timeline_id, l1_found
    ), "partial compaction result should have been removed during startup"

    # wait for us to catch up again
    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)

    pageserver_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)

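    # Compaction is deterministic here (no new WAL was ingested since the
    # crash), so it produces a layer with the same file name as before; this
    # time the upload completes.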
    assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"

    uploaded = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, l1_found.to_str()
    )
    assert uploaded.exists(), "the L1 is uploaded"


# TODO: same test for L0s produced by ingest.