storage controller: background reconcile, graceful shutdown, better logging (#6709)

## Problem

Now that the storage controller is working end to end, we can start burning
down the remaining robustness issues.

## Summary of changes

- Add a background task that periodically calls `reconcile_all`. This
ensures that if earlier operations couldn't succeed (e.g. because a node
was unavailable), we will eventually retry. This naive initial
implementation can start an unlimited number of reconcile tasks;
limiting reconcile concurrency is a later item in #6342. A sketch of
this loop follows the list.
- Add tracing spans in key locations: each background task and each
reconciler task runs within its own span.
- Add a top-level CancellationToken and Gate, and use these to implement
a graceful shutdown that waits for tasks to finish (also shown in the
sketch below). This is not bulletproof yet, because these tasks make
remote HTTP calls that aren't wrapped in cancellation/timeouts, but it
creates the structure, and if we don't shut down promptly then k8s will
kill us anyway.
- To protect shard splits from background reconciliation, expose the `SplitState`
in memory and use it to guard any APIs that require an attached tenant
(see the second sketch below).
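
To make the first and third bullets concrete, here is a minimal sketch of what such a cancellable background loop and tracked shutdown can look like. It is illustrative rather than the storage controller's actual code: `Service`, the 20-second interval, and the `reconcile_all` signature are assumptions, and tokio-util's `TaskTracker` stands in for the Gate mentioned above.

```rust
// Minimal sketch (not the real storage controller code) of a periodic
// reconcile loop with cancellation and tracked shutdown.
// Assumed deps: tokio (macros, rt, time), tokio-util (rt feature), tracing.
use std::{sync::Arc, time::Duration};

use tokio_util::{sync::CancellationToken, task::TaskTracker};
use tracing::{info, instrument};

struct Service {
    cancel: CancellationToken,
    tasks: TaskTracker,
}

impl Service {
    /// Stand-in for the real `reconcile_all`: spawn a Reconciler for every
    /// shard whose observed state differs from its intended state, and
    /// return how many were spawned.
    fn reconcile_all(&self) -> usize {
        0
    }

    // Each background task gets its own tracing span, per the summary.
    #[instrument(skip_all)]
    async fn background_reconcile(self: Arc<Self>) {
        let mut ticker = tokio::time::interval(Duration::from_secs(20));
        loop {
            tokio::select! {
                _ = ticker.tick() => {
                    let spawned = self.reconcile_all();
                    info!("background reconcile spawned {spawned} tasks");
                }
                _ = self.cancel.cancelled() => {
                    info!("background reconcile exiting on shutdown");
                    return;
                }
            }
        }
    }

    async fn shutdown(&self) {
        // Ask every task to stop, then wait for all of them to drain.
        // This is only as graceful as the cancellation-awareness of the
        // I/O inside those tasks, which is exactly the caveat above.
        self.cancel.cancel();
        self.tasks.close();
        self.tasks.wait().await;
    }
}

#[tokio::main]
async fn main() {
    let service = Arc::new(Service {
        cancel: CancellationToken::new(),
        tasks: TaskTracker::new(),
    });
    service.tasks.spawn(Arc::clone(&service).background_reconcile());
    // ... serve the HTTP API here ...
    service.shutdown().await;
}
```

The important property is the shutdown ordering: cancel first so loops stop starting new work, then wait for in-flight tasks to drain.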
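
And a hypothetical sketch of the `SplitState` guard from the last bullet; the names (`TenantShard`, `ensure_not_splitting`, `ApiError::Conflict`) are invented for illustration and are not the controller's actual API:

```rust
// Hypothetical sketch of guarding tenant-scoped APIs during a shard split.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitState {
    Idle,
    Splitting,
}

struct TenantShard {
    split_state: SplitState,
}

#[derive(Debug)]
enum ApiError {
    Conflict(String),
}

impl TenantShard {
    /// Called at the top of any API that requires an attached tenant:
    /// while a split is in flight, background reconciliation and other
    /// mutating calls must leave the shard alone.
    fn ensure_not_splitting(&self) -> Result<(), ApiError> {
        match self.split_state {
            SplitState::Splitting => {
                Err(ApiError::Conflict("shard split in progress".to_string()))
            }
            SplitState::Idle => Ok(()),
        }
    }
}

fn main() {
    let shard = TenantShard { split_state: SplitState::Splitting };
    assert!(shard.ensure_not_splitting().is_err());
}
```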
Authored by John Spray · committed by GitHub · 2024-02-16 13:00:53 +00:00
commit f2e5212fed · parent 568bc1fde3 · 9 changed files with 370 additions and 156 deletions


```diff
@@ -20,6 +20,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PgBin,
     S3Scrubber,
     last_flush_lsn_upload,
@@ -62,7 +63,7 @@ def generate_uploads_and_deletions(
     tenant_id: Optional[TenantId] = None,
     timeline_id: Optional[TimelineId] = None,
     data: Optional[str] = None,
-    pageserver_id: Optional[int] = None,
+    pageserver: NeonPageserver,
 ):
     """
     Using the environment's default tenant + timeline, generate a load pattern
@@ -77,14 +78,16 @@ def generate_uploads_and_deletions(
         timeline_id = env.initial_timeline
         assert timeline_id is not None
 
-    ps_http = env.pageserver.http_client()
+    ps_http = pageserver.http_client()
 
     with env.endpoints.create_start(
-        "main", tenant_id=tenant_id, pageserver_id=pageserver_id
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
     ) as endpoint:
         if init:
             endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
 
         def churn(data):
             endpoint.safe_psql_many(
@@ -105,7 +108,9 @@ def generate_uploads_and_deletions(
             # We are waiting for uploads as well as local flush, in order to avoid leaving the system
             # in a state where there are "future layers" in remote storage that will generate deletions
             # after a restart.
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
             ps_http.timeline_checkpoint(tenant_id, timeline_id)
 
         # Compaction should generate some GC-elegible layers
@@ -205,7 +210,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.create_tenant(
         tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
     )
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     def parse_generation_suffix(key):
         m = re.match(".+-([0-9a-zA-Z]{8})$", key)
@@ -233,7 +238,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     # Starting without the override that disabled control_plane_api
     env.pageserver.start()
 
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
 
     legacy_objects: list[str] = []
     suffixed_objects = []
@@ -277,13 +282,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
+    neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    some_other_pageserver = 1234
+    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    main_pageserver = env.get_pageserver(attached_to_id)
+    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
 
-    ps_http = env.pageserver.http_client()
+    ps_http = main_pageserver.http_client()
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=main_pageserver)
 
     # Flush: pending deletions should all complete
     assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -296,14 +304,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
         assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
     assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
 
-    env.pageserver.allowed_errors.extend(
+    main_pageserver.allowed_errors.extend(
         [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
     )
 
     # Now advance the generation in the control plane: subsequent validations
    # from the running pageserver will fail. No more deletions should happen.
-    env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)
 
     assert_deletion_queue(ps_http, lambda n: n > 0)
     queue_depth_before = get_deletion_queue_depth(ps_http)
@@ -355,9 +363,14 @@ def test_deletion_queue_recovery(
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
+    neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    ps_http = env.pageserver.http_client()
+    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    main_pageserver = env.get_pageserver(attached_to_id)
+    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
+
+    ps_http = main_pageserver.http_client()
 
     failpoints = [
         # Prevent deletion lists from being executed, to build up some backlog of deletions
@@ -374,7 +387,7 @@ def test_deletion_queue_recovery(
 
     ps_http.configure_failpoints(failpoints)
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=main_pageserver)
 
     # There should be entries in the deletion queue
     assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -401,7 +414,7 @@ def test_deletion_queue_recovery(
     # also wait to see the header hit the disk: this seems paranoid but the race
     # can really happen on a heavily overloaded test machine.
     def assert_header_written():
-        assert (env.pageserver.workdir / "deletion" / "header-01").exists()
+        assert (main_pageserver.workdir / "deletion" / "header-01").exists()
 
     wait_until(20, 1, assert_header_written)
 
@@ -411,13 +424,13 @@ def test_deletion_queue_recovery(
     before_restart_depth = get_deletion_queue_validated(ps_http)
 
     log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
-    env.pageserver.stop(immediate=True)
+    main_pageserver.stop(immediate=True)
 
     if keep_attachment == KeepAttachment.LOSE:
-        some_other_pageserver = 101010
+        some_other_pageserver = other_pageserver.id
         env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
 
-    env.pageserver.start()
+    main_pageserver.start()
 
     def assert_deletions_submitted(n: int):
         assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
@@ -440,7 +453,7 @@ def test_deletion_queue_recovery(
         # validated before restart.
         assert get_deletion_queue_executed(ps_http) == before_restart_depth
     else:
-        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+        main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
 
         # If we lost the attachment, we should have dropped our pre-restart deletions.
         assert get_deletion_queue_dropped(ps_http) == before_restart_depth
@@ -449,8 +462,8 @@ def test_deletion_queue_recovery(
     assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
 
     # Restart again
-    env.pageserver.stop(immediate=True)
-    env.pageserver.start()
+    main_pageserver.stop(immediate=True)
+    main_pageserver.start()
 
     # No deletion lists should be recovered: this demonstrates that deletion lists
     # were cleaned up after being executed or dropped in the previous process lifetime.
@@ -469,7 +482,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
 
     ps_http = env.pageserver.http_client()
 
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     env.pageserver.allowed_errors.extend(
         [
@@ -486,7 +499,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     # Remember how many validations had happened before the control plane went offline
     validated = get_deletion_queue_validated(ps_http)
 
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
 
     # The running pageserver should stop progressing deletions
     time.sleep(10)
@@ -502,7 +515,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     )
 
     # The pageserver should provide service to clients
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
 
     # The pageserver should neither validate nor execute any deletions, it should have
     # loaded the DeletionLists from before though
@@ -523,7 +536,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start()
 
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
     ps_http.deletion_queue_flush(execute=True)
     assert get_deletion_queue_depth(ps_http) == 0
     assert get_deletion_queue_validated(ps_http) > 0
@@ -561,7 +574,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     read_all(env, tenant_id, timeline_id)
     evict_all_layers(env, tenant_id, timeline_id)
```