timeline_detach_ancestor: adjust error handling (#8528)

With the additional phases introduced in #8430, the single
`detach_ancestor::Error` type became untenable. Split it up by phase, and
introduce laundering for the remaining `anyhow::Error` values so that they
most often propagate as `Error::ShuttingDown`.

Additionally, complete FIXMEs.

Cc: #6994
Author: Joonas Koivunen
Date: 2024-08-14 12:16:18 +03:00
Committed by: GitHub
Parent: 4049d2b7e1
Commit: 485d76ac62
8 changed files with 347 additions and 204 deletions
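
As a rough illustration of the laundering idea described above (the `Error` enum, variant names, and the `launder` helper here are illustrative assumptions, not the actual pageserver code):

    // Sketch only: leftover anyhow::Error values are "laundered" into a
    // typed error. While the tenant/timeline is shutting down, a stray
    // error is most likely cancellation fallout, so it is mapped to
    // Error::ShuttingDown; anything else stays wrapped for a 500-style
    // response.
    #[derive(Debug)]
    enum Error {
        ShuttingDown,
        Other(anyhow::Error),
    }

    fn launder(err: anyhow::Error, shutting_down: bool) -> Error {
        if shutting_down {
            Error::ShuttingDown
        } else {
            Error::Other(err)
        }
    }

Consistent with that split, the tests below now expect retryable conditions to surface as 503 and a deleted timeline as 404, rather than a generic 500.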


@@ -97,7 +97,7 @@ def test_ancestor_detach_branched_from(
client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
- wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+ flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline)
deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers()
# there is also the in-mem layer, but ignore it for now
@@ -452,6 +452,9 @@ def test_compaction_induced_by_detaches_in_history(
}
)
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+ env.pageserver.allowed_errors.append(
+ ".*await_initial_logical_size: can't get semaphore cancel token, skipping"
+ )
client = env.pageserver.http_client()
def delta_layers(timeline_id: TimelineId):
@@ -524,6 +527,7 @@ def test_compaction_induced_by_detaches_in_history(
assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1
skip_main = branches[1:]
branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"]
# take the fullbackup before and after inheriting the new L0s
@@ -532,6 +536,13 @@ def test_compaction_induced_by_detaches_in_history(
env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before
)
+ # force initial logical sizes, so we can evict all layers from all
+ # timelines and exercise on-demand download for copy lsn prefix
+ client.timeline_detail(
+ env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True
+ )
+ client.evict_all_layers(env.initial_tenant, env.initial_timeline)
for _, timeline_id in skip_main:
reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
assert reparented == set(), "we have no earlier branches at any level"
@@ -705,7 +716,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
log.info(f"stuck pageserver is id={stuck.id}")
stuck_http = stuck.http_client()
stuck_http.configure_failpoints(
("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause")
("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause")
)
restarted = pageservers[int(shards[1]["node_id"])]
@@ -716,7 +727,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
restarted_http = restarted.http_client()
restarted_http.configure_failpoints(
[
("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"),
("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"),
]
)
@@ -734,7 +745,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1)
stuck_http.configure_failpoints(
("timeline-detach-ancestor::before_starting_after_locking_pausable", "off")
("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
)
barrier = threading.Barrier(2)
@@ -753,7 +764,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
# we have 10s, let's use half of that to help the shutdown start
time.sleep(5)
restarted_http.configure_failpoints(
("timeline-detach-ancestor::before_starting_after_locking_pausable", "off")
("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
)
fut.result()
@@ -806,7 +817,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
after starting the detach.
What remains not tested by this:
- - shutdown winning over complete
+ - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes
"""
if sharded and mode == "delete_tenant":
@@ -833,7 +844,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
detached_timeline = env.neon_cli.create_branch("detached soon", "main")
pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
env.storage_controller.reconcile_until_idle()
shards = env.storage_controller.locate(env.initial_tenant)
@@ -931,7 +942,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
_, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
- @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"])
+ @pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"])
def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str):
"""
Technically possible storage controller concurrent interleaving timeline
@@ -943,12 +954,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
must be detached.
"""
- assert (
- mode == "delete_reparentable_timeline"
- ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking"
- # perhaps it could be supported by always doing this for the shard0 first, and after that for others.
- # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented.
shard_count = 2
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
@@ -980,14 +985,21 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]:
ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline)
- first_branch = env.neon_cli.create_branch(
- "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn
- )
+ def create_reparentable_timeline() -> TimelineId:
+ return env.neon_cli.create_branch(
+ "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn
+ )
+ if mode == "delete_reparentable_timeline":
+ first_branch = create_reparentable_timeline()
+ else:
+ first_branch = None
detached_branch = env.neon_cli.create_branch(
"detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn
)
pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable"
pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
stuck = pageservers[int(shards[0]["node_id"])]
stuck_http = stuck.http_client().without_status_retrying()
@@ -999,12 +1011,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
(pausepoint, "pause"),
)
- # noticed a surprising 409 if the other one would fail instead
- # victim_http.configure_failpoints([
- # (pausepoint, "pause"),
- # ("timeline-detach-ancestor::before_starting_after_locking", "return"),
- # ])
+ # interleaving a create_timeline which could be reparented will produce two
+ # permanently different reparentings: one node has reparented, the other has
+ # not
@@ -1023,6 +1029,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
assert detail.get("ancestor_lsn") is None
def first_branch_gone():
+ assert first_branch is not None
try:
env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, first_branch
@@ -1043,42 +1050,178 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
stuck_http.configure_failpoints((pausepoint, "off"))
wait_until(10, 1.0, first_completed)
- # if we would let victim fail, for some reason there'd be a 409 response instead of 500
- # victim_http.configure_failpoints((pausepoint, "off"))
- # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc:
- # fut.result()
- # assert exc.value.status_code == 409
- env.storage_controller.pageserver_api().timeline_delete(
- env.initial_tenant, first_branch
- )
- victim_http.configure_failpoints((pausepoint, "off"))
- wait_until(10, 1.0, first_branch_gone)
+ if mode == "delete_reparentable_timeline":
+ assert first_branch is not None
+ env.storage_controller.pageserver_api().timeline_delete(
+ env.initial_tenant, first_branch
+ )
+ victim_http.configure_failpoints((pausepoint, "off"))
+ wait_until(10, 1.0, first_branch_gone)
+ elif mode == "create_reparentable_timeline":
+ first_branch = create_reparentable_timeline()
+ victim_http.configure_failpoints((pausepoint, "off"))
+ else:
+ raise RuntimeError(f"{mode}")
# it now passes, and we should get an error message about mixed reparenting as the stuck node still had something to reparent
- fut.result()
+ mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required."
+ with pytest.raises(PageserverApiException, match=mixed_results):
+ fut.result()
msg, offset = env.storage_controller.assert_log_contains(
".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*"
)
log.info(f"expected error message: {msg}")
env.storage_controller.allowed_errors.append(
".*: shards returned different results matching=0 .*"
log.info(f"expected error message: {msg.rstrip()}")
env.storage_controller.allowed_errors.extend(
[
".*: shards returned different results matching=0 .*",
f".*: InternalServerError\\({mixed_results}",
]
)
- detach_timeline()
+ if mode == "create_reparentable_timeline":
+ with pytest.raises(PageserverApiException, match=mixed_results):
+ detach_timeline()
+ else:
+ # it is a bit of a shame to flag it and then it succeeds, but most
+ # likely there would be a retry loop which would take care of
+ # this in cplane
+ detach_timeline()
- # FIXME: perhaps the above should be automatically retried, if we get mixed results?
- not_found = env.storage_controller.log_contains(
+ retried = env.storage_controller.log_contains(
".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
- offset=offset,
+ offset,
)
- assert not_found is None
+ if mode == "delete_reparentable_timeline":
+ assert (
+ retried is None
+ ), "detaching should have converged after both nodes saw the deletion"
+ elif mode == "create_reparentable_timeline":
+ assert retried is not None, "detaching should not have converged"
+ _, offset = retried
finally:
stuck_http.configure_failpoints((pausepoint, "off"))
victim_http.configure_failpoints((pausepoint, "off"))
+ if mode == "create_reparentable_timeline":
+ assert first_branch is not None
+ # now we have mixed ancestry
+ assert (
+ TimelineId(
+ stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[
+ "ancestor_timeline_id"
+ ]
+ )
+ == env.initial_timeline
+ )
+ assert (
+ TimelineId(
+ victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+ "ancestor_timeline_id"
+ ]
+ )
+ == detached_branch
+ )
+ # make sure we are still able to repair this by detaching the ancestor via the storage controller, in case this ever happens
+ # if the ancestor were deleted, we would partially fail, making the deletion stuck.
+ env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch)
+ # and we should now have good results
+ not_found = env.storage_controller.log_contains(
+ ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
+ offset,
+ )
+ assert not_found is None
+ assert (
+ stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"]
+ is None
+ )
+ assert (
+ victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+ "ancestor_timeline_id"
+ ]
+ is None
+ )
+ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor(
+ neon_env_builder: NeonEnvBuilder,
+ ):
+ shard_count = 2
+ neon_env_builder.num_pageservers = shard_count
+ env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+ for ps in env.pageservers:
+ ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+ pageservers = dict((int(p.id), p) for p in env.pageservers)
+ env.storage_controller.reconcile_until_idle()
+ shards = env.storage_controller.locate(env.initial_tenant)
+ assert len(set(x["node_id"] for x in shards)) == shard_count
+ detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main")
+ pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
+ failpoint = "timeline-detach-ancestor::before_starting_after_locking"
+ stuck = pageservers[int(shards[0]["node_id"])]
+ stuck_http = stuck.http_client().without_status_retrying()
+ stuck_http.configure_failpoints(
+ (pausepoint, "pause"),
+ )
+ env.storage_controller.allowed_errors.append(
+ f".*Error processing HTTP request: .* failpoint: {failpoint}"
+ )
+ http = env.storage_controller.pageserver_api()
+ victim = pageservers[int(shards[-1]["node_id"])]
+ victim.allowed_errors.append(
+ f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}"
+ )
+ victim_http = victim.http_client().without_status_retrying()
+ victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")])
+ def detach_timeline():
+ http.detach_ancestor(env.initial_tenant, detached_branch)
+ def paused_at_failpoint():
+ stuck.assert_log_contains(f"at failpoint {pausepoint}")
+ victim.assert_log_contains(f"at failpoint {pausepoint}")
+ def first_completed():
+ detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch)
+ log.info(detail)
+ assert detail.get("ancestor_lsn") is None
+ with ThreadPoolExecutor(max_workers=1) as pool:
+ try:
+ fut = pool.submit(detach_timeline)
+ wait_until(10, 1.0, paused_at_failpoint)
+ # let stuck complete
+ stuck_http.configure_failpoints((pausepoint, "off"))
+ wait_until(10, 1.0, first_completed)
+ victim_http.configure_failpoints((pausepoint, "off"))
+ with pytest.raises(
+ PageserverApiException,
+ match=f".*failpoint: {failpoint}",
+ ) as exc:
+ fut.result()
+ assert exc.value.status_code == 500
+ finally:
+ stuck_http.configure_failpoints((pausepoint, "off"))
+ victim_http.configure_failpoints((pausepoint, "off"))
+ victim_http.configure_failpoints((failpoint, "off"))
+ detach_timeline()
def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder):
"""
@@ -1169,7 +1312,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
match=".*failed to reparent all candidate timelines, please retry",
) as exc:
http.detach_ancestor(env.initial_tenant, detached)
- assert exc.value.status_code == 500
+ assert exc.value.status_code == 503
# first round -- do more checking to make sure the gc gets paused
try_detach()
@@ -1323,14 +1466,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
http.configure_failpoints((failpoint, "off"))
- with pytest.raises(PageserverApiException) as exc:
+ with pytest.raises(
+ PageserverApiException, match="NotFound: Timeline .* was not found"
+ ) as exc:
detach.result()
- # FIXME: this should be 404 but because there is another Anyhow conversion it is 500
- assert exc.value.status_code == 500
- env.pageserver.allowed_errors.append(
- ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart"
- )
+ assert exc.value.status_code == 404
finally:
http.configure_failpoints((failpoint, "off"))