## Problem

1. During the rollout we got a panic: "timeline that we were deleting was concurrently removed from 'timelines' map", caused by the lock guard not being propagated to the background part of the deletion. The existing test didn't catch it because the failpoint used for verification was placed earlier, prior to spawning the background task.
2. While looking at the surrounding code, one more bug was detected: we removed the timeline from the map before the deletion was finished. This breaks client retry logic, because the API reports 404 before the actual deletion has completed, which can make the client stop its retry poll too early.

## Summary of changes

1. Carry the lock guard over to the background deletion. Ensure the existing test case fails without the patch applied (without it the second deletion becomes stuck, which eventually leads to a test failure).
2. Move the `delete_all` call earlier so that removing the timeline from the map is the last thing done during deletion.

Additionally, I've added `timeline_id` to the `update_gc_info` span, because `debug_assert_current_span_has_tenant_and_timeline_id` in `download_remote_layer` was firing when `update_gc_info` led to on-demand downloads via `find_lsn_for_timestamp` (caught by @problame). This is not directly related to the PR but fixes possible flakiness.

Another, smaller set of changes concerns the deletion wrapper used in the Python tests. There is now a simpler wrapper, `timeline_delete_wait_completed`, that waits for deletions to complete (a hedged sketch of such a wrapper follows below). Most of the tests in test_delete_timeline.py are negative tests, i.e. "does ps_http.timeline_delete() fail in this and that scenario"; those can be left alone. In the other places where we actually perform deletions, we need to use the helper that polls for completion.

Discussion: https://neondb.slack.com/archives/C03F5SM1N02/p1686668007396639

Resolves #4496

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
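For orientation, here is a minimal sketch of what such a "delete and wait" wrapper can look like, built only from fixture calls already visible in the test below (`timeline_delete`, `timeline_list`, `wait_until`, `TimelineId`). It is illustrative only; the real `timeline_delete_wait_completed` in `fixtures/pageserver/utils.py` may poll and handle errors differently, and the `_sketch` name is made up here.

```python
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until


def timeline_delete_wait_completed_sketch(ps_http, tenant_id: TenantId, timeline_id: TimelineId):
    # Kick off the deletion. With this PR the timeline stays in the pageserver's
    # 'timelines' map until the background deletion has actually finished, so the
    # timeline disappearing from the listing is a reliable completion signal.
    ps_http.timeline_delete(tenant_id, timeline_id)

    def timeline_is_gone():
        remaining = [TimelineId(t["timeline_id"]) for t in ps_http.timeline_list(tenant_id)]
        assert timeline_id not in remaining, "timeline still present, deletion not finished"

    # wait_until(iterations, interval, fn) from fixtures.utils retries fn until it
    # stops raising, matching how it is used at the end of the test below.
    wait_until(20, 0.5, timeline_is_gone)
```

Because removal from the map is now the last step of deletion, a poll like this only succeeds once the deletion has truly completed, which is exactly the retry behaviour the client-side wrappers rely on.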
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
    wait_until_tenant_active,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until


def get_only_element(l):  # noqa: E741
    assert len(l) == 1
    return l[0]


# Test that gc and compaction tenant tasks start and stop correctly
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
    name = "test_tenant_tasks"
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

    def get_state(tenant):
        all_states = client.tenant_list()
        matching = [t for t in all_states if TenantId(t["id"]) == tenant]
        return get_only_element(matching)["state"]

    def delete_all_timelines(tenant: TenantId):
        timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
        for t in timelines:
            timeline_delete_wait_completed(client, tenant, t)

    # Create tenant, start compute
    tenant, _ = env.neon_cli.create_tenant()
    env.neon_cli.create_timeline(name, tenant_id=tenant)
    endpoint = env.endpoints.create_start(name, tenant_id=tenant)
    assert_tenant_state(
        client,
        tenant,
        expected_state="Active",
        message="Pageserver should activate a tenant and start background jobs if timelines are loaded",
    )

    # Stop compute
    endpoint.stop()

    # Delete all timelines on all tenants.
    #
    # FIXME: we used to check that the background jobs are stopped when all timelines
    # are removed, but we don't stop them anymore. Not sure if this test still makes sense
    # or we should just remove it.
    for tenant_info in client.tenant_list():
        tenant_id = TenantId(tenant_info["id"])
        delete_all_timelines(tenant_id)
        wait_until_tenant_active(client, tenant_id, iterations=10, period=0.2)

    # Assert that all tasks finish quickly after tenant is detached
    task_starts = client.get_metric_value("pageserver_tenant_task_events_total", {"event": "start"})
    assert task_starts is not None
    assert int(task_starts) > 0
    client.tenant_detach(tenant)
    client.tenant_detach(env.initial_tenant)

    def assert_tasks_finish():
        tasks_started = client.get_metric_value(
            "pageserver_tenant_task_events_total", {"event": "start"}
        )
        tasks_ended = client.get_metric_value(
            "pageserver_tenant_task_events_total", {"event": "stop"}
        )
        tasks_panicked = client.get_metric_value(
            "pageserver_tenant_task_events_total", {"event": "panic"}
        )
        log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
        assert tasks_started == tasks_ended
        assert tasks_panicked is None or int(tasks_panicked) == 0

    wait_until(10, 0.2, assert_tasks_finish)