mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 05:20:38 +00:00
Instead of spawning helper threads, we now use Tokio tasks. There are multiple Tokio runtimes, for different kinds of tasks. One for serving libpq client connections, another for background operations like GC and compaction, and so on. That's not strictly required, we could use just one runtime, but with this you can still get an overview of what's happening with "top -H". There's one subtle behavior in how TenantState is updated. Before this patch, if you deleted all timelines from a tenant, its GC and compaction loops were stopped, and the tenant went back to Idle state. We no longer do that. The empty tenant stays Active. The changes to test_tenant_tasks.py are related to that. There's still plenty of synchronous code and blocking. For example, we still use blocking std::io functions for all file I/O, and the communication with WAL redo processes still uses low-level unix poll(). We might want to rewrite those later, but this will do for now. The model is that local file I/O is considered to be fast enough that blocking - and preventing other tasks running in the same thread - is acceptable.
71 lines
2.8 KiB
Python
71 lines
2.8 KiB
Python
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnvBuilder, wait_until
|
|
from fixtures.types import ZTenantId, ZTimelineId
|
|
|
|
|
|
def get_only_element(l):  # noqa: E741
    """Return the sole item of ``l``.

    Fails with an ``AssertionError`` unless ``l`` holds exactly one element.
    """
    size = len(l)
    assert size == 1
    return l[0]
|
|
|
|
|
|
# Test that gc and compaction tenant tasks start and stop correctly
|
|
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
    """Check that per-tenant background tasks stop once tenants are detached.

    The test creates a tenant with one timeline, starts and stops a compute
    node on it, deletes every timeline of every tenant, detaches the tenants,
    and then polls the pageserver metrics until the number of started tasks
    equals the number of stopped tasks and no task has panicked.
    """
    # The gc and compaction loops don't bother to watch for tenant state
    # changes while sleeping, so we use small periods to make this test
    # run faster. With default settings we'd have to wait longer for tasks
    # to notice state changes and shut down.
    # TODO fix this behavior in the pageserver
    tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}"
    neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}"
    name = "test_tenant_tasks"
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

    def get_state(tenant):
        """Return the "state" field the pageserver reports for ``tenant``."""
        all_states = client.tenant_list()
        matching = [t for t in all_states if ZTenantId(t["id"]) == tenant]
        return get_only_element(matching)["state"]

    def get_metric_value(metric_name):
        """Return the integer value of the metric line starting with ``metric_name``.

        Returns 0 when no line of the metrics dump starts with that name.
        """
        metrics = client.get_metrics()
        relevant = [line for line in metrics.splitlines() if line.startswith(metric_name)]
        if not relevant:
            return 0
        line = get_only_element(relevant)
        # Drop the prefix by length. str.lstrip() would treat metric_name as a
        # set of characters to strip, not as a literal prefix, and could eat
        # into the value itself.
        value = line[len(metric_name) :].strip()
        return int(value)

    def delete_all_timelines(tenant: ZTenantId):
        """Delete every timeline the pageserver lists for ``tenant``."""
        timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
        for t in timelines:
            client.timeline_delete(tenant, t)

    # Create tenant, start compute
    tenant, _ = env.neon_cli.create_tenant()
    env.neon_cli.create_timeline(name, tenant_id=tenant)
    pg = env.postgres.create_start(name, tenant_id=tenant)
    assert get_state(tenant) == "Active"

    # Stop compute
    pg.stop()

    # Delete all timelines on all tenants
    for tenant_info in client.tenant_list():
        tenant_id = ZTenantId(tenant_info["id"])
        delete_all_timelines(tenant_id)

    # Assert that all tasks finish quickly after tenant is detached
    assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0
    client.tenant_detach(tenant)
    client.tenant_detach(env.initial_tenant)

    def assert_tasks_finish():
        """Assert that every started task has stopped and none panicked."""
        tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
        tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
        tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
        log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
        assert tasks_started == tasks_ended
        assert tasks_panicked == 0

    # Poll via the wait_until fixture helper, since the background loops only
    # notice the detach after their sleep period.
    wait_until(10, 0.2, assert_tasks_finish)
|