fix(test): use layer map dump in test_readonly_node_gc to validate layers protected by leases (#9551)

Fixes #9518.

## Problem

After removing the assertion `layers_removed == 0` in #9506, we could
miss breakage if we rely solely on the successful execution of the
`SELECT` query to check that the lease is properly protecting layers.
Details are listed in #9518.

Also, in integration tests we sometimes run into a race condition where
a getpage request arrives before the lease gets renewed (item 2 of
#8817), even though compute_ctl sends a lease renewal as soon as it
sees a `/configure` API call that updates `pageserver_connstr`. In this
case we observe a getpage error stating that we `tried to request a
page version that was garbage collected` (as seen in this [Allure
Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8613/11550393107/index.html#suites/3ccffb1d100105b98aed3dc19b717917/d1a1ba47bc180493)).
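
A minimal sketch of the mitigation (the same pattern the diff below
adds to the test): gate the first read after reconfiguration on a fresh
lease renewal appearing in the compute log. `wait_until` and
`assert_log_contains` are the fixture helpers used in the diff;
`ep_static` and the surrounding test setup are assumed.

```python
from fixtures.utils import wait_until

# Wait until the static endpoint logs a successful lease renewal before
# issuing the first read. `wait_until(retries, interval, fn)` re-runs the
# check; `offset` restricts the log search to entries newer than the last match.
offset = None
_, offset = wait_until(
    20,
    0.5,
    lambda: ep_static.assert_log_contains(
        "lsn_lease_bg_task.*Request succeeded", offset=offset
    ),
)

with ep_static.cursor() as cur:
    # With the lease renewed, this read should no longer fail with
    # "tried to request a page version that was garbage collected".
    cur.execute("SELECT count(*) FROM t0")
```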

## Summary of changes

- Use the layer map dump to verify that the lease protects what it
claims: record all historical layers that have `start_lsn <= lease_lsn`
before and after running timeline GC (see the sketch below). This is
the same check as
ad79f42460/pageserver/src/tenant/timeline.rs (L5025-L5027).
The set recorded after GC must contain every layer in the set recorded
before GC.
- Wait until the log contains another successful lease request before
running the `SELECT` query after GC. We argued in #8817 that the bad
request can only occur within a short window after migration/restart,
and our tests show that as long as a lease renewal completes before the
first getpage request sent after reconfiguration, we will not see a bad
request.
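
Concretely, the layer-map check in the first bullet reduces to "the set
of lease-protected layers must survive GC". A condensed sketch
mirroring the `get_layers_protected_by_lease` helper added in the diff
below; `client`, `shard`, `timeline_id`, and `lease_lsn` are assumed to
come from the surrounding test:

```python
from fixtures.common_types import Lsn


def layers_protected_by_lease(ps_http, tenant_id, timeline_id, lease_lsn: Lsn) -> set[str]:
    # A layer is covered by the lease if its start LSN is at or below the
    # lease LSN; such layers must not be garbage collected while the lease holds.
    info = ps_http.layer_map_info(tenant_id, timeline_id)
    return {
        layer.layer_file_name
        for layer in info.historic_layers
        if Lsn(layer.lsn_start) <= lease_lsn
    }


before = layers_protected_by_lease(client, shard, timeline_id, lease_lsn)
client.timeline_gc(shard, timeline_id, 0)  # manual GC, same call as in the diff
after = layers_protected_by_lease(client, shard, timeline_id, lease_lsn)

# GC may legitimately remove layers that are not covered by the lease, so we
# only assert that the protected set is preserved (before is a subset of after).
assert before.issubset(after)
```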

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Author: Yuchen Liang
Date: 2024-11-06 15:18:21 -05:00 (committed by GitHub)
Parent: 93123f2623
Commit: 11fc1a4c12

@@ -1,19 +1,22 @@
 from __future__ import annotations

 import time
+from typing import Union

 import pytest
-from fixtures.common_types import Lsn
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
+    LogCursor,
     NeonEnv,
     NeonEnvBuilder,
     last_flush_lsn_upload,
     tenant_get_shards,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until

 #
@@ -169,23 +172,63 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
     )
     return last_flush_lsn

-    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
+    def get_layers_protected_by_lease(
+        ps_http: PageserverHttpClient,
+        tenant_id: Union[TenantId, TenantShardId],
+        timeline_id: TimelineId,
+        lease_lsn: Lsn,
+    ) -> set[str]:
+        """Get all layers whose start_lsn is less than or equal to the lease lsn."""
+        layer_map_info = ps_http.layer_map_info(tenant_id, timeline_id)
+        return set(
+            x.layer_file_name
+            for x in layer_map_info.historic_layers
+            if Lsn(x.lsn_start) <= lease_lsn
+        )
+
+    def trigger_gc_and_select(
+        env: NeonEnv,
+        ep_static: Endpoint,
+        lease_lsn: Lsn,
+        ctx: str,
+        offset: None | LogCursor = None,
+    ) -> LogCursor:
         """
         Trigger GC manually on all pageservers. Then run an `SELECT` query.
         """
         for shard, ps in tenant_get_shards(env, env.initial_tenant):
             client = ps.http_client()
+            layers_guarded_before_gc = get_layers_protected_by_lease(
+                client, shard, env.initial_timeline, lease_lsn=lsn
+            )
             gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+            layers_guarded_after_gc = get_layers_protected_by_lease(
+                client, shard, env.initial_timeline, lease_lsn=lsn
+            )
             # Note: cannot assert on `layers_removed` here because it could be layers
-            # not guarded by the lease. Rely on successful execution of the query instead.
+            # not guarded by the lease. Instead, use layer map dump.
+            assert layers_guarded_before_gc.issubset(
+                layers_guarded_after_gc
+            ), "Layers guarded by lease before GC should not be removed"
             log.info(f"{gc_result=}")

+        # wait for lease renewal before running query.
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: ep_static.assert_log_contains(
+                "lsn_lease_bg_task.*Request succeeded", offset=offset
+            ),
+        )
+
         with ep_static.cursor() as cur:
             # Following query should succeed if pages are properly guarded by leases.
             cur.execute("SELECT count(*) FROM t0")
             assert cur.fetchone() == (ROW_COUNT,)
             log.info(f"`SELECT` query succeed after GC, {ctx=}")
+        return offset

     # Insert some records on main branch
     with env.endpoints.create_start("main") as ep_main:
@@ -213,7 +256,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
         generate_updates_on_main(env, ep_main, 3, end=100)

-        trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
+        offset = trigger_gc_and_select(
+            env, ep_static, lease_lsn=lsn, ctx="Before pageservers restart"
+        )

         # Trigger Pageserver restarts
         for ps in env.pageservers:
@@ -222,7 +267,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
             time.sleep(LSN_LEASE_LENGTH / 2)
             ps.start()

-        trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
+        trigger_gc_and_select(
+            env,
+            ep_static,
+            lease_lsn=lsn,
+            ctx="After pageservers restart",
+            offset=offset,
+        )

         # Reconfigure pageservers
         env.pageservers[0].stop()
@@ -231,7 +282,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
         )
         env.storage_controller.reconcile_until_idle()

-        trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
+        trigger_gc_and_select(
+            env,
+            ep_static,
+            lease_lsn=lsn,
+            ctx="After putting pageserver 0 offline",
+            offset=offset,
+        )

         # Do some update so we can increment latest_gc_cutoff
         generate_updates_on_main(env, ep_main, i, end=100)