neon/test_runner/regress/test_timeline_archive.py
Arpad Müller 4c7956fa56 Fix hang deleting offloaded timelines (#12366)
We don't have cancellation support for timeline deletions. In other
words, timeline deletion might still go on in an older generation while
we are attaching it in a newer generation already, because the
cancellation simply hasn't reached the deletion code.

This has caused us to hit a situation with offloaded timelines in which
a timeline was stuck in an unrecoverable state: deletion requests always
returned an accepted response, never the 404 they eventually should.

A detailed description can be found
[here](https://github.com/neondatabase/cloud/issues/30406#issuecomment-3008667859)
(private repo link).

TLDR:

1. we ask to delete the timeline on the old pageserver/generation, which
starts the deletion process in the background
2. the storcon migrates the tenant to a different pageserver
3. during attach, the new pageserver still finds an index part, so it adds
the timeline to `offloaded_timelines`
4. the timeline deletion finishes, removing the index part in S3
5. there is a retry of the timeline deletion endpoint, sent to the new
pageserver location. it is bound to fail however:
   - as the index part is gone, we print `Timeline already deleted in
     remote storage`.
   - the problem is that we then return an accepted response code, not
     a 404.
   - this confuses the calling code: it thinks the timeline is not
     deleted yet, so it keeps retrying.
   - this state never gets recovered from until a reset/detach, because
     the `offloaded_timelines` entry stays there.
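
The regression test `test_timeline_offload_delete_race` at the bottom of
this file reproduces exactly this sequence, pausing the deletion queue
with a failpoint while the tenant is migrated.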

This is what this PR fixes: if no index part can be found, we can safely
assume that the timeline is gone from S3 (the index part is the last thing
to be deleted), so we remove it from `offloaded_timelines` and trigger a
reupload of the manifest. Subsequent retries of the deletion will pick
that up.
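
As an illustration of what this changes for callers, here is a minimal
sketch of a deletion retry loop, assuming the `ps_http` pageserver client
fixture and `PageserverApiException` used by the tests below; the
`wait_timeline_deleted` helper itself is hypothetical, mirroring
`timeline_is_missing` in `test_timeline_offload_delete_race`:

```python
import time

# Hypothetical helper: retry the deletion endpoint until it reports 404.
def wait_timeline_deleted(ps_http, tenant_id, timeline_id, attempts=30):
    for _ in range(attempts):
        try:
            # Before this fix, a fully-deleted offloaded timeline kept
            # answering with an accepted (202) response here, so callers
            # retried forever.
            ps_http.timeline_delete(tenant_id, timeline_id)
        except PageserverApiException as e:
            if e.status_code == 404:
                return  # index part and offloaded entry are gone: done
            raise
        time.sleep(1)
    raise RuntimeError(f"timeline {timeline_id} was never reported deleted")
```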

Why not improve cancellation support instead? That would be a more
disruptive code change with its own risks, so we don't do it for now.

Fixes https://github.com/neondatabase/cloud/issues/30406
2025-06-27 15:14:55 +00:00


from __future__ import annotations
import json
import random
import threading
import time
from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
last_flush_lsn_upload,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_prefix_empty,
assert_prefix_not_empty,
list_prefix,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until
from psycopg2.errors import IoError, UndefinedTable
if TYPE_CHECKING:
from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
@pytest.mark.parametrize("shard_count", [0, 4])
def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int):
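    """
    Exercise the timeline archival_config API: 404s for unknown timelines
    and tenants, 412 when archiving a parent that still has non-archived
    children, and the requirement to unarchive parents before their children.
    """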
unsharded = shard_count == 0
if unsharded:
env = neon_env_builder.init_start()
# If we run the unsharded version, talk to the pageserver directly
ps_http = env.pageserver.http_client()
else:
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
        # If we run the sharded version, talk to the storage controller
ps_http = env.storage_controller.pageserver_api()
for ps in env.pageservers:
# We make /archival_config requests that are intended to fail.
# It's expected that storcon drops requests to other pageservers after
# it gets the first error (https://github.com/neondatabase/neon/issues/11177)
ps.allowed_errors.extend(
[
".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing",
".*ERROR.* path=/v1/tenant/.*/archival_config .*Cancelled request finished with an error.*",
]
)
    # first try to archive a non-existent timeline for an existing tenant:
invalid_timeline_id = TimelineId.generate()
with pytest.raises(PageserverApiException, match="timeline not found") as exc:
ps_http.timeline_archival_config(
env.initial_tenant,
invalid_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 404
    # then for a non-existent tenant:
invalid_tenant_id = TenantId.generate()
with pytest.raises(
PageserverApiException,
match="NotFound: [tT]enant",
) as exc:
ps_http.timeline_archival_config(
invalid_tenant_id,
invalid_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 404
# construct a pair of branches to validate that pageserver prohibits
# archival of ancestor timelines when they have non-archived child branches
parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent")
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1",
ancestor_branch_name="test_ancestor_branch_archive_parent",
)
with pytest.raises(
PageserverApiException,
match="Cannot archive timeline which has non-archived child timelines",
) as exc:
ps_http.timeline_archival_config(
env.initial_tenant,
parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 412
leaf_detail = ps_http.timeline_detail(
env.initial_tenant,
timeline_id=leaf_timeline_id,
)
assert leaf_detail["is_archived"] is False
# Test that archiving the leaf timeline and then the parent works
ps_http.timeline_archival_config(
env.initial_tenant,
leaf_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
env.initial_tenant,
leaf_timeline_id,
)
assert leaf_detail["is_archived"] is True
ps_http.timeline_archival_config(
env.initial_tenant,
parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
# Test that the leaf can't be unarchived
with pytest.raises(
PageserverApiException,
match="ancestor is archived",
) as exc:
ps_http.timeline_archival_config(
env.initial_tenant,
leaf_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
# Unarchive works for the leaf if the parent gets unarchived first
ps_http.timeline_archival_config(
env.initial_tenant,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
ps_http.timeline_archival_config(
env.initial_tenant,
leaf_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
@pytest.mark.parametrize("manual_offload", [False, True])
def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool):
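    """
    Archive a chain of branches and check that they get offloaded, either
    automatically by the compaction loop or via the manual offload endpoint.
    Also check that offloaded children still block parent deletion and that
    unarchived timelines serve their data again.
    """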
if manual_offload:
        # Automatic timeline offloading is enabled by default; turn it off
        # so that offloading only happens via the manual endpoint
neon_env_builder.pageserver_config_override = "timeline_offloading = false"
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
    # Turn off the gc loop: we want to issue gc manually for better reliability.
    # Compaction is also off when offloading manually; otherwise it runs on a
    # short period so that automatic offloading happens.
tenant_id, initial_timeline_id = env.create_tenant(
conf={
"gc_period": "0s",
"compaction_period": "0s" if manual_offload else "1s",
}
)
# Create three branches that depend on each other, starting with two
grandparent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_grandparent", tenant_id
)
parent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent"
)
# write some stuff to the parent
with env.endpoints.create_start(
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,1000)",
]
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
# create the third branch
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
)
offloaded_count = ps_http.get_metric_value(
"pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
)
assert offloaded_count == 0
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id,
leaf_timeline_id,
)
assert leaf_detail["is_archived"] is True
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
ps_http.timeline_archival_config(
tenant_id,
grandparent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
def timeline_offloaded_logged(timeline_id: TimelineId) -> bool:
return (
env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
is not None
)
if manual_offload:
with pytest.raises(
PageserverApiException,
match="timeline has attached children",
):
# This only tests the (made for testing only) http handler,
# but still demonstrates the constraints we have.
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
def parent_offloaded():
if manual_offload:
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
assert timeline_offloaded_logged(parent_timeline_id)
def leaf_offloaded():
if manual_offload:
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
assert timeline_offloaded_logged(leaf_timeline_id)
wait_until(leaf_offloaded)
wait_until(parent_offloaded)
offloaded_count = ps_http.get_metric_value(
"pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
)
assert offloaded_count == 2
# Offloaded child timelines should still prevent deletion
with pytest.raises(
PageserverApiException,
match=f".* timeline which has child timelines: \\[{leaf_timeline_id}\\]",
):
ps_http.timeline_delete(tenant_id, parent_timeline_id)
ps_http.timeline_archival_config(
tenant_id,
grandparent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
parent_detail = ps_http.timeline_detail(
tenant_id,
parent_timeline_id,
)
assert parent_detail["is_archived"] is False
with env.endpoints.create_start(
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
assert sum == sum_again
# Test that deletion of offloaded timelines works
ps_http.timeline_delete(tenant_id, leaf_timeline_id)
assert not timeline_offloaded_logged(initial_timeline_id)
@pytest.mark.parametrize("delete_timeline", [False, True])
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool):
"""
Test for persistence of timeline offload state
"""
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# Turn off gc and compaction loops: we want to issue them manually for better reliability
tenant_id, root_timeline_id = env.create_tenant(
conf={
"gc_period": "0s",
"compaction_period": "0s",
"checkpoint_distance": f"{1024**2}",
}
)
# Create a branch and archive it
child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,2048)",
]
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert leaf_detail["is_archived"] is True
def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
# TODO add a proper API to check if a timeline has been offloaded or not
return not any(
timeline["timeline_id"] == str(timeline_id)
for timeline in ps_http.timeline_list(tenant_id=tenant_id)
)
def child_offloaded():
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
assert timeline_offloaded_api(child_timeline_id)
wait_until(child_offloaded)
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)
    # Test persistence: is the timeline still offloaded after a restart?
env.pageserver.stop()
env.pageserver.start()
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)
if delete_timeline:
ps_http.timeline_delete(tenant_id, child_timeline_id)
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
else:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
child_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert child_detail["is_archived"] is False
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)
assert not timeline_offloaded_api(root_timeline_id)
ps_http.tenant_delete(tenant_id)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
@run_only_on_default_postgres("this test isn't sensitive to the contents of timelines")
@skip_in_debug_build("times out in debug builds")
def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
"""
A general consistency check on archival/offload timeline state, and its intersection
with tenant migrations and timeline deletions.
"""
neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"}
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
# We will exercise migrations, so need multiple pageservers
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start(
initial_tenant_conf={
"compaction_period": "1s",
}
)
tenant_id = env.initial_tenant
tenant_shard_id = TenantShardId(tenant_id, 0, 0)
# Unavailable pageservers during timeline CRUD operations can be logged as errors on the storage controller
env.storage_controller.allowed_errors.extend(
[
".*error sending request.*",
            # FIXME: the pageserver should not return 500s on cancellation (https://github.com/neondatabase/neon/issues/9768)
".*InternalServerError\\(Error deleting timeline .* on .* on .*: pageserver API: error: Cancelled",
]
)
for ps in env.pageservers:
        # We will do unclean restarts, which result in these messages when cleaning up files
ps.allowed_errors.extend(
[
".*removing local file.*because it has unexpected length.*",
".*__temp.*",
".*method=POST path=\\S+/timeline .*: Not activating a Stopping timeline.*",
# FIXME: there are still anyhow::Error paths in timeline creation/deletion which
# generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768)
".*InternalServerError.*",
# FIXME: there are still anyhow::Error paths in timeline deletion that generate
# log lines at error severity (https://github.com/neondatabase/neon/issues/9768)
".*delete_timeline.*Error",
]
)
env.storage_scrubber.allowed_errors.extend(
[
            # Unclean shutdowns of the pageserver can legitimately result in orphan layers
# (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211)
f".*Orphan layer detected: tenants/{tenant_id}/.*"
]
)
class TimelineState:
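        """Tracks which lifecycle steps (create/archive/offload/delete) the worker's current timeline has completed."""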
def __init__(self):
self.timeline_id = TimelineId.generate()
self.created = False
self.archived = False
self.offloaded = False
self.deleted = False
controller_ps_api = env.storage_controller.pageserver_api()
shutdown = threading.Event()
violations = []
timelines_deleted = []
def list_timelines(tenant_id) -> tuple[set[TimelineId], set[TimelineId]]:
"""Get the list of active and offloaded TimelineId"""
listing = controller_ps_api.timeline_and_offloaded_list(tenant_id)
active_ids = set([TimelineId(t["timeline_id"]) for t in listing.timelines])
offloaded_ids = set([TimelineId(t["timeline_id"]) for t in listing.offloaded])
return (active_ids, offloaded_ids)
def timeline_objects(tenant_shard_id, timeline_id):
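        """List the object keys in remote storage under this timeline shard's prefix."""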
response = list_prefix(
env.pageserver_remote_storage, # type: ignore
prefix="/".join(
(
"tenants",
str(tenant_shard_id),
"timelines",
str(timeline_id),
)
)
+ "/",
)
return [k["Key"] for k in response.get("Contents", [])]
def worker():
"""
Background thread which drives timeline lifecycle operations, and checks that between steps
        it obeys invariants. This should detect errors in pageserver persistence and in
        concurrent operations on different timelines when it is run many times in parallel.
"""
state = TimelineState()
# Jitter worker startup, we're not interested in exercising lots of concurrent creations
# as we know that's I/O bound.
shutdown.wait(random.random() * 10)
while not shutdown.is_set():
# A little wait between actions to jitter out the API calls rather than having them
# all queue up at once
shutdown.wait(random.random())
try:
if not state.created:
log.info(f"Creating timeline {state.timeline_id}")
controller_ps_api.timeline_create(
PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=state.timeline_id
)
state.created = True
if (
timeline_objects(
tenant_shard_id=tenant_shard_id, timeline_id=state.timeline_id
)
== []
):
msg = f"Timeline {state.timeline_id} unexpectedly not present in remote storage"
violations.append(msg)
elif state.deleted:
# Try to confirm its deletion completed.
# Deleted timeline should not appear in listing API, either as offloaded or active
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids or state.timeline_id in offloaded_ids:
msg = f"Timeline {state.timeline_id} appeared in listing after deletion was acked"
violations.append(msg)
raise RuntimeError(msg)
objects = timeline_objects(tenant_shard_id, state.timeline_id)
if len(objects) == 0:
log.info(f"Confirmed deletion of timeline {state.timeline_id}")
timelines_deleted.append(state.timeline_id)
state = TimelineState() # A new timeline ID to create on next iteration
else:
# Deletion of objects doesn't have to be synchronous, we will keep polling
log.info(f"Timeline {state.timeline_id} objects still exist: {objects}")
shutdown.wait(random.random())
else:
# The main lifetime of a timeline: proceed active->archived->offloaded->deleted
if not state.archived:
log.info(f"Archiving timeline {state.timeline_id}")
controller_ps_api.timeline_archival_config(
tenant_id, state.timeline_id, TimelineArchivalState.ARCHIVED
)
state.archived = True
elif state.archived and not state.offloaded:
log.info(f"Waiting for offload of timeline {state.timeline_id}")
# Wait for offload: this should happen fast because we configured a short compaction interval
while not shutdown.is_set():
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids:
log.info(f"Timeline {state.timeline_id} is still active")
shutdown.wait(0.5)
elif state.timeline_id in offloaded_ids:
log.info(f"Timeline {state.timeline_id} is now offloaded in memory")
# Hack: when we see something offloaded in the API, it doesn't guarantee that the offload
# is persistent (it is marked offloaded first, then that is persisted to the tenant manifest).
# So we wait until we see the manifest update before considering it offloaded, that way
# subsequent checks that it doesn't revert to active on a restart will pass reliably.
time.sleep(0.1)
assert isinstance(env.pageserver_remote_storage, S3Storage)
manifest = env.pageserver_remote_storage.download_tenant_manifest(
tenant_id
)
if manifest is None:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)"
)
elif str(state.timeline_id) in [
t["timeline_id"] for t in manifest["offloaded_timelines"]
]:
log.info(
f"Timeline {state.timeline_id} is now offloaded persistently"
)
state.offloaded = True
else:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})"
)
break
else:
# Timeline is neither offloaded nor active, this is unexpected: the pageserver
# should ensure that the timeline appears in either the offloaded list or main list
msg = f"Timeline {state.timeline_id} disappeared!"
violations.append(msg)
raise RuntimeError(msg)
elif state.offloaded:
# Once it's offloaded it should only be in offloaded or deleted state: check
                    # it didn't revert back to active. This tests that the manifest is doing its
                    # job to suppress loading of offloaded timelines as active.
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids:
msg = f"Timeline {state.timeline_id} is active, should be offloaded or deleted"
violations.append(msg)
raise RuntimeError(msg)
log.info(f"Deleting timeline {state.timeline_id}")
controller_ps_api.timeline_delete(tenant_id, state.timeline_id)
state.deleted = True
else:
raise RuntimeError("State should be unreachable")
except PageserverApiException as e:
# This is expected: we are injecting chaos, API calls will sometimes fail.
# TODO: can we narrow this to assert we are getting friendly 503s?
log.info(f"Iteration error, will retry: {e}")
shutdown.wait(random.random() * 0.5)
except requests.exceptions.RetryError as e:
                # Retryable error repeated more times than `requests` is configured to tolerate;
                # this is expected when a pageserver remains unavailable for a couple of seconds
log.info(f"Iteration error, will retry: {e}")
shutdown.wait(random.random() * 0.5)
except Exception as e:
log.warning(
f"Unexpected worker exception (current timeline {state.timeline_id}): {e}"
)
else:
                # In the non-error case, use a jittered but small wait: we want to keep
                # a high rate of operations going
shutdown.wait(random.random() * 0.1)
n_workers = 4
threads = []
for _i in range(0, n_workers):
t = threading.Thread(target=worker)
t.start()
threads.append(t)
# Set delay failpoints so that deletions and migrations take some time, and have a good
# chance to interact with other concurrent timeline mutations.
env.storage_controller.configure_failpoints(
[("reconciler-live-migrate-pre-await-lsn", "sleep(1)")]
)
for ps in env.pageservers:
ps.add_persistent_failpoint("in_progress_delete", "sleep(1)")
# Generate some chaos, while our workers are trying to complete their timeline operations
rng = random.Random()
try:
chaos_rounds = 48
for _i in range(0, chaos_rounds):
action = rng.choice([0, 1])
if action == 0:
                # Pick a random pageserver to restart
pageserver = rng.choice(env.pageservers)
# Whether to use a graceful shutdown or SIGKILL
immediate = random.choice([True, False])
log.info(f"Restarting pageserver {pageserver.id}, immediate={immediate}")
t1 = time.time()
pageserver.restart(immediate=immediate)
restart_duration = time.time() - t1
                # Stay up for twice as long as we spent restarting, to ensure operations can make progress
log.info(f"Staying alive for {restart_duration}s")
time.sleep(restart_duration * 2)
else:
# Migrate our tenant between pageservers
origin_ps = env.get_tenant_pageserver(tenant_shard_id)
dest_ps = rng.choice([ps for ps in env.pageservers if ps.id != origin_ps.id])
log.info(f"Migrating {tenant_shard_id} {origin_ps.id}->{dest_ps.id}")
env.storage_controller.tenant_shard_migrate(
tenant_shard_id=tenant_shard_id, dest_ps_id=dest_ps.id
)
log.info(f"Full timeline lifecycles so far: {len(timelines_deleted)}")
finally:
shutdown.set()
for thread in threads:
thread.join()
# Sanity check that during our run we did exercise some full timeline lifecycles, in case
# one of our workers got stuck
assert len(timelines_deleted) > 5
    # ...and that no invariant violations were reported by the workers
assert violations == []
@pytest.mark.parametrize("with_intermediary", [False, True])
@pytest.mark.parametrize(
"offload_child",
[
"offload",
"offload-corrupt",
"offload-no-restart",
"offload-parent",
"archive",
None,
],
)
def test_timeline_retain_lsn(
neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: str | None
):
"""
Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones
"""
if offload_child == "offload-corrupt":
        # Our corruption code only works with S3-compatible storage
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.rust_log_override = "info,[gc_timeline]=debug"
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# Turn off gc and compaction loops: we want to issue them manually for better reliability
tenant_id, root_timeline_id = env.create_tenant(
conf={
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": 32 * 1024,
"compaction_threshold": 1,
"compaction_target_size": 32 * 1024,
# set small image creation thresholds so that gc deletes data
"image_creation_threshold": 1,
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# Disable pitr, we only want the latest lsn
"pitr_interval": "0s",
"gc_horizon": 0,
# Don't rely on endpoint lsn leases
"lsn_lease_length": "0s",
}
)
if with_intermediary:
parent_branch_name = "test_archived_parent"
parent_timeline_id = env.create_branch("test_archived_parent", tenant_id)
else:
parent_branch_name = "main"
parent_timeline_id = root_timeline_id
with env.endpoints.create_start(parent_branch_name, tenant_id=tenant_id) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')",
"SELECT setseed(0.4321)",
"INSERT INTO foo SELECT v FROM (SELECT generate_series(1,2048), (random() * 409600)::int as v) as random_numbers",
]
)
pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Pre branch sum: {pre_branch_sum}")
last_flush_lsn_upload(env, endpoint, tenant_id, parent_timeline_id)
# Create a branch and write some additional data to the parent
child_timeline_id = env.create_branch(
"test_archived_branch", tenant_id, ancestor_branch_name=parent_branch_name
)
with env.endpoints.create_start(parent_branch_name, tenant_id=tenant_id) as endpoint:
# Do some overwriting churn with compactions in between. This is important so that we can overwrite image layers.
for i in range(5):
endpoint.safe_psql_many(
[
f"SELECT setseed(0.23{i})",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 2",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 1",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0",
]
)
last_flush_lsn_upload(env, endpoint, tenant_id, parent_timeline_id)
post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Post branch sum: {post_branch_sum}")
if offload_child is not None:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert leaf_detail["is_archived"] is True
if "offload" in offload_child:
ps_http.timeline_offload(tenant_id, child_timeline_id)
if "offload-parent" in offload_child:
                # Also offload the parent to ensure the child's retain_lsn
                # is entered in the parent upon unoffloading
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
ps_http.timeline_offload(tenant_id, parent_timeline_id)
# Do a restart to get rid of any in-memory objects (we only init gc info once, at attach)
if offload_child is None or "no-restart" not in offload_child:
env.pageserver.stop()
if offload_child == "offload-corrupt":
assert isinstance(env.pageserver_remote_storage, S3Storage)
listing = list_prefix(
env.pageserver_remote_storage, f"tenants/{str(tenant_id)}/tenant-manifest"
)
objects: list[ObjectTypeDef] = listing.get("Contents", [])
assert len(objects) > 0
remote_key: str = str(objects[0].get("Key", []))
local_path = str(env.repo_dir / "tenant-manifest.json")
log.info(f"Downloading {remote_key} -> {local_path}")
env.pageserver_remote_storage.client.download_file(
env.pageserver_remote_storage.bucket_name, remote_key, local_path
)
log.info(f"Corrupting {local_path}")
with open(local_path) as manifest_json_file:
manifest_json = json.load(manifest_json_file)
for offloaded_timeline in manifest_json["offloaded_timelines"]:
offloaded_timeline["ancestor_retain_lsn"] = None
with open(local_path, "w") as manifest_json_file:
json.dump(manifest_json, manifest_json_file)
log.info(f"Uploading {local_path} -> {remote_key}")
env.pageserver_remote_storage.client.upload_file(
local_path, env.pageserver_remote_storage.bucket_name, remote_key
)
# The point of our earlier efforts was to provoke these
env.pageserver.allowed_errors.extend(
[
".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*",
".*page_service_conn_main.*could not find data for key.*",
".*failed to get checkpoint bytes.*",
".*failed to get control bytes.*",
]
)
if offload_child is None or "no-restart" not in offload_child:
env.pageserver.start()
if offload_child == "offload-parent":
wait_until_tenant_active(ps_http, tenant_id=tenant_id)
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
    # Do an aggressive gc and compaction of the parent branch
ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=parent_timeline_id, gc_horizon=0)
ps_http.timeline_checkpoint(
tenant_id,
parent_timeline_id,
force_l0_compaction=True,
force_repartition=True,
wait_until_uploaded=True,
compact=True,
)
if offload_child is not None:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
# Now, after unarchival, the child timeline should still have its data accessible (or corrupted)
if offload_child == "offload-corrupt":
if with_intermediary:
error_regex = "(.*could not read .* from page server.*|.*relation .* does not exist)"
else:
error_regex = ".*failed to get basebackup.*"
with pytest.raises((RuntimeError, IoError, UndefinedTable), match=error_regex):
with env.endpoints.create_start(
"test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1
) as endpoint:
endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
else:
with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint:
sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
assert sum == pre_branch_sum
def test_timeline_offload_delete_race(neon_env_builder: NeonEnvBuilder):
"""
Regression test for https://github.com/neondatabase/cloud/issues/30406
"""
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start()
# Turn off gc and compaction loops: we want to issue them manually for better reliability
tenant_id, root_timeline_id = env.create_tenant(
conf={
"gc_period": "0s",
"compaction_period": "0s",
"checkpoint_distance": f"{1024**2}",
}
)
origin_ps = env.get_tenant_pageserver(tenant_id)
assert origin_ps
origin_ps.allowed_errors.extend(
[
".*Timed out waiting for deletion queue flush.*",
".*Timed out waiting for flush to remote storage.*",
]
)
origin_ps_http = origin_ps.http_client()
# We are not sharding this tenant
tenant_shard_id = TenantShardId(tenant_id, 0, 0)
# Create a branch and archive it
child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,512)",
]
)
last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)
origin_ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
return any(
timeline["timeline_id"] == str(timeline_id)
for timeline in origin_ps_http.timeline_and_offloaded_list(
tenant_id=tenant_id
).offloaded
)
def child_offloaded():
origin_ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
assert timeline_offloaded_api(child_timeline_id)
wait_until(child_offloaded)
# Delete the timeline from the origin pageserver, holding up the deletion queue so that it doesn't finish
failpoint_deletion_queue = "deletion-queue-before-execute-pause"
origin_ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
origin_ps_http.timeline_delete(tenant_id, child_timeline_id)
dest_ps = [ps for ps in env.pageservers if ps.id != origin_ps.id][0]
assert dest_ps
log.info(f"Migrating {tenant_id} {origin_ps.id}->{dest_ps.id}")
env.storage_controller.tenant_shard_migrate(tenant_shard_id, dest_ps_id=dest_ps.id)
log.info("unstuck the DELETE")
origin_ps_http.configure_failpoints((failpoint_deletion_queue, "off"))
def child_prefix_empty():
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/{str(child_timeline_id)}/",
)
wait_until(child_prefix_empty)
dest_ps_http = dest_ps.http_client()
    # We can't use timeline_delete_wait_completed here: timeline status would already return 404, but we want the deletion endpoint itself to return 404
def timeline_is_missing():
data = None
try:
data = dest_ps_http.timeline_delete(tenant_id, child_timeline_id)
log.info(f"timeline delete {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
raise RuntimeError(f"Timeline exists {data}")
wait_until(timeline_is_missing)
#
# Now ensure that scrubber doesn't have anything to clean up.
#
# Sleep some amount larger than min_age_secs
time.sleep(3)
    # The age gate (min_age_secs=1) is passed after the sleep, yet the scrubber
    # should find nothing to delete
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["tenant_manifests_deleted"] == 0
def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
"""
Test for scrubber deleting old generations of manifests
"""
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# Turn off gc and compaction loops: we want to issue them manually for better reliability
tenant_id, root_timeline_id = env.create_tenant(
conf={
"gc_period": "0s",
"compaction_period": "0s",
"checkpoint_distance": f"{1024**2}",
}
)
# Create a branch and archive it
child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,512)",
]
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2")
last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
# TODO add a proper API to check if a timeline has been offloaded or not
return not any(
timeline["timeline_id"] == str(timeline_id)
for timeline in ps_http.timeline_list(tenant_id=tenant_id)
)
def child_offloaded():
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
assert timeline_offloaded_api(child_timeline_id)
wait_until(child_offloaded)
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)
    # Reboot the pageserver a bunch of times, doing unoffloads and offloads
for i in range(5):
env.pageserver.stop()
env.pageserver.start()
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
assert not timeline_offloaded_api(child_timeline_id)
if i % 2 == 0:
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2")
assert sum == sum_again
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
wait_until(child_offloaded)
#
# Now ensure that scrubber runs will clean up old generations' manifests.
#
# Sleep some amount larger than min_age_secs
time.sleep(3)
# Ensure that min_age_secs has a deletion impeding effect
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["tenant_manifests_deleted"] == 0
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] > 0
assert gc_summary["tenant_manifests_deleted"] > 0
@pytest.mark.parametrize("end_with_offloaded", [False, True])
def test_timeline_offload_race_unarchive(
neon_env_builder: NeonEnvBuilder, end_with_offloaded: bool
):
"""
Ensure that unarchive and timeline offload don't race each other
"""
# Regression test for issue https://github.com/neondatabase/neon/issues/10220
failpoint = "before-timeline-auto-offload"
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
    # Turn off the gc loop; keep a short compaction period so that the
    # automatic offload path (which we race against) runs
tenant_id, initial_timeline_id = env.create_tenant(
conf={
"gc_period": "0s",
"compaction_period": "1s",
}
)
# Create a branch
leaf_timeline_id = env.create_branch("test_ancestor_branch_archive", tenant_id)
# write some stuff to the leaf
with env.endpoints.create_start(
"test_ancestor_branch_archive", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,1000)",
]
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1")
ps_http.configure_failpoints((failpoint, "pause"))
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id,
leaf_timeline_id,
)
assert leaf_detail["is_archived"] is True
    # The actual race: get the compaction task to pause right before
    # offloading the timeline, then attempt to unarchive it
wait_until(lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}"))
# This unarchival should go through
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
# TODO add a proper API to check if a timeline has been offloaded or not
return not any(
timeline["timeline_id"] == str(timeline_id)
for timeline in ps_http.timeline_list(tenant_id=tenant_id)
)
def leaf_offloaded():
assert timeline_offloaded_api(leaf_timeline_id)
# Ensure that we've hit the failed offload attempt
ps_http.configure_failpoints((failpoint, "off"))
wait_until(
lambda: env.pageserver.assert_log_contains(
f".*compaction_loop.*offload_timeline.*{leaf_timeline_id}.*can't shut down timeline.*"
)
)
with env.endpoints.create_start(
"test_ancestor_branch_archive", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1")
assert sum == sum_again
if end_with_offloaded:
# Ensure that offloading still works after all of this
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
wait_until(leaf_offloaded)
else:
# Test that deletion of leaf timeline works
ps_http.timeline_delete(tenant_id, leaf_timeline_id)