mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
Shut down timelines during offload and add offload tests (#9289)
Add a test for timeline offloading, and subsequent unoffloading. Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088.
This commit is contained in:
@@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::offload::offload_timeline;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
@@ -325,6 +326,7 @@ impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
e @ HasArchivedParent(_) => {
|
||||
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
|
||||
}
|
||||
@@ -1785,6 +1787,49 @@ async fn timeline_compact_handler(
|
||||
.await
|
||||
}
|
||||
|
||||
// Run offload immediately on given timeline.
|
||||
async fn timeline_offload_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
if tenant.get_offloaded_timeline(timeline_id).is_ok() {
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
if !tenant.timeline_has_no_attached_children(timeline_id) {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"timeline has attached children".into(),
|
||||
));
|
||||
}
|
||||
if !timeline.can_offload() {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"Timeline::can_offload() returned false".into(),
|
||||
));
|
||||
}
|
||||
offload_timeline(&tenant, &timeline)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
async fn timeline_checkpoint_handler(
|
||||
request: Request<Body>,
|
||||
@@ -3008,6 +3053,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| api_handler(r, timeline_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
|
||||
|r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
||||
|
||||
@@ -619,6 +619,9 @@ pub enum TimelineArchivalError {
|
||||
#[error("Timeout")]
|
||||
Timeout,
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error("ancestor is archived: {}", .0)]
|
||||
HasArchivedParent(TimelineId),
|
||||
|
||||
@@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::Timeout => write!(f, "Timeout"),
|
||||
Self::Cancelled => write!(f, "Cancelled"),
|
||||
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
|
||||
Self::HasUnarchivedChildren(c) => {
|
||||
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
||||
@@ -1552,6 +1556,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
let cancel = self.cancel.clone();
|
||||
let timeline_preload = self
|
||||
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel)
|
||||
@@ -1566,6 +1571,7 @@ impl Tenant {
|
||||
error!(%timeline_id, "index_part not found on remote");
|
||||
return Err(TimelineArchivalError::NotFound);
|
||||
}
|
||||
Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled),
|
||||
Err(e) => {
|
||||
// Some (possibly ephemeral) error happened during index_part download.
|
||||
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
|
||||
@@ -1603,6 +1609,7 @@ impl Tenant {
|
||||
if offloaded_timelines.remove(&timeline_id).is_none() {
|
||||
warn!("timeline already removed from offloaded timelines");
|
||||
}
|
||||
info!("timeline unoffloading complete");
|
||||
Ok(Arc::clone(timeline))
|
||||
} else {
|
||||
warn!("timeline not available directly after attach");
|
||||
@@ -1683,6 +1690,21 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_offloaded_timeline(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<OffloadedTimeline>, GetTimelineError> {
|
||||
self.timelines_offloaded
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&timeline_id)
|
||||
.map(Arc::clone)
|
||||
.ok_or(GetTimelineError::NotFound {
|
||||
tenant_id: self.tenant_shard_id,
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
@@ -2218,6 +2240,13 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
!timelines
|
||||
.iter()
|
||||
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id))
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TenantState {
|
||||
self.state.borrow().clone()
|
||||
}
|
||||
|
||||
@@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline(
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
// TODO extend guard mechanism above with method
|
||||
// to make deletions possible while offloading is in progress
|
||||
|
||||
|
||||
@@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
log.info(f"Got GC request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
|
||||
def timeline_offload(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload",
|
||||
)
|
||||
log.info(f"Got offload request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
|
||||
def timeline_compact(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
|
||||
@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shard_count", [0, 4])
|
||||
@@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int):
|
||||
leaf_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("manual_offload", [False, True])
|
||||
def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool):
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# Turn off gc and compaction loops: we want to issue them manually for better reliability
|
||||
tenant_id, initial_timeline_id = env.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s" if manual_offload else "1s",
|
||||
}
|
||||
)
|
||||
|
||||
# Create two branches and archive them
|
||||
parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id)
|
||||
leaf_timeline_id = env.create_branch(
|
||||
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
|
||||
)
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
state=TimelineArchivalState.ARCHIVED,
|
||||
)
|
||||
leaf_detail = ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
)
|
||||
assert leaf_detail["is_archived"] is True
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
parent_timeline_id,
|
||||
state=TimelineArchivalState.ARCHIVED,
|
||||
)
|
||||
|
||||
def timeline_offloaded(timeline_id: TimelineId) -> bool:
|
||||
return (
|
||||
env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
|
||||
is not None
|
||||
)
|
||||
|
||||
if manual_offload:
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match="timeline has attached children",
|
||||
):
|
||||
# This only tests the (made for testing only) http handler,
|
||||
# but still demonstrates the constraints we have.
|
||||
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
|
||||
|
||||
def parent_offloaded():
|
||||
if manual_offload:
|
||||
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
|
||||
assert timeline_offloaded(parent_timeline_id)
|
||||
|
||||
def leaf_offloaded():
|
||||
if manual_offload:
|
||||
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
|
||||
assert timeline_offloaded(leaf_timeline_id)
|
||||
|
||||
wait_until(30, 1, leaf_offloaded)
|
||||
wait_until(30, 1, parent_offloaded)
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
parent_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
leaf_detail = ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
)
|
||||
assert leaf_detail["is_archived"] is False
|
||||
|
||||
assert not timeline_offloaded(initial_timeline_id)
|
||||
|
||||
Reference in New Issue
Block a user