Timeline archival test (#8824)

This PR:

* Implements the rule that archived timelines require all of their
children to be archived as well, as specified in the RFC. There is no
fancy locking mechanism though, so the precondition can still be broken.
As a TODO for later, we still allow unarchiving timelines with archived
parents.
* Adds an `is_archived` flag to `TimelineInfo`
* Adds timeline_archival_config to `PageserverHttpClient`
* Adds a new `test_timeline_archive` test, loosely based on
`test_timeline_delete`

Part of #8088
This commit is contained in:
Arpad Müller
2024-08-26 17:30:19 +02:00
committed by GitHub
parent d6eede515a
commit 2dd53e7ae0
6 changed files with 207 additions and 10 deletions

View File

@@ -718,6 +718,7 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
pub is_archived: bool,
pub walreceiver_status: String,

View File

@@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
}
}
impl From<crate::tenant::TimelineArchivalError> for ApiError {
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
use crate::tenant::TimelineArchivalError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
format!(
"Cannot archive timeline which has non-archived child timelines: {children:?}"
)
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
@@ -405,6 +423,8 @@ async fn build_timeline_info_common(
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
// Report is_archived = false if the timeline is still loading
let is_archived = timeline.is_archived().unwrap_or(false);
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
@@ -445,6 +465,7 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
is_archived,
walreceiver_status,
@@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
.await?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",

View File

@@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
}
}
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
#[error("NotFound")]
NotFound,
#[error("Timeout")]
Timeout,
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),
#[error("Timeline archival is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::Timeout => write!(f, "Timeout"),
Self::HasUnarchivedChildren(c) => {
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
}
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
@@ -1326,24 +1358,50 @@ impl Tenant {
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
let timeline = {
let timelines = self.timelines.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(TimelineArchivalError::NotFound),
};
// Ensure that there are no non-archived child timelines
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();
if !children.is_empty() && state == TimelineArchivalState::Archived {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Arc::clone(timeline)
};
let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;
if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
bail!("reached timeout for upload queue flush");
return Err(TimelineArchivalError::Timeout);
};
v?;
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
}
Ok(())
}

View File

@@ -1,5 +1,6 @@
import random
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering
from typing import Any, Dict, Type, TypeVar, Union
@@ -213,3 +214,9 @@ class TenantShardId:
def __hash__(self) -> int:
return hash(self._tuple())
# TODO: Replace with `StrEnum` when we upgrade to python 3.11
class TimelineArchivalState(str, Enum):
ARCHIVED = "Archived"
UNARCHIVED = "Unarchived"

View File

@@ -10,7 +10,7 @@ import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pg_version import PgVersion
@@ -621,6 +621,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
)
self.verbose_error(res)
def timeline_archival_config(
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
state: TimelineArchivalState,
):
config = {"state": state.value}
log.info(
f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
)
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
json=config,
)
self.verbose_error(res)
def timeline_get_lsn_by_timestamp(
self,
tenant_id: Union[TenantId, TenantShardId],

View File

@@ -0,0 +1,96 @@
import pytest
from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
from fixtures.neon_fixtures import (
NeonEnv,
)
from fixtures.pageserver.http import PageserverApiException
def test_timeline_archive(neon_simple_env: NeonEnv):
env = neon_simple_env
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found.*",
".*timeline not found.*",
".*Cannot archive timeline which has unarchived child timelines.*",
".*Precondition failed: Requested tenant is missing.*",
]
)
ps_http = env.pageserver.http_client()
# first try to archive non existing timeline
# for existing tenant:
invalid_timeline_id = TimelineId.generate()
with pytest.raises(PageserverApiException, match="timeline not found") as exc:
ps_http.timeline_archival_config(
tenant_id=env.initial_tenant,
timeline_id=invalid_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 404
# for non existing tenant:
invalid_tenant_id = TenantId.generate()
with pytest.raises(
PageserverApiException,
match=f"NotFound: tenant {invalid_tenant_id}",
) as exc:
ps_http.timeline_archival_config(
tenant_id=invalid_tenant_id,
timeline_id=invalid_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 404
# construct pair of branches to validate that pageserver prohibits
# archival of ancestor timelines when they have non-archived child branches
parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty")
leaf_timeline_id = env.neon_cli.create_branch(
"test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent"
)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
with pytest.raises(
PageserverApiException,
match="Cannot archive timeline which has non-archived child timelines",
) as exc:
assert timeline_path.exists()
ps_http.timeline_archival_config(
tenant_id=env.initial_tenant,
timeline_id=parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
assert exc.value.status_code == 412
# Test timeline_detail
leaf_detail = ps_http.timeline_detail(
tenant_id=env.initial_tenant,
timeline_id=leaf_timeline_id,
)
assert leaf_detail["is_archived"] is False
# Test that archiving the leaf timeline and then the parent works
ps_http.timeline_archival_config(
tenant_id=env.initial_tenant,
timeline_id=leaf_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id=env.initial_tenant,
timeline_id=leaf_timeline_id,
)
assert leaf_detail["is_archived"] is True
ps_http.timeline_archival_config(
tenant_id=env.initial_tenant,
timeline_id=parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)