diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs
index fc1f10e734..ae5a21bab9 100644
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use utils::id::TimelineId;
 
-#[derive(Default, serde::Serialize)]
+#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
     pub reparented_timelines: Vec<TimelineId>,
 }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index e3ddb446fa..ac3ff1bb89 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 
 use bytes::Bytes;
+use detach_ancestor::AncestorDetached;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
@@ -418,6 +419,23 @@ impl Client {
         }
     }
 
+    pub async fn timeline_detach_ancestor(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<AncestorDetached> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::PUT, &uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 3a62c0dd4f..9ddf98eb3b 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete(
         .await
 }
 
+async fn handle_tenant_timeline_detach_ancestor(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    let res = service
+        .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
+        .await?;
+
+    json_response(StatusCode::OK, res)
+}
+
 async fn handle_tenant_timeline_passthrough(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1006,6 +1022,16 @@ pub fn make_router(
                 RequestName("v1_tenant_timeline"),
             )
         })
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_detach_ancestor,
+                    RequestName("v1_tenant_timeline_detach_ancestor"),
+                )
+            },
+        )
         // Tenant detail GET passthrough to shard zero:
         .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 769aba80ca..8d64201cd9 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -1,8 +1,9 @@
 use pageserver_api::{
     models::{
-        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
-        TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
+        detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
+        PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        TopTenantShardsRequest, TopTenantShardsResponse,
     },
     shard::TenantShardId,
 };
@@ -226,6 +227,21 @@ impl PageserverClient {
         )
     }
 
+    pub(crate) async fn timeline_detach_ancestor(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<AncestorDetached> {
+        measured_request!(
+            "timeline_detach_ancestor",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .timeline_detach_ancestor(tenant_shard_id, timeline_id)
+                .await
+        )
+    }
+
     pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
         measured_request!(
             "utilization",
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index deaac83ea5..95522525cb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -117,6 +117,7 @@ enum TenantOperations {
     TimelineCreate,
     TimelineDelete,
     AttachHook,
+    TimelineDetachAncestor,
 }
 
 #[derive(Clone, strum_macros::Display)]
@@ -2376,18 +2377,18 @@ impl Service {
                    tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
 
                    client
-                    .tenant_time_travel_remote_storage(
-                        tenant_shard_id,
-                        &timestamp,
-                        &done_if_after,
-                    )
-                    .await
-                    .map_err(|e| {
-                        ApiError::InternalServerError(anyhow::anyhow!(
-                            "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
-                            node
-                        ))
-                    })?;
+                        .tenant_time_travel_remote_storage(
+                            tenant_shard_id,
+                            &timestamp,
+                            &done_if_after,
+                        )
+                        .await
+                        .map_err(|e| {
+                            ApiError::InternalServerError(anyhow::anyhow!(
+                                "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
+                                node
+                            ))
+                        })?;
                 }
             }
             Ok(())
@@ -2757,7 +2758,7 @@ impl Service {
         // Create timeline on remaining shards with number >0
         if !targets.is_empty() {
             // If we had multiple shards, issue requests for the remainder now.
-            let jwt = self.config.jwt_token.clone();
+            let jwt = &self.config.jwt_token;
             self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
                 let create_req = create_req.clone();
                 Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
@@ -2768,6 +2769,115 @@
         Ok(timeline_info)
     }
 
+    pub(crate) async fn tenant_timeline_detach_ancestor(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<models::detach_ancestor::AncestorDetached, ApiError> {
+        tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",);
+
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineDetachAncestor,
+        )
+        .await;
+
+        self.ensure_attached_wait(tenant_id).await?;
+
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                let node_id = shard.intent.get_attached().ok_or_else(|| {
+                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
+                })?;
+                let node = locked
+                    .nodes
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                targets.push((*tenant_shard_id, node.clone()));
+            }
+            targets
+        };
+
+        if targets.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant not found").into(),
+            ));
+        }
+
+        async fn detach_one(
+            tenant_shard_id: TenantShardId,
+            timeline_id: TimelineId,
+            node: Node,
+            jwt: Option<String>,
+        ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
+            tracing::info!(
+                "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+            );
+
+            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+            client
+                .timeline_detach_ancestor(tenant_shard_id, timeline_id)
+                .await
+                .map_err(|e| {
+                    use mgmt_api::Error;
+
+                    match e {
+                        // no ancestor (ever)
+                        Error::ApiError(StatusCode::CONFLICT, msg) => {
+                            ApiError::Conflict(format!("{node}: {msg}"))
+                        }
+                        // too many ancestors
+                        Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
+                            ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
+                        }
+                        // rest can be mapped
+                        other => passthrough_api_error(&node, other),
+                    }
+                })
+                .map(|res| (tenant_shard_id.shard_number, res))
+        }
+
+        // no shard needs to go first/last; the operation should be idempotent
+        // TODO: it would be great to ensure that all shards return the same error
+        let mut results = self
+            .tenant_for_shards(targets, |tenant_shard_id, node| {
+                futures::FutureExt::boxed(detach_one(
+                    tenant_shard_id,
+                    timeline_id,
+                    node,
+                    self.config.jwt_token.clone(),
+                ))
+            })
+            .await?;
+
+        let any = results.pop().expect("we must have at least one response");
+
+        // FIXME: the ordering is not stable yet on pageserver, should be (ancestor_lsn,
+        // TimelineId)
+        let mismatching = results
+            .iter()
+            .filter(|(_, res)| res != &any.1)
+            .collect::<Vec<_>>();
+        if !mismatching.is_empty() {
+            let matching = results.len() - mismatching.len();
+            tracing::error!(
+                matching,
+                compared_against=?any,
+                ?mismatching,
+                "shards returned different results"
+            );
+        }
+
+        Ok(any.1)
+    }
+
     /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
     ///
     /// On success, the returned vector contains exactly the same number of elements as the input `locations`.
@@ -2894,8 +3004,8 @@
             .await
             .map_err(|e| {
                 ApiError::InternalServerError(anyhow::anyhow!(
-                "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
-            ))
+                        "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                    ))
                 })
     }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 463e4a3b01..90ed838e1d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2400,7 +2400,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
         """
-        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
        """
         response = self.request(
             "GET",
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 606ce203cd..803fcac583 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -11,11 +11,12 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
+    flush_ep_to_pageserver,
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
-from fixtures.pageserver.utils import wait_timeline_detail_404
-from fixtures.remote_storage import LocalFsStorage
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.utils import assert_pageserver_backups_equal
 
 
@@ -559,11 +560,24 @@ def test_compaction_induced_by_detaches_in_history(
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
 
 
-def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+@pytest.mark.parametrize("sharded", [True, False])
+def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool):
+    shards = 2 if sharded else 1
 
-    client = env.pageserver.http_client()
+    neon_env_builder.num_pageservers = shards
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    for ps in pageservers.values():
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    if sharded:
+        # FIXME: should this be in the neon_env_builder.init_start?
+        env.storage_controller.reconcile_until_idle()
+        client = env.storage_controller.pageserver_api()
+    else:
+        client = env.pageserver.http_client()
 
     with pytest.raises(PageserverApiException, match=".* no ancestors") as info:
         client.detach_ancestor(env.initial_tenant, env.initial_timeline)
@@ -577,6 +591,17 @@
         client.detach_ancestor(env.initial_tenant, second_branch)
     assert info.value.status_code == 400
 
+    client.detach_ancestor(env.initial_tenant, first_branch)
+
+    # FIXME: this should be done by the http req handler
+    for ps in pageservers.values():
+        ps.quiesce_tenants()
+
+    with pytest.raises(PageserverApiException, match=".* no ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, first_branch)
+    # FIXME: this should be 200 OK because we've already completed it
+    assert info.value.status_code == 409
+
     client.tenant_delete(env.initial_tenant)
 
     with pytest.raises(PageserverApiException) as e:
@@ -584,6 +609,58 @@
     assert e.value.status_code == 404
 
 
+def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
+    branch_name = "soon_detached"
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    # FIXME: should this be in the neon_env_builder.init_start?
+    env.storage_controller.reconcile_until_idle()
+    shards = env.storage_controller.locate(env.initial_tenant)
+
+    branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant)
+
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql(
+            "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)"
+        )
+        lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    for shard_info in shards:
+        node_id = int(shard_info["node_id"])
+        shard_id = shard_info["shard_id"]
+        detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id)
+
+        assert Lsn(detail["last_record_lsn"]) >= lsn
+        assert Lsn(detail["initdb_lsn"]) < lsn
+        assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline
+
+    env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch_timeline_id)
+
+    for shard_info in shards:
+        node_id = int(shard_info["node_id"])
+        shard_id = shard_info["shard_id"]
+
+        # TODO: ensure quiescing is done on pageserver?
+        pageservers[node_id].quiesce_tenants()
+        detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id)
+        wait_for_last_record_lsn(
+            pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn
+        )
+        assert detail.get("ancestor_timeline_id") is None
+
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        count = int(ep.safe_psql("select count(*) from foo")[0][0])
+        assert count == 10000
+
+
 # TODO:
 # - after starting the operation, tenant is deleted
 # - after starting the operation, pageserver is shutdown, restarted
@@ -591,3 +668,11 @@
 # - deletion of reparented while reparenting should fail once, then succeed (?)
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.
+#
+# TEST: 1. tad which partially succeeds, one returns 500
+#    2. create branch below timeline? or delete timeline below
+#    3. on retry all should report the same reparented timelines
+#
+# TEST: 1. tad is started, one node stalls, other restarts
+#    2. client timeout before stall over
+#    3. on retry with stalled and other being able to proceed
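
Not part of the patch above: a minimal usage sketch of the route added in storage_controller/src/http.rs, for an unsharded tenant. It reuses only fixtures that already appear in the diff (NeonEnvBuilder, flush_ep_to_pageserver, detach_ancestor, quiesce_tenants, timeline_detail); the test name, branch name and table are illustrative placeholders, not taken from the PR, and the quiesce step mirrors the FIXME noted in the tests rather than documented behaviour.

def test_unsharded_detach_ancestor_via_storage_controller(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

    # Branch off the initial timeline and write enough that the branch has data of its own.
    branch = env.neon_cli.create_branch("detach_me", tenant_id=env.initial_tenant)
    with env.endpoints.create_start("detach_me", tenant_id=env.initial_tenant) as ep:
        ep.safe_psql("create table t as select i::bigint from generate_series(1, 10000) s(i)")
        flush_ep_to_pageserver(env, ep, env.initial_tenant, branch)

    # PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor on the storage
    # controller fans out to every attached shard (a single one here) and returns one
    # shard's AncestorDetached body.
    env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch)

    # Mirrors the FIXME in the tests above: quiesce before inspecting timeline detail.
    env.pageserver.quiesce_tenants()
    detail = env.pageserver.http_client().timeline_detail(env.initial_tenant, branch)
    assert detail.get("ancestor_timeline_id") is None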