storcon: implement safekeeper_migrate handler (#11849)

This PR implements a safekeeper migration algorithm from RFC-035


https://github.com/neondatabase/neon/blob/main/docs/rfcs/035-safekeeper-dynamic-membership-change.md#change-algorithm

- Closes: https://github.com/neondatabase/neon/issues/11823

It is not production-ready yet, but I think it's good enough to commit
and start testing.

There are some known issues which will be addressed in later PRs:
- https://github.com/neondatabase/neon/issues/12186
- https://github.com/neondatabase/neon/issues/12187
- https://github.com/neondatabase/neon/issues/12188
- https://github.com/neondatabase/neon/issues/12189
- https://github.com/neondatabase/neon/issues/12190
- https://github.com/neondatabase/neon/issues/12191
- https://github.com/neondatabase/neon/issues/12192

## Summary of changes
- Implement `tenant_timeline_safekeeper_migrate` handler to drive the
migration
- Add possibility to specify number of safekeepers per timeline in tests
(`timeline_safekeeper_count`)
- Add `term` and `flush_lsn` to `TimelineMembershipSwitchResponse`
- Implement compare-and-swap (CAS) operation over timeline in DB for
updating membership configuration safely.
- Write simple test to verify that migration code works
This commit is contained in:
Dmitrii Kovalkov
2025-06-30 12:30:05 +04:00
committed by GitHub
parent 9bb4688c54
commit c746678bbc
15 changed files with 802 additions and 79 deletions

View File

@@ -22,7 +22,7 @@ use pageserver_api::controller_api::{
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
TimelineImportRequest,
TimelineImportRequest, TimelineSafekeeperMigrateRequest,
};
use pageserver_api::models::{
DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -34,6 +34,7 @@ use pageserver_api::upcall_api::{
PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest,
};
use pageserver_client::{BlockUnblock, mgmt_api};
use routerify::Middleware;
use tokio_util::sync::CancellationToken;
use tracing::warn;
@@ -635,6 +636,32 @@ async fn handle_tenant_timeline_download_heatmap_layers(
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_safekeeper_migrate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
maybe_rate_limit(&req, tenant_id).await;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let mut req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let migrate_req = json_request::<TimelineSafekeeperMigrateRequest>(&mut req).await?;
service
.tenant_timeline_safekeeper_migrate(tenant_id, timeline_id, migrate_req)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_lsn_lease(
service: Arc<Service>,
req: Request<Body>,
@@ -2458,6 +2485,16 @@ pub fn make_router(
)
},
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate",
|r| {
tenant_service_handler(
r,
handle_tenant_timeline_safekeeper_migrate,
RequestName("v1_tenant_timeline_safekeeper_migrate"),
)
},
)
// LSN lease passthrough to all shards
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",