diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index fabfe28aa2..749a8acc4e 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1146,6 +1146,15 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }

+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+pub struct TimelinePatchIndexPartRequest {
+    pub rel_size_migration: Option<RelSizeMigration>,
+    pub gc_compaction_last_completed_lsn: Option<Lsn>,
+    pub applied_gc_cutoff_lsn: Option<Lsn>,
+    #[serde(default)]
+    pub force_index_update: bool,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelinesInfoAndOffloaded {
     pub timelines: Vec<TimelineInfo>,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index cd79aa6680..3c0c23a56d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -37,7 +37,8 @@ use pageserver_api::models::{
     TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
     TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
     TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -63,6 +64,7 @@ use crate::tenant::mgr::{
     GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
     download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -858,6 +860,75 @@ async fn timeline_archival_config_handler(
     json_response(StatusCode::OK, ())
 }

+/// This API is used to patch the index part of a timeline. You must ensure that such patches are safe to apply. Use this API as an
+/// emergency measure only.
+///
+/// Some examples of safe patches:
+/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case a bug failed to bump the cutoff, causing read errors.
+/// - Force set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace,
+///   causing errors.
+/// - Decrease the gc_cutoff without validating that the data actually exists. This will cause read errors in the background.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    check_permission(&request, None)?; // require global permission for this request
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            {
+                let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+                guard.store_and_unlock(applied_gc_cutoff_lsn);
+            }
+        }
+
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+            tenant_id = %tenant_shard_id.tenant_id,
+            shard_id = %tenant_shard_id.shard_slug(),
+            %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -3629,6 +3700,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
             |r| api_handler(r, lsn_lease_handler),
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index a784a05972..891760b499 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -954,6 +954,14 @@ impl RemoteTimelineClient {
         Ok(())
     }

+    /// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background (internal function)
     fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
         let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 364aff325d..0efe0b9575 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
         self.verbose_error(res)

+    def timeline_patch_index_part(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        data: dict[str, Any],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part",
+            json=data,
+        )
+        self.verbose_error(res)
+        return res.json()
+
     def tenant_location_conf(
         self,
         tenant_id: TenantId | TenantShardId,
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 55fd7a8608..17ffeca23b 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until


 def check_client(env: NeonEnv, client: PageserverHttpClient):
@@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde

     with env.pageserver.http_client(auth_token=pageserver_token) as client:
         check_client(env, client)
+
+
+@run_only_on_default_postgres("it does not use any postgres functionality")
+def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    with env.pageserver.http_client() as client:
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "migrating"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
+        # This is invalid in practice: we should never roll back the migrating state to legacy.
+        # But we do it here to test the API.
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "legacy"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
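
For reference, the new endpoint can also be driven outside the test fixtures. Below is a minimal sketch using plain `requests`, mirroring the URL shape added to `PageserverHttpClient`; the pageserver address, tenant/timeline IDs, and LSN value are illustrative assumptions, not values taken from this diff:

```python
# Minimal sketch: calling patch_index_part directly as an emergency measure.
# PAGESERVER_API, the IDs, and the LSN below are assumed placeholder values.
import requests

PAGESERVER_API = "http://localhost:9898"   # assumed pageserver HTTP address
TENANT_SHARD_ID = "<tenant_shard_id>"      # fill in the real tenant shard id
TIMELINE_ID = "<timeline_id>"              # fill in the real timeline id

res = requests.post(
    f"{PAGESERVER_API}/v1/tenant/{TENANT_SHARD_ID}/timeline/{TIMELINE_ID}/patch_index_part",
    json={
        # A "safe patch" per the handler's doc comment: raise the applied GC
        # cutoff (assumed LSN shown) and force an index upload so the change
        # is persisted to remote storage immediately.
        "applied_gc_cutoff_lsn": "0/169AD58",
        "force_index_update": True,
    },
)
res.raise_for_status()
```

Note that the handler applies each field independently, so a request may patch any subset of `rel_size_migration`, `gc_compaction_last_completed_lsn`, and `applied_gc_cutoff_lsn`; `force_index_update` defaults to false and must be set explicitly to schedule the upload.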