feat(pageserver): add force patch index_part API (#11119)

## Problem

Part of the disaster recovery tooling. Partly addresses
https://github.com/neondatabase/neon/issues/9114.

## Summary of changes

* Add a new pageserver API to force-patch fields in `index_part` and update the
corresponding in-memory timeline structures.
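
For illustration, invoking the new endpoint could look like this (the management API address and the tenant/timeline IDs below are placeholders, not values from this PR):

```python
import requests

# Placeholders: substitute your pageserver's management API address and real IDs.
PAGESERVER_API = "http://localhost:9898"  # assumed default management port
TENANT_SHARD_ID = "ef6e34b3329f5b424a4f4d1e65b91e1c"  # made-up tenant shard ID
TIMELINE_ID = "5f1f5c7e8e3c4a1b9d2e0f3a4b5c6d7e"  # made-up timeline ID

res = requests.post(
    f"{PAGESERVER_API}/v1/tenant/{TENANT_SHARD_ID}/timeline/{TIMELINE_ID}/patch_index_part",
    json={"rel_size_migration": "migrating"},
)
res.raise_for_status()
```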

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z.
Date: 2025-03-07 12:42:52 -05:00
Committed by: GitHub
Parent: e876794ce5
Commit: cd438406fb
5 changed files with 129 additions and 2 deletions

@@ -1146,6 +1146,15 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }
 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+pub struct TimelinePatchIndexPartRequest {
+    pub rel_size_migration: Option<RelSizeMigration>,
+    pub gc_compaction_last_completed_lsn: Option<Lsn>,
+    pub applied_gc_cutoff_lsn: Option<Lsn>,
+    #[serde(default)]
+    pub force_index_update: bool,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelinesInfoAndOffloaded {
     pub timelines: Vec<TimelineInfo>,

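Every field of `TimelinePatchIndexPartRequest` is optional: absent fields leave the corresponding state untouched, and `force_index_update` defaults to `false` via `#[serde(default)]`. A request body exercising every field might look like the sketch below (the LSN values are made up; LSNs in this API are serialized as `hi/lo` hex strings):

```python
# Hypothetical payload covering every field of TimelinePatchIndexPartRequest.
payload = {
    "rel_size_migration": "migrating",  # reldir state: "legacy", "migrating", or "migrated"
    "gc_compaction_last_completed_lsn": "0/16B5A50",  # made-up LSN
    "applied_gc_cutoff_lsn": "0/16B5A50",  # made-up LSN
    "force_index_update": True,  # also schedule an index_part upload
}
```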
@@ -37,7 +37,8 @@ use pageserver_api::models::{
     TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
     TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
     TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};

@@ -63,6 +64,7 @@ use crate::tenant::mgr::{
     GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
     download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -858,6 +860,75 @@ async fn timeline_archival_config_handler(
     json_response(StatusCode::OK, ())
 }
 
+/// This API patches the index part of a timeline. The caller must ensure that the patch is safe
+/// to apply. Use it as an emergency measure only.
+///
+/// Some examples of safe patches:
+/// - Increase `gc_cutoff` and `gc_compaction_cutoff` to a larger value if a bug failed to bump
+///   the cutoffs and caused read errors.
+/// - Force-set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force-set the index part from v2 back to v1 (legacy). The code path would then ignore
+///   anything written to the new keyspace, causing errors.
+/// - Decrease `gc_cutoff` without validating that the data still exists. This causes read
+///   errors in the background.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    check_permission(&request, None)?; // require global permission for this request
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            {
+                let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+                guard.store_and_unlock(applied_gc_cutoff_lsn);
+            }
+        }
+
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -3629,6 +3700,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
             |r| api_handler(r, lsn_lease_handler),

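Tying the handler to its doc comment, here is a sketch of the "safe" emergency use: bump the GC cutoffs forward and force an index upload so the patched `index_part` reaches remote storage. `client` is assumed to be a `PageserverHttpClient` (the fixture method added below), and the LSN is a placeholder:

```python
# Per the handler's doc comment, increasing the cutoffs is the safe direction;
# decreasing them without validating that the data exists can cause read errors.
NEW_CUTOFF = "0/2000000"  # placeholder LSN

client.timeline_patch_index_part(
    tenant_id,
    timeline_id,
    {
        "applied_gc_cutoff_lsn": NEW_CUTOFF,
        "gc_compaction_last_completed_lsn": NEW_CUTOFF,
        # Schedule an index_part upload so the patch is persisted to remote storage.
        "force_index_update": True,
    },
)
```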
@@ -954,6 +954,14 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background (internal function)
     fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
         let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();

@@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
         self.verbose_error(res)
 
+    def timeline_patch_index_part(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        data: dict[str, Any],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part",
+            json=data,
+        )
+        self.verbose_error(res)
+        return res.json()
+
     def tenant_location_conf(
         self,
         tenant_id: TenantId | TenantShardId,

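A hedged end-to-end sketch of the new fixture method, assuming (not verified in this PR) that a graceful pageserver shutdown flushes the index upload scheduled by `force_index_update`:

```python
from fixtures.neon_fixtures import NeonEnv


def check_patch_survives_restart(env: NeonEnv):
    # Illustration only: patch the reldir state, force an index_part upload,
    # then restart the pageserver and confirm the patched value was persisted.
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    with env.pageserver.http_client() as client:
        client.timeline_patch_index_part(
            tenant_id,
            timeline_id,
            {"rel_size_migration": "migrating", "force_index_update": True},
        )
    env.pageserver.restart()
    with env.pageserver.http_client() as client:
        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
```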
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 
 
 def check_client(env: NeonEnv, client: PageserverHttpClient):

@@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
         check_client(env, client)
+
+
+@run_only_on_default_postgres("it does not use any postgres functionality")
+def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    with env.pageserver.http_client() as client:
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "migrating"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
+
+        # This is invalid in practice: we should never roll back the migrating state
+        # to legacy, but we do it here to exercise the API.
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "legacy"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"