feat(pageserver): add force patch index_part API (#11119)

## Problem

Part of the disaster recovery tooling. Partly addresses
https://github.com/neondatabase/neon/issues/9114.

## Summary of changes

* Add a new pageserver API to force-patch fields in `index_part` and update the
corresponding in-memory timeline structures.
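
For illustration, invoking the new endpoint could look like this (the management API address and the tenant/timeline IDs below are placeholders, not values from this PR):

```python
import requests

# Placeholders: substitute your pageserver's management API address and real IDs.
PAGESERVER_API = "http://localhost:9898"  # assumed default management port
TENANT_SHARD_ID = "ef6e34b3329f5b424a4f4d1e65b91e1c"  # made-up tenant shard ID
TIMELINE_ID = "5f1f5c7e8e3c4a1b9d2e0f3a4b5c6d7e"  # made-up timeline ID

res = requests.post(
    f"{PAGESERVER_API}/v1/tenant/{TENANT_SHARD_ID}/timeline/{TIMELINE_ID}/patch_index_part",
    json={"rel_size_migration": "migrating"},
)
res.raise_for_status()
```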

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z.
Date: 2025-03-07 12:42:52 -05:00
Committed by: GitHub
Parent: e876794ce5
Commit: cd438406fb
5 changed files with 129 additions and 2 deletions

@@ -1146,6 +1146,15 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }
 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+pub struct TimelinePatchIndexPartRequest {
+    pub rel_size_migration: Option<RelSizeMigration>,
+    pub gc_compaction_last_completed_lsn: Option<Lsn>,
+    pub applied_gc_cutoff_lsn: Option<Lsn>,
+    #[serde(default)]
+    pub force_index_update: bool,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelinesInfoAndOffloaded {
     pub timelines: Vec<TimelineInfo>,

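Every field of `TimelinePatchIndexPartRequest` is optional: absent fields leave the corresponding state untouched, and `force_index_update` defaults to `false` via `#[serde(default)]`. A request body exercising every field might look like the sketch below (the LSN values are made up; LSNs in this API are serialized as `hi/lo` hex strings):

```python
# Hypothetical payload covering every field of TimelinePatchIndexPartRequest.
payload = {
    "rel_size_migration": "migrating",  # reldir state: "legacy", "migrating", or "migrated"
    "gc_compaction_last_completed_lsn": "0/16B5A50",  # made-up LSN
    "applied_gc_cutoff_lsn": "0/16B5A50",  # made-up LSN
    "force_index_update": True,  # also schedule an index_part upload
}
```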
@@ -37,7 +37,8 @@ use pageserver_api::models::{
     TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
     TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
     TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};

@@ -63,6 +64,7 @@ use crate::tenant::mgr::{
     GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
     download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -858,6 +860,75 @@ async fn timeline_archival_config_handler(
     json_response(StatusCode::OK, ())
 }
 
+/// This API patches the index part of a timeline. The caller must ensure that the patch is safe
+/// to apply. Use it as an emergency measure only.
+///
+/// Some examples of safe patches:
+/// - Increase `gc_cutoff` and `gc_compaction_cutoff` to a larger value if a bug failed to bump
+///   the cutoffs and caused read errors.
+/// - Force-set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force-set the index part from v2 back to v1 (legacy). The code path would then ignore
+///   anything written to the new keyspace, causing errors.
+/// - Decrease `gc_cutoff` without validating that the data still exists. This causes read
+///   errors in the background.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    check_permission(&request, None)?; // require global permission for this request
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            {
+                let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+                guard.store_and_unlock(applied_gc_cutoff_lsn);
+            }
+        }
+
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -3629,6 +3700,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
             |r| api_handler(r, lsn_lease_handler),

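Tying the handler to its doc comment, here is a sketch of the "safe" emergency use: bump the GC cutoffs forward and force an index upload so the patched `index_part` reaches remote storage. `client` is assumed to be a `PageserverHttpClient` (the fixture method added below), and the LSN is a placeholder:

```python
# Per the handler's doc comment, increasing the cutoffs is the safe direction;
# decreasing them without validating that the data exists can cause read errors.
NEW_CUTOFF = "0/2000000"  # placeholder LSN

client.timeline_patch_index_part(
    tenant_id,
    timeline_id,
    {
        "applied_gc_cutoff_lsn": NEW_CUTOFF,
        "gc_compaction_last_completed_lsn": NEW_CUTOFF,
        # Schedule an index_part upload so the patch is persisted to remote storage.
        "force_index_update": True,
    },
)
```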
@@ -954,6 +954,14 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background (internal function)
     fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
         let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();

@@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
         self.verbose_error(res)
 
+    def timeline_patch_index_part(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        data: dict[str, Any],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part",
+            json=data,
+        )
+        self.verbose_error(res)
+        return res.json()
+
     def tenant_location_conf(
         self,
         tenant_id: TenantId | TenantShardId,

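A hedged end-to-end sketch of the new fixture method, assuming (not verified in this PR) that a graceful pageserver shutdown flushes the index upload scheduled by `force_index_update`:

```python
from fixtures.neon_fixtures import NeonEnv


def check_patch_survives_restart(env: NeonEnv):
    # Illustration only: patch the reldir state, force an index_part upload,
    # then restart the pageserver and confirm the patched value was persisted.
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    with env.pageserver.http_client() as client:
        client.timeline_patch_index_part(
            tenant_id,
            timeline_id,
            {"rel_size_migration": "migrating", "force_index_update": True},
        )
    env.pageserver.restart()
    with env.pageserver.http_client() as client:
        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
```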
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 
 
 def check_client(env: NeonEnv, client: PageserverHttpClient):

@@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
         check_client(env, client)
+
+
+@run_only_on_default_postgres("it does not use any postgres functionality")
+def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    with env.pageserver.http_client() as client:
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "migrating"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
+
+        # This is invalid in practice: we should never roll back the migrating state
+        # to legacy, but we do it here to exercise the API.
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "legacy"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"