mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
storcon: signal LSN wait to pageserver during live migration (#10452)
## Problem

We've seen the ingest connection manager get stuck shortly after a migration.

## Summary of changes

A speculative mitigation is to use the same mechanism as get page requests for kicking LSN ingest. The connection manager monitors LSN waits and queries the broker if no updates are received for the timeline.

Closes https://github.com/neondatabase/neon/issues/10351
This commit is contained in:
@@ -2,8 +2,9 @@ use pageserver_api::{
|
||||
models::{
|
||||
detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
|
||||
PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest,
|
||||
TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest,
|
||||
TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest,
|
||||
TopTenantShardsResponse,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -299,4 +300,17 @@ impl PageserverClient {
|
||||
self.inner.top_tenant_shards(request).await
|
||||
)
|
||||
}
|
||||
|
||||
/// Sends `request` (a `TenantWaitLsnRequest`) for `tenant_shard_id` through the wrapped
/// client (`self.inner`) and returns the `Result<StatusCode>` it produces.
///
/// The call is made inside `measured_request!`, tagged with the name "wait_lsn",
/// `crate::metrics::Method::Post` and this client's `node_id_label`.
/// NOTE(review): presumably the macro records per-request metrics around the call;
/// confirm against the macro definition, which is not visible here.
pub(crate) async fn wait_lsn(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
request: TenantWaitLsnRequest,
|
||||
) -> Result<StatusCode> {
|
||||
measured_request!(
|
||||
"wait_lsn",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner.wait_lsn(tenant_shard_id, request).await
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::persistence::Persistence;
|
||||
use crate::{compute_hook, service};
|
||||
use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy};
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -348,6 +348,32 @@ impl Reconciler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn wait_lsn(
|
||||
&self,
|
||||
node: &Node,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timelines: HashMap<TimelineId, Lsn>,
|
||||
) -> Result<StatusCode, ReconcileError> {
|
||||
const TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.service_config.jwt_token.as_deref(),
|
||||
);
|
||||
|
||||
client
|
||||
.wait_lsn(
|
||||
tenant_shard_id,
|
||||
TenantWaitLsnRequest {
|
||||
timelines,
|
||||
timeout: TIMEOUT,
|
||||
},
|
||||
)
|
||||
.await
|
||||
.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
async fn get_lsns(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -461,6 +487,39 @@ impl Reconciler {
|
||||
node: &Node,
|
||||
baseline: HashMap<TimelineId, Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Signal to the pageserver that it should ingest up to the baseline LSNs.
|
||||
loop {
|
||||
match self.wait_lsn(node, tenant_shard_id, baseline.clone()).await {
|
||||
Ok(StatusCode::OK) => {
|
||||
// Everything is caught up
|
||||
return Ok(());
|
||||
}
|
||||
Ok(StatusCode::ACCEPTED) => {
|
||||
// Some timelines are not caught up yet.
|
||||
// They'll be polled below.
|
||||
break;
|
||||
}
|
||||
Ok(StatusCode::NOT_FOUND) => {
|
||||
// None of the timelines are present on the pageserver.
|
||||
// This is correct if they've all been deleted, but
|
||||
// let the polling loop below cross-check.
|
||||
break;
|
||||
}
|
||||
Ok(status_code) => {
|
||||
tracing::warn!(
|
||||
"Unexpected status code ({status_code}) returned by wait_lsn endpoint"
|
||||
);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::info!("🕑 Can't trigger LSN wait on {node} yet, waiting ({e})",);
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Poll the LSNs until they catch up
|
||||
loop {
|
||||
let latest = match self.get_lsns(tenant_shard_id, node).await {
|
||||
Ok(l) => l,
|
||||
|
||||
Reference in New Issue
Block a user