storage_controller: reconcile completed imports at start-up (#11614)

## Problem

In https://github.com/neondatabase/neon/pull/11345 coordination of
imports moved to the storage controller.
It involves notifying cplane when the import has been completed by
calling an idempotent endpoint.

If the storage controller shuts down in the middle of finalizing an
import, the finalization would never be retried.

## Summary of changes

Reconcile imports at start-up by fetching the completed imports from the
database and spawning a background task which notifies cplane.

Closes: https://github.com/neondatabase/neon/issues/11570
This commit is contained in:
Vlad Lazar
2025-04-24 19:39:19 +01:00
committed by GitHub
parent 6f7e3c18e4
commit 5ba7315c84
5 changed files with 169 additions and 19 deletions

View File

@@ -133,6 +133,7 @@ pub(crate) enum DatabaseOperation {
InsertTimelineImport,
UpdateTimelineImport,
DeleteTimelineImport,
ListTimelineImports,
}
#[must_use]
@@ -1640,6 +1641,35 @@ impl Persistence {
.await
}
/// Returns every persisted timeline import that reports itself as complete.
///
/// All rows of the `timeline_imports` table are loaded and converted with
/// [`TimelineImport::from_persistent`]; only afterwards are the incomplete
/// ones dropped.
///
/// # Errors
///
/// Propagates any database error from the query. If any single row fails to
/// convert, the whole call fails with [`DatabaseError::Logical`].
pub(crate) async fn list_complete_timeline_imports(
    &self,
) -> DatabaseResult<Vec<TimelineImport>> {
    use crate::schema::timeline_imports::dsl;

    let rows = self
        .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| {
            Box::pin(async move {
                let loaded: Vec<TimelineImportPersistence> =
                    dsl::timeline_imports.load(conn).await?;
                Ok(loaded)
            })
        })
        .await?;

    // Convert every row before filtering so that a malformed row is reported
    // regardless of whether it would have been filtered out.
    let deserialized: Result<Vec<TimelineImport>, _> = rows
        .into_iter()
        .map(TimelineImport::from_persistent)
        .collect();
    let mut complete = deserialized
        .map_err(|err| DatabaseError::Logical(format!("failed to deserialize import: {err}")))?;

    complete.retain(|import| import.is_complete());
    Ok(complete)
}
pub(crate) async fn delete_timeline_import(
&self,
tenant_id: TenantId,

View File

@@ -878,6 +878,22 @@ impl Service {
});
}
// Fetch the list of completed imports and attempt to finalize them in the background.
// This handles the case where the previous storage controller instance shut down
// whilst finalizing imports.
let complete_imports = self.persistence.list_complete_timeline_imports().await;
match complete_imports {
Ok(ok) => {
tokio::task::spawn({
let finalize_imports_self = self.clone();
async move { finalize_imports_self.finalize_timeline_imports(ok).await }
});
}
Err(err) => {
tracing::error!("Could not retrieve completed imports from database: {err}");
}
}
tracing::info!(
"Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"
);
@@ -3869,13 +3885,10 @@ impl Service {
self: &Arc<Self>,
import: TimelineImport,
) -> anyhow::Result<()> {
// TODO(vlad): On start-up, load up the imports and notify cplane of the
// ones that have been completed. This assumes the new cplane API will
// be idempotent. If that's not possible, bang a flag in the database.
// https://github.com/neondatabase/neon/issues/11570
tracing::info!("Finalizing timeline import");
pausable_failpoint!("timeline-import-pre-cplane-notification");
let import_failed = import.completion_error().is_some();
if !import_failed {
@@ -3926,6 +3939,15 @@ impl Service {
Ok(())
}
async fn finalize_timeline_imports(self: &Arc<Self>, imports: Vec<TimelineImport>) {
futures::future::join_all(
imports
.into_iter()
.map(|import| self.finalize_timeline_import(import)),
)
.await;
}
async fn timeline_active_on_all_shards(
self: &Arc<Self>,
import: &TimelineImport,

View File

@@ -103,7 +103,7 @@ impl TimelineImport {
let crnt = occ.get_mut();
if *crnt == status {
Ok(TimelineImportUpdateFollowUp::None)
} else if crnt.is_terminal() && !status.is_terminal() {
} else if crnt.is_terminal() && *crnt != status {
Err(TimelineImportUpdateError::UnexpectedUpdate)
} else {
*crnt = status;