test_runner: check conflicting attachments in test_location_conf_churn

storcon: remove finished safekeeper reconciliations from in-memory hashmap (#11876 )
## Problem Currently there is a memory leak, in that finished safekeeper reconciliations leave a cancellation token behind which is never cleaned up. ## Summary of changes The change adds cleanup after finishing of a reconciliation. In order to ensure we remove the correct cancellation token, and we haven't raced with another reconciliation, we introduce a `TokenId` counter to tell tokens apart. Part of https://github.com/neondatabase/neon/issues/11670
2026-05-30 19:40:39 +00:00 · 2025-05-09 17:45:06 +02:00 · 2025-05-09 13:34:22 +00:00 · 2025-05-09 12:07:52 +00:00 · 2025-05-09 10:55:26 +00:00 · 2025-05-09 10:12:49 +00:00
16 changed files with 291 additions and 124 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1303,6 +1303,7 @@ dependencies = [
 "futures",
 "http 1.1.0",
 "indexmap 2.0.1",
+ "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
 "nix 0.27.1",
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -28,6 +28,7 @@ flate2.workspace = true
 futures.workspace = true
 http.workspace = true
 indexmap.workspace = true
+itertools.workspace = true
 jsonwebtoken.workspace = true
 metrics.workspace = true
 nix.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,6 +11,7 @@ use compute_api::spec::{
 use futures::StreamExt;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
+use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
@@ -18,7 +19,7 @@ use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::net::SocketAddr;
 use std::os::unix::fs::{PermissionsExt, symlink};
 use std::path::Path;
@@ -1995,23 +1996,40 @@ LIMIT 100",
        tokio::spawn(conn);

        // TODO: support other types of grants apart from schemas?
-        let query = format!(
-            "GRANT {} ON SCHEMA {} TO {}",
-            privileges
-                .iter()
-                // should not be quoted as it's part of the command.
-                // is already sanitized so it's ok
-                .map(|p| p.as_str())
-                .collect::<Vec<&'static str>>()
-                .join(", "),
-            // quote the schema and role name as identifiers to sanitize them.
-            schema_name.pg_quote(),
-            role_name.pg_quote(),
-        );
-        db_client
-            .simple_query(&query)
+
+        // check the role grants first - to gracefully handle read-replicas.
+        let select = "SELECT privilege_type
+            FROM pg_namespace
+                JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
+                JOIN pg_user users ON acl.grantee = users.usesysid
+            WHERE users.usename = $1
+                AND nspname = $2";
+        let rows = db_client
+            .query(select, &[role_name, schema_name])
            .await
-            .with_context(|| format!("Failed to execute query: {}", query))?;
+            .with_context(|| format!("Failed to execute query: {select}"))?;
+
+        let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
+
+        let grants = privileges
+            .iter()
+            .filter(|p| !already_granted.contains(p.as_str()))
+            // should not be quoted as it's part of the command.
+            // is already sanitized so it's ok
+            .map(|p| p.as_str())
+            .join(", ");
+
+        if !grants.is_empty() {
+            // quote the schema and role name as identifiers to sanitize them.
+            let schema_name = schema_name.pg_quote();
+            let role_name = role_name.pg_quote();
+
+            let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
+            db_client
+                .simple_query(&query)
+                .await
+                .with_context(|| format!("Failed to execute query: {}", query))?;
+        }

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1277,6 +1277,8 @@ impl Timeline {
            return Ok(CompactionOutcome::YieldForL0);
        }

+        let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
+
        // 2. Repartition and create image layers if necessary
        match self
            .repartition(
@@ -1287,7 +1289,7 @@ impl Timeline {
            )
            .await
        {
-            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+            Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                let image_ctx = RequestContextBuilder::from(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -1341,6 +1343,10 @@ impl Timeline {
                }
            }

+            Ok(_) => {
+                info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
+            }
+
            // Suppress errors when cancelled.
            Err(_) if self.cancel.is_cancelled() => {}
            Err(err) if err.is_cancel() => {}
@@ -3606,6 +3612,13 @@ impl Timeline {
                    last_key = Some(key);
                }
                accumulated_values.push((key, lsn, val));
+
+                if accumulated_values.len() >= 65536 {
+                    // Assume all of them are images, that would be 512MB of data in memory for a single key.
+                    return Err(CompactionError::Other(anyhow!(
+                        "too many values for a single key, giving up gc-compaction"
+                    )));
+                }
            } else {
                let last_key: &mut Key = last_key.as_mut().unwrap();
                stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -425,15 +425,12 @@ compact_prefetch_buffers(void)
 * point inside and outside PostgreSQL.
 *
 * This still does throw errors when it receives malformed responses from PS.
- *
- * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
- * IsHandlingInterrupts) we also report we've ended prefetch receive work,
- * just in case state tracking was lost due to an error in the sync getPage
- * response code.
 */
 void
-communicator_prefetch_pump_state(bool IsHandlingInterrupts)
+communicator_prefetch_pump_state(void)
 {
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive != MyPState->ring_flush)
 	{
 		NeonResponse   *response;
@@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts)
 		}
 	}

-	/* We never pump the prefetch state while handling other pages */
-	if (!IsHandlingInterrupts)
-		END_PREFETCH_RECEIVE_WORK();
+	END_PREFETCH_RECEIVE_WORK();

 	communicator_reconfigure_timeout_if_needed();
 }
@@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index)

 	Assert(MyPState->ring_unused > ring_index);

+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive <= ring_index)
 	{
-		START_PREFETCH_RECEIVE_WORK();
 		entry = GetPrfSlot(MyPState->ring_receive);

 		Assert(entry->status == PRFS_REQUESTED);
@@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index)
 			result = false;
 			break;
 		}
-
-		END_PREFETCH_RECEIVE_WORK();
 		CHECK_FOR_INTERRUPTS();
 	}
+
 	if (result)
 	{
 		/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
 		PrefetchRequest *slot = GetPrfSlot(ring_index);
-		return slot->status == PRFS_RECEIVED;
+		result = slot->status == PRFS_RECEIVED;
 	}
-	return false;
+	END_PREFETCH_RECEIVE_WORK();
+
+	return result;
 ;
 }

@@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);
+	Assert(readpage_reentrant_guard);

 	if (slot->status != PRFS_REQUESTED ||
 		slot->response != NULL ||
@@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
 	PrfHashEntry *entry;
 	PrefetchRequest hashkey;

+	Assert(readpage_reentrant_guard);
 	hashkey.buftag = tag;
 	entry = prfh_lookup(MyPState->prf_hash, &hashkey);
 	if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
@@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag)
 void
 prefetch_on_ps_disconnect(void)
 {
+	bool save_readpage_reentrant_guard = readpage_reentrant_guard;
 	MyPState->ring_flush = MyPState->ring_unused;

+	/* Prohibit callig of prefetch_pump_state */
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void)
 		MyNeonCounters->getpage_prefetch_discards_total += 1;
 	}

+	/* Restore guard */
+	readpage_reentrant_guard = save_readpage_reentrant_guard;
+
 	/*
 	 * We can have gone into retry due to network error, so update stats with
 	 * the latest available
@@ -2509,7 +2515,7 @@ communicator_processinterrupts(void)
 	if (timeout_signaled)
 	{
 		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
-			communicator_prefetch_pump_state(true);
+			communicator_prefetch_pump_state();

 		timeout_signaled = false;
 		communicator_reconfigure_timeout_if_needed();
--- a/pgxn/neon/communicator.h
+++ b/pgxn/neon/communicator.h
@@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
 										  void *buffer);

 extern void communicator_reconfigure_timeout_if_needed(void);
-extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+extern void communicator_prefetch_pump_state(void);


 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	return false;
 }
@@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

 	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	return false;
 }
@@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);

@@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
@@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo

 	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

 	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3886,10 +3886,10 @@ impl Service {

            None
        } else if safekeepers {
-            // Note that we do not support creating the timeline on the safekeepers
-            // for imported timelines. The `start_lsn` of the timeline is not known
-            // until the import finshes.
-            // https://github.com/neondatabase/neon/issues/11569
+            // Note that for imported timelines, we do not create the timeline on the safekeepers
+            // straight away. Instead, we do it once the import finalized such that we know what
+            // start LSN to provide for the safekeepers. This is done in
+            // [`Self::finalize_timeline_import`].
            let res = self
                .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
                .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id))
@@ -3966,11 +3966,22 @@ impl Service {
                let active = self.timeline_active_on_all_shards(&import).await?;

                match active {
-                    true => {
+                    Some(timeline_info) => {
                        tracing::info!("Timeline became active on all shards");
+
+                        if self.config.timelines_onto_safekeepers {
+                            // Now that we know the start LSN of this timeline, create it on the
+                            // safekeepers.
+                            self.tenant_timeline_create_safekeepers_until_success(
+                                import.tenant_id,
+                                timeline_info,
+                            )
+                            .await?;
+                        }
+
                        break;
                    }
-                    false => {
+                    None => {
                        tracing::info!("Timeline not active on all shards yet");

                        tokio::select! {
@@ -4004,9 +4015,6 @@ impl Service {
            .range_mut(TenantShardId::tenant_range(import.tenant_id))
            .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);

-        // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn,
-        // so we can't create the timeline on the safekeepers. Fix by moving creation here.
-        // https://github.com/neondatabase/neon/issues/11569
        tracing::info!(%import_failed, "Timeline import complete");

        Ok(())
@@ -4021,10 +4029,16 @@ impl Service {
        .await;
    }

+    /// If the timeline is active on all shards, returns the [`TimelineInfo`]
+    /// collected from shard 0.
+    ///
+    /// An error is returned if the shard layout has changed during the import.
+    /// This is guarded against within the storage controller and the pageserver,
+    /// and, therefore, unexpected.
    async fn timeline_active_on_all_shards(
        self: &Arc<Self>,
        import: &TimelineImport,
-    ) -> anyhow::Result<bool> {
+    ) -> anyhow::Result<Option<TimelineInfo>> {
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -4048,13 +4062,17 @@ impl Service {
                        .expect("Pageservers may not be deleted while referenced");
                    targets.push((*tenant_shard_id, node.clone()));
                } else {
-                    return Ok(false);
+                    return Ok(None);
                }
            }

            targets
        };

+        if targets.is_empty() {
+            anyhow::bail!("No shards found to finalize import for");
+        }
+
        let results = self
            .tenant_for_shards_api(
                targets,
@@ -4070,10 +4088,17 @@ impl Service {
            )
            .await;

-        Ok(results.into_iter().all(|res| match res {
+        let all_active = results.iter().all(|res| match res {
            Ok(info) => info.state == TimelineState::Active,
            Err(_) => false,
-        }))
+        });
+
+        if all_active {
+            // Both unwraps are validated above
+            Ok(Some(results.into_iter().next().unwrap().unwrap()))
+        } else {
+            Ok(None)
+        }
    }

    pub(crate) async fn tenant_timeline_archival_config(
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -1,4 +1,9 @@
-use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap,
+    str::FromStr,
+    sync::{Arc, atomic::AtomicU64},
+    time::Duration,
+};

 use clashmap::{ClashMap, Entry};
 use safekeeper_api::models::PullTimelineRequest;
@@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest {
    pub(crate) kind: SafekeeperTimelineOpKind,
 }

+/// A way to keep ongoing/queued reconcile requests apart
+#[derive(Copy, Clone, PartialEq, Eq)]
+struct TokenId(u64);
+
+type OngoingTokens = ClashMap<(TenantId, Option<TimelineId>), (CancellationToken, TokenId)>;
+
 /// Handle to per safekeeper reconciler.
 struct ReconcilerHandle {
-    tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
-    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
+    tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>,
+    ongoing_tokens: Arc<OngoingTokens>,
+    token_id_counter: AtomicU64,
    cancel: CancellationToken,
 }

@@ -185,24 +197,28 @@ impl ReconcilerHandle {
        &self,
        tenant_id: TenantId,
        timeline_id: Option<TimelineId>,
-    ) -> CancellationToken {
+    ) -> (CancellationToken, TokenId) {
+        let token_id = self
+            .token_id_counter
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let token_id = TokenId(token_id);
        let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
        if let Entry::Occupied(entry) = &entry {
-            let cancel: &CancellationToken = entry.get();
+            let (cancel, _) = entry.get();
            cancel.cancel();
        }
-        entry.insert(self.cancel.child_token()).clone()
+        entry.insert((self.cancel.child_token(), token_id)).clone()
    }
    /// Cancel an ongoing reconciliation
    fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
-        if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
+        if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
            cancel.cancel();
        }
    }
    fn schedule_reconcile(&self, req: ScheduleRequest) {
-        let cancel = self.new_token_slot(req.tenant_id, req.timeline_id);
+        let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
        let hostname = req.safekeeper.skp.host.clone();
-        if let Err(err) = self.tx.send((req, cancel)) {
+        if let Err(err) = self.tx.send((req, cancel, token_id)) {
            tracing::info!("scheduling request onto {hostname} returned error: {err}");
        }
    }
@@ -211,13 +227,14 @@ impl ReconcilerHandle {
 pub(crate) struct SafekeeperReconciler {
    inner: SafekeeperReconcilerInner,
    concurrency_limiter: Arc<Semaphore>,
-    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
+    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>,
    cancel: CancellationToken,
 }

 /// Thin wrapper over `Service` to not clutter its inherent functions
 #[derive(Clone)]
 struct SafekeeperReconcilerInner {
+    ongoing_tokens: Arc<OngoingTokens>,
    service: Arc<Service>,
 }

@@ -226,15 +243,20 @@ impl SafekeeperReconciler {
        // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
        let (tx, rx) = mpsc::unbounded_channel();
        let concurrency = service.config.safekeeper_reconciler_concurrency;
+        let ongoing_tokens = Arc::new(ClashMap::new());
        let mut reconciler = SafekeeperReconciler {
-            inner: SafekeeperReconcilerInner { service },
+            inner: SafekeeperReconcilerInner {
+                service,
+                ongoing_tokens: ongoing_tokens.clone(),
+            },
            rx,
            concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
            cancel: cancel.clone(),
        };
        let handle = ReconcilerHandle {
            tx,
-            ongoing_tokens: Arc::new(ClashMap::new()),
+            ongoing_tokens,
+            token_id_counter: AtomicU64::new(0),
            cancel,
        };
        tokio::spawn(async move { reconciler.run().await });
@@ -246,7 +268,9 @@ impl SafekeeperReconciler {
                req = self.rx.recv() => req,
                _ = self.cancel.cancelled() => break,
            };
-            let Some((req, req_cancel)) = req else { break };
+            let Some((req, req_cancel, req_token_id)) = req else {
+                break;
+            };

            let permit_res = tokio::select! {
                req = self.concurrency_limiter.clone().acquire_owned() => req,
@@ -265,7 +289,7 @@ impl SafekeeperReconciler {
                let timeline_id = req.timeline_id;
                let node_id = req.safekeeper.skp.id;
                inner
-                    .reconcile_one(req, req_cancel)
+                    .reconcile_one(req, req_cancel, req_token_id)
                    .instrument(tracing::info_span!(
                        "reconcile_one",
                        ?kind,
@@ -280,8 +304,14 @@ impl SafekeeperReconciler {
 }

 impl SafekeeperReconcilerInner {
-    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
+    async fn reconcile_one(
+        &self,
+        req: ScheduleRequest,
+        req_cancel: CancellationToken,
+        req_token_id: TokenId,
+    ) {
        let req_host = req.safekeeper.skp.host.clone();
+        let success;
        match req.kind {
            SafekeeperTimelineOpKind::Pull => {
                let Some(timeline_id) = req.timeline_id else {
@@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner {
                    tenant_id: req.tenant_id,
                    timeline_id,
                };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.pull_timeline(&pull_req).await,
-                    |resp| {
-                        if let Some(host) = resp.safekeeper_host {
-                            tracing::info!("pulled timeline from {host} onto {req_host}");
-                        } else {
-                            tracing::info!("timeline already present on safekeeper on {req_host}");
-                        }
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.pull_timeline(&pull_req).await,
+                        |resp| {
+                            if let Some(host) = resp.safekeeper_host {
+                                tracing::info!("pulled timeline from {host} onto {req_host}");
+                            } else {
+                                tracing::info!(
+                                    "timeline already present on safekeeper on {req_host}"
+                                );
+                            }
+                        },
+                        req_cancel,
+                    )
+                    .await;
            }
            SafekeeperTimelineOpKind::Exclude => {
                // TODO actually exclude instead of delete here
@@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner {
                    );
                    return;
                };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.delete_timeline(tenant_id, timeline_id).await,
-                    |_resp| {
-                        tracing::info!("deleted timeline from {req_host}");
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.delete_timeline(tenant_id, timeline_id).await,
+                        |_resp| {
+                            tracing::info!("deleted timeline from {req_host}");
+                        },
+                        req_cancel,
+                    )
+                    .await;
            }
            SafekeeperTimelineOpKind::Delete => {
                let tenant_id = req.tenant_id;
                if let Some(timeline_id) = req.timeline_id {
-                    let deleted = self
+                    success = self
                        .reconcile_inner(
-                            req,
+                            &req,
                            async |client| client.delete_timeline(tenant_id, timeline_id).await,
                            |_resp| {
                                tracing::info!("deleted timeline from {req_host}");
@@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner {
                            req_cancel,
                        )
                        .await;
-                    if deleted {
+                    if success {
                        self.delete_timeline_from_db(tenant_id, timeline_id).await;
                    }
                } else {
-                    let deleted = self
+                    success = self
                        .reconcile_inner(
-                            req,
+                            &req,
                            async |client| client.delete_tenant(tenant_id).await,
                            |_resp| {
                                tracing::info!(%tenant_id, "deleted tenant from {req_host}");
@@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner {
                            req_cancel,
                        )
                        .await;
-                    if deleted {
+                    if success {
                        self.delete_tenant_timelines_from_db(tenant_id).await;
                    }
                }
            }
        }
+        if success {
+            self.ongoing_tokens.remove_if(
+                &(req.tenant_id, req.timeline_id),
+                |_ttid, (_cancel, token_id)| {
+                    // Ensure that this request is indeed the request we just finished and not a new one
+                    req_token_id == *token_id
+                },
+            );
+        }
    }
    async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
        match self
@@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner {
            self.delete_timeline_from_db(tenant_id, timeline_id).await;
        }
    }
-    /// Returns whether the reconciliation happened successfully
+    /// Returns whether the reconciliation happened successfully (or we got cancelled)
    async fn reconcile_inner<T, F, U>(
        &self,
-        req: ScheduleRequest,
+        req: &ScheduleRequest,
        closure: impl Fn(SafekeeperClient) -> F,
        log_success: impl FnOnce(T) -> U,
        req_cancel: CancellationToken,
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -323,6 +323,42 @@ impl Service {
        })
    }

+    pub(crate) async fn tenant_timeline_create_safekeepers_until_success(
+        self: &Arc<Self>,
+        tenant_id: TenantId,
+        timeline_info: TimelineInfo,
+    ) -> anyhow::Result<()> {
+        const BACKOFF: Duration = Duration::from_secs(5);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                anyhow::bail!("Shut down requested while finalizing import");
+            }
+
+            let res = self
+                .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
+                .await;
+
+            match res {
+                Ok(_) => {
+                    tracing::info!("Timeline created on safekeepers");
+                    break;
+                }
+                Err(err) => {
+                    tracing::error!("Failed to create timeline on safekeepers: {err}");
+                    tokio::select! {
+                        _ = self.cancel.cancelled() => {
+                            anyhow::bail!("Shut down requested while finalizing import");
+                        },
+                        _ = tokio::time::sleep(BACKOFF) => {}
+                    };
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    /// Directly insert the timeline into the database without reconciling it with safekeepers.
    ///
    /// Useful if the timeline already exists on the specified safekeepers,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4613,7 +4613,10 @@ class EndpointFactory:
        return self

    def new_replica(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
    ):
        branch_name = origin.branch_name
        assert origin in self.endpoints
@@ -4629,7 +4632,10 @@ class EndpointFactory:
        )

    def new_replica_start(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
    ):
        branch_name = origin.branch_name
        assert origin in self.endpoints
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -24,6 +24,7 @@ from fixtures.utils import (
    skip_in_debug_build,
    wait_until,
 )
+from fixtures.workload import Workload
 from mypy_boto3_kms import KMSClient
 from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
 from mypy_boto3_s3 import S3Client
@@ -97,6 +98,10 @@ def test_pgdata_import_smoke(
        f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
    )

+    if neon_env_builder.storage_controller_config is None:
+        neon_env_builder.storage_controller_config = {}
+    neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True
+
    env = neon_env_builder.init_start()

    # The test needs LocalFs support, which is only built in testing mode.
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke(
    #
    # validate that we can write
    #
-    rw_endpoint = env.endpoints.create_start(
-        branch_name=import_branch_name,
-        endpoint_id="rw",
-        tenant_id=tenant_id,
-        config_lines=ep_config,
-    )
-    rw_endpoint.safe_psql("create table othertable(values text)")
-    rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
+    workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name)
+    workload.init()
+    workload.write_rows(64)
+    workload.validate()

-    # TODO: consider using `class Workload` here
-    # to do compaction and whatnot?
+    rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()"))

    #
    # validate that we can branch (important use case)
    #

    # ... at the tip
-    _ = env.create_branch(
+    child_timeline_id = env.create_branch(
        new_branch_name="br-tip",
        ancestor_branch_name=import_branch_name,
        tenant_id=tenant_id,
        ancestor_start_lsn=rw_lsn,
    )
-    br_tip_endpoint = env.endpoints.create_start(
-        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
-    )
-    validate_vanilla_equivalence(br_tip_endpoint)
-    br_tip_endpoint.safe_psql("select * from othertable")
+    child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
+    child_workload.validate()
+
+    validate_vanilla_equivalence(child_workload.endpoint())

    # ... at the initdb lsn
    _ = env.create_branch(
@@ -330,7 +329,7 @@ def test_pgdata_import_smoke(
    )
    validate_vanilla_equivalence(br_initdb_endpoint)
    with pytest.raises(psycopg2.errors.UndefinedTable):
-        br_initdb_endpoint.safe_psql("select * from othertable")
+        br_initdb_endpoint.safe_psql(f"select * from {workload.table}")


@run_only_on_default_postgres(reason="PG version is irrelevant here")
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -248,8 +248,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
            last_generation = max(
                [s[1] for s in last_state.values() if s[1] is not None], default=None
            )
+            is_last_generation = last_generation == generation
+            
+            # It's also only valid to connect if there are no conflicting attachments (i.e. no
+            # Pageserver in AttachedSingle with other attachments in same generation).
+            num_attached_single = len(
+                [s for s in last_state.values() if s[0] == "AttachedSingle" and s[1] == last_generation]
+            )
+            num_attached = len(
+                [s for s in last_state.values() if s[0].startswith("Attached") and s[1] == last_generation]
+            )
+            valid_attachment = (num_attached_single == 1 and num_attached == 1) \
+                or num_attached_single == 0

-            if mode.startswith("Attached") and generation == last_generation:
+            if mode.startswith("Attached") and is_last_generation and valid_attachment:
                # This is a basic test: we are validating that he endpoint works properly _between_
                # configuration changes.  A stronger test would be to validate that clients see
                # no errors while we are making the changes.
--- a/test_runner/regress/test_role_grants.py
+++ b/test_runner/regress/test_role_grants.py
@@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv):
        res = cur.fetchall()

        assert res == [(1,)], "select should not succeed"
+
+    # confirm that replicas can also ensure the grants are correctly set.
+    replica = env.endpoints.new_replica_start(endpoint)
+    replica_client = replica.http_client()
+    replica_client.set_role_grants(
+        "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
+    )
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -5,7 +5,7 @@
  ],
  "v16": [
    "16.8",
-    "05ddf212e2e07b788b5c8b88bdcf98630941f6ae"
+    "d72d76f2cdee4194dd052ce099e9784aca7c794a"
  ],
  "v15": [
    "15.12",
Author	SHA1	Message	Date
Erik Grinaker	fc9a358e1c	test_runner: check conflicting attachments in `test_location_conf_churn`	2025-05-09 17:45:06 +02:00
Arpad Müller	33abfc2b74	storcon: remove finished safekeeper reconciliations from in-memory hashmap (#11876 ) ## Problem Currently there is a memory leak, in that finished safekeeper reconciliations leave a cancellation token behind which is never cleaned up. ## Summary of changes The change adds cleanup after finishing of a reconciliation. In order to ensure we remove the correct cancellation token, and we haven't raced with another reconciliation, we introduce a `TokenId` counter to tell tokens apart. Part of https://github.com/neondatabase/neon/issues/11670	2025-05-09 13:34:22 +00:00
Alex Chi Z.	93b964f829	fix(pageserver): do not do image compaction if it's below gc cutoff (#11872 ) ## Problem We observe image compaction errors after gc-compaction finishes compacting below the gc_cutoff. This is because `repartition` returns an LSN below the gc horizon as we (likely) determined that `distance <= self.repartition_threshold`. I think it's better to keep the current behavior of when to trigger compaction but we should skip image compaction if the returned LSN is below the gc horizon. ## Summary of changes If the repartition returns an invalid LSN, skip image compaction. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-05-09 12:07:52 +00:00
Vlad Lazar	d0aaec2abb	storage_controller: create imported timelines on safekeepers (#11801 ) ## Problem SK timeline creations were skipped for imported timelines since we didn't know the correct start LSN of the timeline at that point. ## Summary of changes Created imported timelines on the SK as part of the import finalize step. We use the last record LSN of shard 0 as the start LSN for the safekeeper timeline. Closes https://github.com/neondatabase/neon/issues/11569	2025-05-09 10:55:26 +00:00
Alex Chi Z.	d0dc65da12	fix(pageserver): give up gc-compaction if one key has too long history (#11869 ) ## Problem The limitation we imposed last week https://github.com/neondatabase/neon/pull/11709 is not enough to protect excessive memory usage. ## Summary of changes If a single key accumulated too much history, give up compaction. In the future, we can make the `generate_key_retention` function take a stream of keys instead of first accumulating them in memory, thus easily support such long key history cases. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-05-09 10:12:49 +00:00
Konstantin Knizhnik	03d635b916	Add more guards for prefetch_pump_state (#11859 ) ## Problem See https://neondb.slack.com/archives/C08PJ07BZ44/p1746566292750689 Looks like there are more cases when `prefetch_pump_state` can be called in unexpected place and cause core dump. ## Summary of changes Add more guards. --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-09 09:07:08 +00:00
Conrad Ludgate	5cd7f936f9	fix(neon-rls): optimistically assume role grants are already assigned for replicas (#11811 ) ## Problem Read replicas cannot grant permissions for roles for Neon RLS. Usually the permission is already granted, so we can optimistically check. See INC-509 ## Summary of changes Perform a permission lookup prior to actually executing any grants.	2025-05-09 07:48:30 +00:00