Compare commits

...

7 Commits

Author SHA1 Message Date
Erik Grinaker
fc9a358e1c test_runner: check conflicting attachments in test_location_conf_churn 2025-05-09 17:45:06 +02:00
Arpad Müller
33abfc2b74 storcon: remove finished safekeeper reconciliations from in-memory hashmap (#11876)
## Problem

Currently there is a memory leak, in that finished safekeeper
reconciliations leave a cancellation token behind which is never cleaned
up.

## Summary of changes

The change adds cleanup after finishing of a reconciliation. In order to
ensure we remove the correct cancellation token, and we haven't raced
with another reconciliation, we introduce a `TokenId` counter to tell
tokens apart.

Part of https://github.com/neondatabase/neon/issues/11670
2025-05-09 13:34:22 +00:00
Alex Chi Z.
93b964f829 fix(pageserver): do not do image compaction if it's below gc cutoff (#11872)
## Problem

We observe image compaction errors after gc-compaction finishes
compacting below the gc_cutoff. This is because `repartition` returns an
LSN below the gc horizon as we (likely) determined that `distance <=
self.repartition_threshold`.

I think it's better to keep the current behavior of when to trigger
compaction but we should skip image compaction if the returned LSN is
below the gc horizon.

## Summary of changes

If the repartition returns an invalid LSN, skip image compaction.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-05-09 12:07:52 +00:00
Vlad Lazar
d0aaec2abb storage_controller: create imported timelines on safekeepers (#11801)
## Problem

SK timeline creations were skipped for imported timelines since we
didn't know the correct start LSN
of the timeline at that point.

## Summary of changes

Created imported timelines on the SK as part of the import finalize
step.
We use the last record LSN of shard 0 as the start LSN for the
safekeeper timeline.

Closes https://github.com/neondatabase/neon/issues/11569
2025-05-09 10:55:26 +00:00
Alex Chi Z.
d0dc65da12 fix(pageserver): give up gc-compaction if one key has too long history (#11869)
## Problem

The limitation we imposed last week
https://github.com/neondatabase/neon/pull/11709 is not enough to protect
excessive memory usage.

## Summary of changes

If a single key accumulated too much history, give up compaction. In the
future, we can make the `generate_key_retention` function take a stream
of keys instead of first accumulating them in memory, thus easily
support such long key history cases.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-05-09 10:12:49 +00:00
Konstantin Knizhnik
03d635b916 Add more guards for prefetch_pump_state (#11859)
## Problem

See https://neondb.slack.com/archives/C08PJ07BZ44/p1746566292750689

Looks like there are more cases when `prefetch_pump_state` can be called
in unexpected place and cause core dump.

## Summary of changes

Add more guards.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-05-09 09:07:08 +00:00
Conrad Ludgate
5cd7f936f9 fix(neon-rls): optimistically assume role grants are already assigned for replicas (#11811)
## Problem

Read replicas cannot grant permissions for roles for Neon RLS. Usually
the permission is already granted, so we can optimistically check. See
INC-509

## Summary of changes

Perform a permission lookup prior to actually executing any grants.
2025-05-09 07:48:30 +00:00
16 changed files with 291 additions and 124 deletions

1
Cargo.lock generated
View File

@@ -1303,6 +1303,7 @@ dependencies = [
"futures",
"http 1.1.0",
"indexmap 2.0.1",
"itertools 0.10.5",
"jsonwebtoken",
"metrics",
"nix 0.27.1",

View File

@@ -28,6 +28,7 @@ flate2.workspace = true
futures.workspace = true
http.workspace = true
indexmap.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true

View File

@@ -11,6 +11,7 @@ use compute_api::spec::{
use futures::StreamExt;
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use itertools::Itertools;
use nix::sys::signal::{Signal, kill};
use nix::unistd::Pid;
use once_cell::sync::Lazy;
@@ -18,7 +19,7 @@ use postgres;
use postgres::NoTls;
use postgres::error::SqlState;
use remote_storage::{DownloadError, RemotePath};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::net::SocketAddr;
use std::os::unix::fs::{PermissionsExt, symlink};
use std::path::Path;
@@ -1995,23 +1996,40 @@ LIMIT 100",
tokio::spawn(conn);
// TODO: support other types of grants apart from schemas?
let query = format!(
"GRANT {} ON SCHEMA {} TO {}",
privileges
.iter()
// should not be quoted as it's part of the command.
// is already sanitized so it's ok
.map(|p| p.as_str())
.collect::<Vec<&'static str>>()
.join(", "),
// quote the schema and role name as identifiers to sanitize them.
schema_name.pg_quote(),
role_name.pg_quote(),
);
db_client
.simple_query(&query)
// check the role grants first - to gracefully handle read-replicas.
let select = "SELECT privilege_type
FROM pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
JOIN pg_user users ON acl.grantee = users.usesysid
WHERE users.usename = $1
AND nspname = $2";
let rows = db_client
.query(select, &[role_name, schema_name])
.await
.with_context(|| format!("Failed to execute query: {}", query))?;
.with_context(|| format!("Failed to execute query: {select}"))?;
let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
let grants = privileges
.iter()
.filter(|p| !already_granted.contains(p.as_str()))
// should not be quoted as it's part of the command.
// is already sanitized so it's ok
.map(|p| p.as_str())
.join(", ");
if !grants.is_empty() {
// quote the schema and role name as identifiers to sanitize them.
let schema_name = schema_name.pg_quote();
let role_name = role_name.pg_quote();
let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
db_client
.simple_query(&query)
.await
.with_context(|| format!("Failed to execute query: {}", query))?;
}
Ok(())
}

View File

@@ -1277,6 +1277,8 @@ impl Timeline {
return Ok(CompactionOutcome::YieldForL0);
}
let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
// 2. Repartition and create image layers if necessary
match self
.repartition(
@@ -1287,7 +1289,7 @@ impl Timeline {
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::from(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
@@ -1341,6 +1343,10 @@ impl Timeline {
}
}
Ok(_) => {
info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
}
// Suppress errors when cancelled.
Err(_) if self.cancel.is_cancelled() => {}
Err(err) if err.is_cancel() => {}
@@ -3606,6 +3612,13 @@ impl Timeline {
last_key = Some(key);
}
accumulated_values.push((key, lsn, val));
if accumulated_values.len() >= 65536 {
// Assume all of them are images, that would be 512MB of data in memory for a single key.
return Err(CompactionError::Other(anyhow!(
"too many values for a single key, giving up gc-compaction"
)));
}
} else {
let last_key: &mut Key = last_key.as_mut().unwrap();
stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction

View File

@@ -425,15 +425,12 @@ compact_prefetch_buffers(void)
* point inside and outside PostgreSQL.
*
* This still does throw errors when it receives malformed responses from PS.
*
* When we're not called from CHECK_FOR_INTERRUPTS (indicated by
* IsHandlingInterrupts) we also report we've ended prefetch receive work,
* just in case state tracking was lost due to an error in the sync getPage
* response code.
*/
void
communicator_prefetch_pump_state(bool IsHandlingInterrupts)
communicator_prefetch_pump_state(void)
{
START_PREFETCH_RECEIVE_WORK();
while (MyPState->ring_receive != MyPState->ring_flush)
{
NeonResponse *response;
@@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts)
}
}
/* We never pump the prefetch state while handling other pages */
if (!IsHandlingInterrupts)
END_PREFETCH_RECEIVE_WORK();
END_PREFETCH_RECEIVE_WORK();
communicator_reconfigure_timeout_if_needed();
}
@@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index)
Assert(MyPState->ring_unused > ring_index);
START_PREFETCH_RECEIVE_WORK();
while (MyPState->ring_receive <= ring_index)
{
START_PREFETCH_RECEIVE_WORK();
entry = GetPrfSlot(MyPState->ring_receive);
Assert(entry->status == PRFS_REQUESTED);
@@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index)
result = false;
break;
}
END_PREFETCH_RECEIVE_WORK();
CHECK_FOR_INTERRUPTS();
}
if (result)
{
/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
PrefetchRequest *slot = GetPrfSlot(ring_index);
return slot->status == PRFS_RECEIVED;
result = slot->status == PRFS_RECEIVED;
}
return false;
END_PREFETCH_RECEIVE_WORK();
return result;
;
}
@@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot)
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_receive);
Assert(readpage_reentrant_guard);
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
@@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
PrfHashEntry *entry;
PrefetchRequest hashkey;
Assert(readpage_reentrant_guard);
hashkey.buftag = tag;
entry = prfh_lookup(MyPState->prf_hash, &hashkey);
if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
@@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag)
void
prefetch_on_ps_disconnect(void)
{
bool save_readpage_reentrant_guard = readpage_reentrant_guard;
MyPState->ring_flush = MyPState->ring_unused;
/* Prohibit callig of prefetch_pump_state */
START_PREFETCH_RECEIVE_WORK();
while (MyPState->ring_receive < MyPState->ring_unused)
{
PrefetchRequest *slot;
@@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void)
MyNeonCounters->getpage_prefetch_discards_total += 1;
}
/* Restore guard */
readpage_reentrant_guard = save_readpage_reentrant_guard;
/*
* We can have gone into retry due to network error, so update stats with
* the latest available
@@ -2509,7 +2515,7 @@ communicator_processinterrupts(void)
if (timeout_signaled)
{
if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
communicator_prefetch_pump_state(true);
communicator_prefetch_pump_state();
timeout_signaled = false;
communicator_reconfigure_timeout_if_needed();

View File

@@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
void *buffer);
extern void communicator_reconfigure_timeout_if_needed(void);
extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
extern void communicator_prefetch_pump_state(void);
#endif

View File

@@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
return false;
}
@@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
return false;
}
@@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
}
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
@@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
@@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))

View File

@@ -3886,10 +3886,10 @@ impl Service {
None
} else if safekeepers {
// Note that we do not support creating the timeline on the safekeepers
// for imported timelines. The `start_lsn` of the timeline is not known
// until the import finshes.
// https://github.com/neondatabase/neon/issues/11569
// Note that for imported timelines, we do not create the timeline on the safekeepers
// straight away. Instead, we do it once the import finalized such that we know what
// start LSN to provide for the safekeepers. This is done in
// [`Self::finalize_timeline_import`].
let res = self
.tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
.instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id))
@@ -3966,11 +3966,22 @@ impl Service {
let active = self.timeline_active_on_all_shards(&import).await?;
match active {
true => {
Some(timeline_info) => {
tracing::info!("Timeline became active on all shards");
if self.config.timelines_onto_safekeepers {
// Now that we know the start LSN of this timeline, create it on the
// safekeepers.
self.tenant_timeline_create_safekeepers_until_success(
import.tenant_id,
timeline_info,
)
.await?;
}
break;
}
false => {
None => {
tracing::info!("Timeline not active on all shards yet");
tokio::select! {
@@ -4004,9 +4015,6 @@ impl Service {
.range_mut(TenantShardId::tenant_range(import.tenant_id))
.for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);
// TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn,
// so we can't create the timeline on the safekeepers. Fix by moving creation here.
// https://github.com/neondatabase/neon/issues/11569
tracing::info!(%import_failed, "Timeline import complete");
Ok(())
@@ -4021,10 +4029,16 @@ impl Service {
.await;
}
/// If the timeline is active on all shards, returns the [`TimelineInfo`]
/// collected from shard 0.
///
/// An error is returned if the shard layout has changed during the import.
/// This is guarded against within the storage controller and the pageserver,
/// and, therefore, unexpected.
async fn timeline_active_on_all_shards(
self: &Arc<Self>,
import: &TimelineImport,
) -> anyhow::Result<bool> {
) -> anyhow::Result<Option<TimelineInfo>> {
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
@@ -4048,13 +4062,17 @@ impl Service {
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
} else {
return Ok(false);
return Ok(None);
}
}
targets
};
if targets.is_empty() {
anyhow::bail!("No shards found to finalize import for");
}
let results = self
.tenant_for_shards_api(
targets,
@@ -4070,10 +4088,17 @@ impl Service {
)
.await;
Ok(results.into_iter().all(|res| match res {
let all_active = results.iter().all(|res| match res {
Ok(info) => info.state == TimelineState::Active,
Err(_) => false,
}))
});
if all_active {
// Both unwraps are validated above
Ok(Some(results.into_iter().next().unwrap().unwrap()))
} else {
Ok(None)
}
}
pub(crate) async fn tenant_timeline_archival_config(

View File

@@ -1,4 +1,9 @@
use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
use std::{
collections::HashMap,
str::FromStr,
sync::{Arc, atomic::AtomicU64},
time::Duration,
};
use clashmap::{ClashMap, Entry};
use safekeeper_api::models::PullTimelineRequest;
@@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest {
pub(crate) kind: SafekeeperTimelineOpKind,
}
/// A way to keep ongoing/queued reconcile requests apart
#[derive(Copy, Clone, PartialEq, Eq)]
struct TokenId(u64);
type OngoingTokens = ClashMap<(TenantId, Option<TimelineId>), (CancellationToken, TokenId)>;
/// Handle to per safekeeper reconciler.
struct ReconcilerHandle {
tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>,
ongoing_tokens: Arc<OngoingTokens>,
token_id_counter: AtomicU64,
cancel: CancellationToken,
}
@@ -185,24 +197,28 @@ impl ReconcilerHandle {
&self,
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
) -> CancellationToken {
) -> (CancellationToken, TokenId) {
let token_id = self
.token_id_counter
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let token_id = TokenId(token_id);
let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
if let Entry::Occupied(entry) = &entry {
let cancel: &CancellationToken = entry.get();
let (cancel, _) = entry.get();
cancel.cancel();
}
entry.insert(self.cancel.child_token()).clone()
entry.insert((self.cancel.child_token(), token_id)).clone()
}
/// Cancel an ongoing reconciliation
fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
cancel.cancel();
}
}
fn schedule_reconcile(&self, req: ScheduleRequest) {
let cancel = self.new_token_slot(req.tenant_id, req.timeline_id);
let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
let hostname = req.safekeeper.skp.host.clone();
if let Err(err) = self.tx.send((req, cancel)) {
if let Err(err) = self.tx.send((req, cancel, token_id)) {
tracing::info!("scheduling request onto {hostname} returned error: {err}");
}
}
@@ -211,13 +227,14 @@ impl ReconcilerHandle {
pub(crate) struct SafekeeperReconciler {
inner: SafekeeperReconcilerInner,
concurrency_limiter: Arc<Semaphore>,
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>,
cancel: CancellationToken,
}
/// Thin wrapper over `Service` to not clutter its inherent functions
#[derive(Clone)]
struct SafekeeperReconcilerInner {
ongoing_tokens: Arc<OngoingTokens>,
service: Arc<Service>,
}
@@ -226,15 +243,20 @@ impl SafekeeperReconciler {
// We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
let (tx, rx) = mpsc::unbounded_channel();
let concurrency = service.config.safekeeper_reconciler_concurrency;
let ongoing_tokens = Arc::new(ClashMap::new());
let mut reconciler = SafekeeperReconciler {
inner: SafekeeperReconcilerInner { service },
inner: SafekeeperReconcilerInner {
service,
ongoing_tokens: ongoing_tokens.clone(),
},
rx,
concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
cancel: cancel.clone(),
};
let handle = ReconcilerHandle {
tx,
ongoing_tokens: Arc::new(ClashMap::new()),
ongoing_tokens,
token_id_counter: AtomicU64::new(0),
cancel,
};
tokio::spawn(async move { reconciler.run().await });
@@ -246,7 +268,9 @@ impl SafekeeperReconciler {
req = self.rx.recv() => req,
_ = self.cancel.cancelled() => break,
};
let Some((req, req_cancel)) = req else { break };
let Some((req, req_cancel, req_token_id)) = req else {
break;
};
let permit_res = tokio::select! {
req = self.concurrency_limiter.clone().acquire_owned() => req,
@@ -265,7 +289,7 @@ impl SafekeeperReconciler {
let timeline_id = req.timeline_id;
let node_id = req.safekeeper.skp.id;
inner
.reconcile_one(req, req_cancel)
.reconcile_one(req, req_cancel, req_token_id)
.instrument(tracing::info_span!(
"reconcile_one",
?kind,
@@ -280,8 +304,14 @@ impl SafekeeperReconciler {
}
impl SafekeeperReconcilerInner {
async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
async fn reconcile_one(
&self,
req: ScheduleRequest,
req_cancel: CancellationToken,
req_token_id: TokenId,
) {
let req_host = req.safekeeper.skp.host.clone();
let success;
match req.kind {
SafekeeperTimelineOpKind::Pull => {
let Some(timeline_id) = req.timeline_id else {
@@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner {
tenant_id: req.tenant_id,
timeline_id,
};
self.reconcile_inner(
req,
async |client| client.pull_timeline(&pull_req).await,
|resp| {
if let Some(host) = resp.safekeeper_host {
tracing::info!("pulled timeline from {host} onto {req_host}");
} else {
tracing::info!("timeline already present on safekeeper on {req_host}");
}
},
req_cancel,
)
.await;
success = self
.reconcile_inner(
&req,
async |client| client.pull_timeline(&pull_req).await,
|resp| {
if let Some(host) = resp.safekeeper_host {
tracing::info!("pulled timeline from {host} onto {req_host}");
} else {
tracing::info!(
"timeline already present on safekeeper on {req_host}"
);
}
},
req_cancel,
)
.await;
}
SafekeeperTimelineOpKind::Exclude => {
// TODO actually exclude instead of delete here
@@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner {
);
return;
};
self.reconcile_inner(
req,
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|_resp| {
tracing::info!("deleted timeline from {req_host}");
},
req_cancel,
)
.await;
success = self
.reconcile_inner(
&req,
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|_resp| {
tracing::info!("deleted timeline from {req_host}");
},
req_cancel,
)
.await;
}
SafekeeperTimelineOpKind::Delete => {
let tenant_id = req.tenant_id;
if let Some(timeline_id) = req.timeline_id {
let deleted = self
success = self
.reconcile_inner(
req,
&req,
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|_resp| {
tracing::info!("deleted timeline from {req_host}");
@@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner {
req_cancel,
)
.await;
if deleted {
if success {
self.delete_timeline_from_db(tenant_id, timeline_id).await;
}
} else {
let deleted = self
success = self
.reconcile_inner(
req,
&req,
async |client| client.delete_tenant(tenant_id).await,
|_resp| {
tracing::info!(%tenant_id, "deleted tenant from {req_host}");
@@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner {
req_cancel,
)
.await;
if deleted {
if success {
self.delete_tenant_timelines_from_db(tenant_id).await;
}
}
}
}
if success {
self.ongoing_tokens.remove_if(
&(req.tenant_id, req.timeline_id),
|_ttid, (_cancel, token_id)| {
// Ensure that this request is indeed the request we just finished and not a new one
req_token_id == *token_id
},
);
}
}
async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
match self
@@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner {
self.delete_timeline_from_db(tenant_id, timeline_id).await;
}
}
/// Returns whether the reconciliation happened successfully
/// Returns whether the reconciliation happened successfully (or we got cancelled)
async fn reconcile_inner<T, F, U>(
&self,
req: ScheduleRequest,
req: &ScheduleRequest,
closure: impl Fn(SafekeeperClient) -> F,
log_success: impl FnOnce(T) -> U,
req_cancel: CancellationToken,

View File

@@ -323,6 +323,42 @@ impl Service {
})
}
pub(crate) async fn tenant_timeline_create_safekeepers_until_success(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_info: TimelineInfo,
) -> anyhow::Result<()> {
const BACKOFF: Duration = Duration::from_secs(5);
loop {
if self.cancel.is_cancelled() {
anyhow::bail!("Shut down requested while finalizing import");
}
let res = self
.tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
.await;
match res {
Ok(_) => {
tracing::info!("Timeline created on safekeepers");
break;
}
Err(err) => {
tracing::error!("Failed to create timeline on safekeepers: {err}");
tokio::select! {
_ = self.cancel.cancelled() => {
anyhow::bail!("Shut down requested while finalizing import");
},
_ = tokio::time::sleep(BACKOFF) => {}
};
}
}
}
Ok(())
}
/// Directly insert the timeline into the database without reconciling it with safekeepers.
///
/// Useful if the timeline already exists on the specified safekeepers,

View File

@@ -4613,7 +4613,10 @@ class EndpointFactory:
return self
def new_replica(
self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
self,
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
branch_name = origin.branch_name
assert origin in self.endpoints
@@ -4629,7 +4632,10 @@ class EndpointFactory:
)
def new_replica_start(
self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
self,
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
branch_name = origin.branch_name
assert origin in self.endpoints

View File

@@ -24,6 +24,7 @@ from fixtures.utils import (
skip_in_debug_build,
wait_until,
)
from fixtures.workload import Workload
from mypy_boto3_kms import KMSClient
from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
from mypy_boto3_s3 import S3Client
@@ -97,6 +98,10 @@ def test_pgdata_import_smoke(
f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
)
if neon_env_builder.storage_controller_config is None:
neon_env_builder.storage_controller_config = {}
neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True
env = neon_env_builder.init_start()
# The test needs LocalFs support, which is only built in testing mode.
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke(
#
# validate that we can write
#
rw_endpoint = env.endpoints.create_start(
branch_name=import_branch_name,
endpoint_id="rw",
tenant_id=tenant_id,
config_lines=ep_config,
)
rw_endpoint.safe_psql("create table othertable(values text)")
rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name)
workload.init()
workload.write_rows(64)
workload.validate()
# TODO: consider using `class Workload` here
# to do compaction and whatnot?
rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()"))
#
# validate that we can branch (important use case)
#
# ... at the tip
_ = env.create_branch(
child_timeline_id = env.create_branch(
new_branch_name="br-tip",
ancestor_branch_name=import_branch_name,
tenant_id=tenant_id,
ancestor_start_lsn=rw_lsn,
)
br_tip_endpoint = env.endpoints.create_start(
branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
)
validate_vanilla_equivalence(br_tip_endpoint)
br_tip_endpoint.safe_psql("select * from othertable")
child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
child_workload.validate()
validate_vanilla_equivalence(child_workload.endpoint())
# ... at the initdb lsn
_ = env.create_branch(
@@ -330,7 +329,7 @@ def test_pgdata_import_smoke(
)
validate_vanilla_equivalence(br_initdb_endpoint)
with pytest.raises(psycopg2.errors.UndefinedTable):
br_initdb_endpoint.safe_psql("select * from othertable")
br_initdb_endpoint.safe_psql(f"select * from {workload.table}")
@run_only_on_default_postgres(reason="PG version is irrelevant here")

View File

@@ -248,8 +248,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
last_generation = max(
[s[1] for s in last_state.values() if s[1] is not None], default=None
)
is_last_generation = last_generation == generation
# It's also only valid to connect if there are no conflicting attachments (i.e. no
# Pageserver in AttachedSingle with other attachments in same generation).
num_attached_single = len(
[s for s in last_state.values() if s[0] == "AttachedSingle" and s[1] == last_generation]
)
num_attached = len(
[s for s in last_state.values() if s[0].startswith("Attached") and s[1] == last_generation]
)
valid_attachment = (num_attached_single == 1 and num_attached == 1) \
or num_attached_single == 0
if mode.startswith("Attached") and generation == last_generation:
if mode.startswith("Attached") and is_last_generation and valid_attachment:
# This is a basic test: we are validating that he endpoint works properly _between_
# configuration changes. A stronger test would be to validate that clients see
# no errors while we are making the changes.

View File

@@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv):
res = cur.fetchall()
assert res == [(1,)], "select should not succeed"
# confirm that replicas can also ensure the grants are correctly set.
replica = env.endpoints.new_replica_start(endpoint)
replica_client = replica.http_client()
replica_client.set_role_grants(
"test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
)

View File

@@ -5,7 +5,7 @@
],
"v16": [
"16.8",
"05ddf212e2e07b788b5c8b88bdcf98630941f6ae"
"d72d76f2cdee4194dd052ce099e9784aca7c794a"
],
"v15": [
"15.12",