mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 19:40:39 +00:00
Compare commits
7 Commits
tristan957
...
erik/locat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc9a358e1c | ||
|
|
33abfc2b74 | ||
|
|
93b964f829 | ||
|
|
d0aaec2abb | ||
|
|
d0dc65da12 | ||
|
|
03d635b916 | ||
|
|
5cd7f936f9 |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1303,6 +1303,7 @@ dependencies = [
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
|
||||
@@ -28,6 +28,7 @@ flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
indexmap.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
|
||||
@@ -11,6 +11,7 @@ use compute_api::spec::{
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -18,7 +19,7 @@ use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::SocketAddr;
|
||||
use std::os::unix::fs::{PermissionsExt, symlink};
|
||||
use std::path::Path;
|
||||
@@ -1995,23 +1996,40 @@ LIMIT 100",
|
||||
tokio::spawn(conn);
|
||||
|
||||
// TODO: support other types of grants apart from schemas?
|
||||
let query = format!(
|
||||
"GRANT {} ON SCHEMA {} TO {}",
|
||||
privileges
|
||||
.iter()
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.collect::<Vec<&'static str>>()
|
||||
.join(", "),
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
schema_name.pg_quote(),
|
||||
role_name.pg_quote(),
|
||||
);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
|
||||
// check the role grants first - to gracefully handle read-replicas.
|
||||
let select = "SELECT privilege_type
|
||||
FROM pg_namespace
|
||||
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
|
||||
JOIN pg_user users ON acl.grantee = users.usesysid
|
||||
WHERE users.usename = $1
|
||||
AND nspname = $2";
|
||||
let rows = db_client
|
||||
.query(select, &[role_name, schema_name])
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
.with_context(|| format!("Failed to execute query: {select}"))?;
|
||||
|
||||
let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
|
||||
|
||||
let grants = privileges
|
||||
.iter()
|
||||
.filter(|p| !already_granted.contains(p.as_str()))
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.join(", ");
|
||||
|
||||
if !grants.is_empty() {
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
let schema_name = schema_name.pg_quote();
|
||||
let role_name = role_name.pg_quote();
|
||||
|
||||
let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1277,6 +1277,8 @@ impl Timeline {
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
|
||||
let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
|
||||
|
||||
// 2. Repartition and create image layers if necessary
|
||||
match self
|
||||
.repartition(
|
||||
@@ -1287,7 +1289,7 @@ impl Timeline {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::from(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
@@ -1341,6 +1343,10 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(_) => {
|
||||
info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
|
||||
}
|
||||
|
||||
// Suppress errors when cancelled.
|
||||
Err(_) if self.cancel.is_cancelled() => {}
|
||||
Err(err) if err.is_cancel() => {}
|
||||
@@ -3606,6 +3612,13 @@ impl Timeline {
|
||||
last_key = Some(key);
|
||||
}
|
||||
accumulated_values.push((key, lsn, val));
|
||||
|
||||
if accumulated_values.len() >= 65536 {
|
||||
// Assume all of them are images, that would be 512MB of data in memory for a single key.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"too many values for a single key, giving up gc-compaction"
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
let last_key: &mut Key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
|
||||
|
||||
@@ -425,15 +425,12 @@ compact_prefetch_buffers(void)
|
||||
* point inside and outside PostgreSQL.
|
||||
*
|
||||
* This still does throw errors when it receives malformed responses from PS.
|
||||
*
|
||||
* When we're not called from CHECK_FOR_INTERRUPTS (indicated by
|
||||
* IsHandlingInterrupts) we also report we've ended prefetch receive work,
|
||||
* just in case state tracking was lost due to an error in the sync getPage
|
||||
* response code.
|
||||
*/
|
||||
void
|
||||
communicator_prefetch_pump_state(bool IsHandlingInterrupts)
|
||||
communicator_prefetch_pump_state(void)
|
||||
{
|
||||
START_PREFETCH_RECEIVE_WORK();
|
||||
|
||||
while (MyPState->ring_receive != MyPState->ring_flush)
|
||||
{
|
||||
NeonResponse *response;
|
||||
@@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts)
|
||||
}
|
||||
}
|
||||
|
||||
/* We never pump the prefetch state while handling other pages */
|
||||
if (!IsHandlingInterrupts)
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
|
||||
communicator_reconfigure_timeout_if_needed();
|
||||
}
|
||||
@@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index)
|
||||
|
||||
Assert(MyPState->ring_unused > ring_index);
|
||||
|
||||
START_PREFETCH_RECEIVE_WORK();
|
||||
|
||||
while (MyPState->ring_receive <= ring_index)
|
||||
{
|
||||
START_PREFETCH_RECEIVE_WORK();
|
||||
entry = GetPrfSlot(MyPState->ring_receive);
|
||||
|
||||
Assert(entry->status == PRFS_REQUESTED);
|
||||
@@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index)
|
||||
result = false;
|
||||
break;
|
||||
}
|
||||
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
if (result)
|
||||
{
|
||||
/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
|
||||
PrefetchRequest *slot = GetPrfSlot(ring_index);
|
||||
return slot->status == PRFS_RECEIVED;
|
||||
result = slot->status == PRFS_RECEIVED;
|
||||
}
|
||||
return false;
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
|
||||
return result;
|
||||
;
|
||||
}
|
||||
|
||||
@@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->status == PRFS_REQUESTED);
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
Assert(readpage_reentrant_guard);
|
||||
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
@@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
|
||||
PrfHashEntry *entry;
|
||||
PrefetchRequest hashkey;
|
||||
|
||||
Assert(readpage_reentrant_guard);
|
||||
hashkey.buftag = tag;
|
||||
entry = prfh_lookup(MyPState->prf_hash, &hashkey);
|
||||
if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
|
||||
@@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag)
|
||||
void
|
||||
prefetch_on_ps_disconnect(void)
|
||||
{
|
||||
bool save_readpage_reentrant_guard = readpage_reentrant_guard;
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
|
||||
/* Prohibit callig of prefetch_pump_state */
|
||||
START_PREFETCH_RECEIVE_WORK();
|
||||
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
@@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void)
|
||||
MyNeonCounters->getpage_prefetch_discards_total += 1;
|
||||
}
|
||||
|
||||
/* Restore guard */
|
||||
readpage_reentrant_guard = save_readpage_reentrant_guard;
|
||||
|
||||
/*
|
||||
* We can have gone into retry due to network error, so update stats with
|
||||
* the latest available
|
||||
@@ -2509,7 +2515,7 @@ communicator_processinterrupts(void)
|
||||
if (timeout_signaled)
|
||||
{
|
||||
if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
|
||||
communicator_prefetch_pump_state(true);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
timeout_signaled = false;
|
||||
communicator_reconfigure_timeout_if_needed();
|
||||
|
||||
@@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
|
||||
void *buffer);
|
||||
|
||||
extern void communicator_reconfigure_timeout_if_needed(void);
|
||||
extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
|
||||
extern void communicator_prefetch_pump_state(void);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
blocknum += iterblocks;
|
||||
}
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
|
||||
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
*/
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
}
|
||||
|
||||
/* Try to read PS results if they are available */
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
|
||||
|
||||
@@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
@@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
nblocks, PG_IOV_MAX);
|
||||
|
||||
/* Try to read PS results if they are available */
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
request_lsns, nblocks);
|
||||
@@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
@@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
|
||||
@@ -3886,10 +3886,10 @@ impl Service {
|
||||
|
||||
None
|
||||
} else if safekeepers {
|
||||
// Note that we do not support creating the timeline on the safekeepers
|
||||
// for imported timelines. The `start_lsn` of the timeline is not known
|
||||
// until the import finshes.
|
||||
// https://github.com/neondatabase/neon/issues/11569
|
||||
// Note that for imported timelines, we do not create the timeline on the safekeepers
|
||||
// straight away. Instead, we do it once the import finalized such that we know what
|
||||
// start LSN to provide for the safekeepers. This is done in
|
||||
// [`Self::finalize_timeline_import`].
|
||||
let res = self
|
||||
.tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
|
||||
.instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id))
|
||||
@@ -3966,11 +3966,22 @@ impl Service {
|
||||
let active = self.timeline_active_on_all_shards(&import).await?;
|
||||
|
||||
match active {
|
||||
true => {
|
||||
Some(timeline_info) => {
|
||||
tracing::info!("Timeline became active on all shards");
|
||||
|
||||
if self.config.timelines_onto_safekeepers {
|
||||
// Now that we know the start LSN of this timeline, create it on the
|
||||
// safekeepers.
|
||||
self.tenant_timeline_create_safekeepers_until_success(
|
||||
import.tenant_id,
|
||||
timeline_info,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
false => {
|
||||
None => {
|
||||
tracing::info!("Timeline not active on all shards yet");
|
||||
|
||||
tokio::select! {
|
||||
@@ -4004,9 +4015,6 @@ impl Service {
|
||||
.range_mut(TenantShardId::tenant_range(import.tenant_id))
|
||||
.for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);
|
||||
|
||||
// TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn,
|
||||
// so we can't create the timeline on the safekeepers. Fix by moving creation here.
|
||||
// https://github.com/neondatabase/neon/issues/11569
|
||||
tracing::info!(%import_failed, "Timeline import complete");
|
||||
|
||||
Ok(())
|
||||
@@ -4021,10 +4029,16 @@ impl Service {
|
||||
.await;
|
||||
}
|
||||
|
||||
/// If the timeline is active on all shards, returns the [`TimelineInfo`]
|
||||
/// collected from shard 0.
|
||||
///
|
||||
/// An error is returned if the shard layout has changed during the import.
|
||||
/// This is guarded against within the storage controller and the pageserver,
|
||||
/// and, therefore, unexpected.
|
||||
async fn timeline_active_on_all_shards(
|
||||
self: &Arc<Self>,
|
||||
import: &TimelineImport,
|
||||
) -> anyhow::Result<bool> {
|
||||
) -> anyhow::Result<Option<TimelineInfo>> {
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
@@ -4048,13 +4062,17 @@ impl Service {
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
} else {
|
||||
return Ok(false);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
targets
|
||||
};
|
||||
|
||||
if targets.is_empty() {
|
||||
anyhow::bail!("No shards found to finalize import for");
|
||||
}
|
||||
|
||||
let results = self
|
||||
.tenant_for_shards_api(
|
||||
targets,
|
||||
@@ -4070,10 +4088,17 @@ impl Service {
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(results.into_iter().all(|res| match res {
|
||||
let all_active = results.iter().all(|res| match res {
|
||||
Ok(info) => info.state == TimelineState::Active,
|
||||
Err(_) => false,
|
||||
}))
|
||||
});
|
||||
|
||||
if all_active {
|
||||
// Both unwraps are validated above
|
||||
Ok(Some(results.into_iter().next().unwrap().unwrap()))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_archival_config(
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
str::FromStr,
|
||||
sync::{Arc, atomic::AtomicU64},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clashmap::{ClashMap, Entry};
|
||||
use safekeeper_api::models::PullTimelineRequest;
|
||||
@@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest {
|
||||
pub(crate) kind: SafekeeperTimelineOpKind,
|
||||
}
|
||||
|
||||
/// A way to keep ongoing/queued reconcile requests apart
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
struct TokenId(u64);
|
||||
|
||||
type OngoingTokens = ClashMap<(TenantId, Option<TimelineId>), (CancellationToken, TokenId)>;
|
||||
|
||||
/// Handle to per safekeeper reconciler.
|
||||
struct ReconcilerHandle {
|
||||
tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
|
||||
ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
|
||||
tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>,
|
||||
ongoing_tokens: Arc<OngoingTokens>,
|
||||
token_id_counter: AtomicU64,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
@@ -185,24 +197,28 @@ impl ReconcilerHandle {
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
) -> CancellationToken {
|
||||
) -> (CancellationToken, TokenId) {
|
||||
let token_id = self
|
||||
.token_id_counter
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
let token_id = TokenId(token_id);
|
||||
let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
|
||||
if let Entry::Occupied(entry) = &entry {
|
||||
let cancel: &CancellationToken = entry.get();
|
||||
let (cancel, _) = entry.get();
|
||||
cancel.cancel();
|
||||
}
|
||||
entry.insert(self.cancel.child_token()).clone()
|
||||
entry.insert((self.cancel.child_token(), token_id)).clone()
|
||||
}
|
||||
/// Cancel an ongoing reconciliation
|
||||
fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
|
||||
if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
|
||||
if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
|
||||
cancel.cancel();
|
||||
}
|
||||
}
|
||||
fn schedule_reconcile(&self, req: ScheduleRequest) {
|
||||
let cancel = self.new_token_slot(req.tenant_id, req.timeline_id);
|
||||
let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
|
||||
let hostname = req.safekeeper.skp.host.clone();
|
||||
if let Err(err) = self.tx.send((req, cancel)) {
|
||||
if let Err(err) = self.tx.send((req, cancel, token_id)) {
|
||||
tracing::info!("scheduling request onto {hostname} returned error: {err}");
|
||||
}
|
||||
}
|
||||
@@ -211,13 +227,14 @@ impl ReconcilerHandle {
|
||||
pub(crate) struct SafekeeperReconciler {
|
||||
inner: SafekeeperReconcilerInner,
|
||||
concurrency_limiter: Arc<Semaphore>,
|
||||
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
|
||||
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// Thin wrapper over `Service` to not clutter its inherent functions
|
||||
#[derive(Clone)]
|
||||
struct SafekeeperReconcilerInner {
|
||||
ongoing_tokens: Arc<OngoingTokens>,
|
||||
service: Arc<Service>,
|
||||
}
|
||||
|
||||
@@ -226,15 +243,20 @@ impl SafekeeperReconciler {
|
||||
// We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let concurrency = service.config.safekeeper_reconciler_concurrency;
|
||||
let ongoing_tokens = Arc::new(ClashMap::new());
|
||||
let mut reconciler = SafekeeperReconciler {
|
||||
inner: SafekeeperReconcilerInner { service },
|
||||
inner: SafekeeperReconcilerInner {
|
||||
service,
|
||||
ongoing_tokens: ongoing_tokens.clone(),
|
||||
},
|
||||
rx,
|
||||
concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
|
||||
cancel: cancel.clone(),
|
||||
};
|
||||
let handle = ReconcilerHandle {
|
||||
tx,
|
||||
ongoing_tokens: Arc::new(ClashMap::new()),
|
||||
ongoing_tokens,
|
||||
token_id_counter: AtomicU64::new(0),
|
||||
cancel,
|
||||
};
|
||||
tokio::spawn(async move { reconciler.run().await });
|
||||
@@ -246,7 +268,9 @@ impl SafekeeperReconciler {
|
||||
req = self.rx.recv() => req,
|
||||
_ = self.cancel.cancelled() => break,
|
||||
};
|
||||
let Some((req, req_cancel)) = req else { break };
|
||||
let Some((req, req_cancel, req_token_id)) = req else {
|
||||
break;
|
||||
};
|
||||
|
||||
let permit_res = tokio::select! {
|
||||
req = self.concurrency_limiter.clone().acquire_owned() => req,
|
||||
@@ -265,7 +289,7 @@ impl SafekeeperReconciler {
|
||||
let timeline_id = req.timeline_id;
|
||||
let node_id = req.safekeeper.skp.id;
|
||||
inner
|
||||
.reconcile_one(req, req_cancel)
|
||||
.reconcile_one(req, req_cancel, req_token_id)
|
||||
.instrument(tracing::info_span!(
|
||||
"reconcile_one",
|
||||
?kind,
|
||||
@@ -280,8 +304,14 @@ impl SafekeeperReconciler {
|
||||
}
|
||||
|
||||
impl SafekeeperReconcilerInner {
|
||||
async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
|
||||
async fn reconcile_one(
|
||||
&self,
|
||||
req: ScheduleRequest,
|
||||
req_cancel: CancellationToken,
|
||||
req_token_id: TokenId,
|
||||
) {
|
||||
let req_host = req.safekeeper.skp.host.clone();
|
||||
let success;
|
||||
match req.kind {
|
||||
SafekeeperTimelineOpKind::Pull => {
|
||||
let Some(timeline_id) = req.timeline_id else {
|
||||
@@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner {
|
||||
tenant_id: req.tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
self.reconcile_inner(
|
||||
req,
|
||||
async |client| client.pull_timeline(&pull_req).await,
|
||||
|resp| {
|
||||
if let Some(host) = resp.safekeeper_host {
|
||||
tracing::info!("pulled timeline from {host} onto {req_host}");
|
||||
} else {
|
||||
tracing::info!("timeline already present on safekeeper on {req_host}");
|
||||
}
|
||||
},
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
success = self
|
||||
.reconcile_inner(
|
||||
&req,
|
||||
async |client| client.pull_timeline(&pull_req).await,
|
||||
|resp| {
|
||||
if let Some(host) = resp.safekeeper_host {
|
||||
tracing::info!("pulled timeline from {host} onto {req_host}");
|
||||
} else {
|
||||
tracing::info!(
|
||||
"timeline already present on safekeeper on {req_host}"
|
||||
);
|
||||
}
|
||||
},
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
SafekeeperTimelineOpKind::Exclude => {
|
||||
// TODO actually exclude instead of delete here
|
||||
@@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner {
|
||||
);
|
||||
return;
|
||||
};
|
||||
self.reconcile_inner(
|
||||
req,
|
||||
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|
||||
|_resp| {
|
||||
tracing::info!("deleted timeline from {req_host}");
|
||||
},
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
success = self
|
||||
.reconcile_inner(
|
||||
&req,
|
||||
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|
||||
|_resp| {
|
||||
tracing::info!("deleted timeline from {req_host}");
|
||||
},
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
SafekeeperTimelineOpKind::Delete => {
|
||||
let tenant_id = req.tenant_id;
|
||||
if let Some(timeline_id) = req.timeline_id {
|
||||
let deleted = self
|
||||
success = self
|
||||
.reconcile_inner(
|
||||
req,
|
||||
&req,
|
||||
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|
||||
|_resp| {
|
||||
tracing::info!("deleted timeline from {req_host}");
|
||||
@@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner {
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
if deleted {
|
||||
if success {
|
||||
self.delete_timeline_from_db(tenant_id, timeline_id).await;
|
||||
}
|
||||
} else {
|
||||
let deleted = self
|
||||
success = self
|
||||
.reconcile_inner(
|
||||
req,
|
||||
&req,
|
||||
async |client| client.delete_tenant(tenant_id).await,
|
||||
|_resp| {
|
||||
tracing::info!(%tenant_id, "deleted tenant from {req_host}");
|
||||
@@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner {
|
||||
req_cancel,
|
||||
)
|
||||
.await;
|
||||
if deleted {
|
||||
if success {
|
||||
self.delete_tenant_timelines_from_db(tenant_id).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if success {
|
||||
self.ongoing_tokens.remove_if(
|
||||
&(req.tenant_id, req.timeline_id),
|
||||
|_ttid, (_cancel, token_id)| {
|
||||
// Ensure that this request is indeed the request we just finished and not a new one
|
||||
req_token_id == *token_id
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
|
||||
match self
|
||||
@@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner {
|
||||
self.delete_timeline_from_db(tenant_id, timeline_id).await;
|
||||
}
|
||||
}
|
||||
/// Returns whether the reconciliation happened successfully
|
||||
/// Returns whether the reconciliation happened successfully (or we got cancelled)
|
||||
async fn reconcile_inner<T, F, U>(
|
||||
&self,
|
||||
req: ScheduleRequest,
|
||||
req: &ScheduleRequest,
|
||||
closure: impl Fn(SafekeeperClient) -> F,
|
||||
log_success: impl FnOnce(T) -> U,
|
||||
req_cancel: CancellationToken,
|
||||
|
||||
@@ -323,6 +323,42 @@ impl Service {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_create_safekeepers_until_success(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: TenantId,
|
||||
timeline_info: TimelineInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
const BACKOFF: Duration = Duration::from_secs(5);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
anyhow::bail!("Shut down requested while finalizing import");
|
||||
}
|
||||
|
||||
let res = self
|
||||
.tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(_) => {
|
||||
tracing::info!("Timeline created on safekeepers");
|
||||
break;
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!("Failed to create timeline on safekeepers: {err}");
|
||||
tokio::select! {
|
||||
_ = self.cancel.cancelled() => {
|
||||
anyhow::bail!("Shut down requested while finalizing import");
|
||||
},
|
||||
_ = tokio::time::sleep(BACKOFF) => {}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Directly insert the timeline into the database without reconciling it with safekeepers.
|
||||
///
|
||||
/// Useful if the timeline already exists on the specified safekeepers,
|
||||
|
||||
@@ -4613,7 +4613,10 @@ class EndpointFactory:
|
||||
return self
|
||||
|
||||
def new_replica(
|
||||
self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
|
||||
self,
|
||||
origin: Endpoint,
|
||||
endpoint_id: str | None = None,
|
||||
config_lines: list[str] | None = None,
|
||||
):
|
||||
branch_name = origin.branch_name
|
||||
assert origin in self.endpoints
|
||||
@@ -4629,7 +4632,10 @@ class EndpointFactory:
|
||||
)
|
||||
|
||||
def new_replica_start(
|
||||
self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
|
||||
self,
|
||||
origin: Endpoint,
|
||||
endpoint_id: str | None = None,
|
||||
config_lines: list[str] | None = None,
|
||||
):
|
||||
branch_name = origin.branch_name
|
||||
assert origin in self.endpoints
|
||||
|
||||
@@ -24,6 +24,7 @@ from fixtures.utils import (
|
||||
skip_in_debug_build,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.workload import Workload
|
||||
from mypy_boto3_kms import KMSClient
|
||||
from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
|
||||
from mypy_boto3_s3 import S3Client
|
||||
@@ -97,6 +98,10 @@ def test_pgdata_import_smoke(
|
||||
f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
|
||||
)
|
||||
|
||||
if neon_env_builder.storage_controller_config is None:
|
||||
neon_env_builder.storage_controller_config = {}
|
||||
neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# The test needs LocalFs support, which is only built in testing mode.
|
||||
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke(
|
||||
#
|
||||
# validate that we can write
|
||||
#
|
||||
rw_endpoint = env.endpoints.create_start(
|
||||
branch_name=import_branch_name,
|
||||
endpoint_id="rw",
|
||||
tenant_id=tenant_id,
|
||||
config_lines=ep_config,
|
||||
)
|
||||
rw_endpoint.safe_psql("create table othertable(values text)")
|
||||
rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
|
||||
workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name)
|
||||
workload.init()
|
||||
workload.write_rows(64)
|
||||
workload.validate()
|
||||
|
||||
# TODO: consider using `class Workload` here
|
||||
# to do compaction and whatnot?
|
||||
rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()"))
|
||||
|
||||
#
|
||||
# validate that we can branch (important use case)
|
||||
#
|
||||
|
||||
# ... at the tip
|
||||
_ = env.create_branch(
|
||||
child_timeline_id = env.create_branch(
|
||||
new_branch_name="br-tip",
|
||||
ancestor_branch_name=import_branch_name,
|
||||
tenant_id=tenant_id,
|
||||
ancestor_start_lsn=rw_lsn,
|
||||
)
|
||||
br_tip_endpoint = env.endpoints.create_start(
|
||||
branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
|
||||
)
|
||||
validate_vanilla_equivalence(br_tip_endpoint)
|
||||
br_tip_endpoint.safe_psql("select * from othertable")
|
||||
child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
|
||||
child_workload.validate()
|
||||
|
||||
validate_vanilla_equivalence(child_workload.endpoint())
|
||||
|
||||
# ... at the initdb lsn
|
||||
_ = env.create_branch(
|
||||
@@ -330,7 +329,7 @@ def test_pgdata_import_smoke(
|
||||
)
|
||||
validate_vanilla_equivalence(br_initdb_endpoint)
|
||||
with pytest.raises(psycopg2.errors.UndefinedTable):
|
||||
br_initdb_endpoint.safe_psql("select * from othertable")
|
||||
br_initdb_endpoint.safe_psql(f"select * from {workload.table}")
|
||||
|
||||
|
||||
@run_only_on_default_postgres(reason="PG version is irrelevant here")
|
||||
|
||||
@@ -248,8 +248,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
|
||||
last_generation = max(
|
||||
[s[1] for s in last_state.values() if s[1] is not None], default=None
|
||||
)
|
||||
is_last_generation = last_generation == generation
|
||||
|
||||
# It's also only valid to connect if there are no conflicting attachments (i.e. no
|
||||
# Pageserver in AttachedSingle with other attachments in same generation).
|
||||
num_attached_single = len(
|
||||
[s for s in last_state.values() if s[0] == "AttachedSingle" and s[1] == last_generation]
|
||||
)
|
||||
num_attached = len(
|
||||
[s for s in last_state.values() if s[0].startswith("Attached") and s[1] == last_generation]
|
||||
)
|
||||
valid_attachment = (num_attached_single == 1 and num_attached == 1) \
|
||||
or num_attached_single == 0
|
||||
|
||||
if mode.startswith("Attached") and generation == last_generation:
|
||||
if mode.startswith("Attached") and is_last_generation and valid_attachment:
|
||||
# This is a basic test: we are validating that he endpoint works properly _between_
|
||||
# configuration changes. A stronger test would be to validate that clients see
|
||||
# no errors while we are making the changes.
|
||||
|
||||
@@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv):
|
||||
res = cur.fetchall()
|
||||
|
||||
assert res == [(1,)], "select should not succeed"
|
||||
|
||||
# confirm that replicas can also ensure the grants are correctly set.
|
||||
replica = env.endpoints.new_replica_start(endpoint)
|
||||
replica_client = replica.http_client()
|
||||
replica_client.set_role_grants(
|
||||
"test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
|
||||
)
|
||||
|
||||
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: 05ddf212e2...d72d76f2cd
2
vendor/revisions.json
vendored
2
vendor/revisions.json
vendored
@@ -5,7 +5,7 @@
|
||||
],
|
||||
"v16": [
|
||||
"16.8",
|
||||
"05ddf212e2e07b788b5c8b88bdcf98630941f6ae"
|
||||
"d72d76f2cdee4194dd052ce099e9784aca7c794a"
|
||||
],
|
||||
"v15": [
|
||||
"15.12",
|
||||
|
||||
Reference in New Issue
Block a user