mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 22:50:38 +00:00
Compare commits
19 Commits
problame/a
...
funbringer
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
389dfb6809 | ||
|
|
b2bdd1b117 | ||
|
|
0b42ba9615 | ||
|
|
6d3be4cf7d | ||
|
|
7605aed5aa | ||
|
|
e13ae811bf | ||
|
|
36e86e370b | ||
|
|
8e51d5d8f3 | ||
|
|
72131e15ff | ||
|
|
0495043d30 | ||
|
|
c87a742bbd | ||
|
|
c7c514aae4 | ||
|
|
f7b0beb1cf | ||
|
|
b5b0a2e872 | ||
|
|
cd787070f6 | ||
|
|
05d4a85c2f | ||
|
|
9a35ae40b5 | ||
|
|
3d33ea465d | ||
|
|
fd50c95b9d |
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -2727,7 +2727,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
@@ -5425,7 +5424,6 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"parking_lot",
|
||||
"prost",
|
||||
"rand",
|
||||
"regex",
|
||||
|
||||
@@ -60,6 +60,10 @@ pub mod tracing_span_assert;
|
||||
|
||||
pub mod rate_limit;
|
||||
|
||||
/// Primitive for coalescing operations into a single task which will not be cancelled by for
|
||||
/// example external http client closing the connection.
|
||||
pub mod shared_retryable;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
|
||||
659
libs/utils/src/shared_retryable.rs
Normal file
659
libs/utils/src/shared_retryable.rs
Normal file
@@ -0,0 +1,659 @@
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Container using which many request handlers can come together and join a single task to
|
||||
/// completion instead of racing each other and their own cancellation.
|
||||
///
|
||||
/// In a picture:
|
||||
///
|
||||
/// ```text
|
||||
/// SharedRetryable::try_restart Spawned task completes with only one concurrent attempt
|
||||
/// \ /
|
||||
/// request handler 1 ---->|--X
|
||||
/// request handler 2 ---->|-------|
|
||||
/// request handler 3 ---->|-------|
|
||||
/// | |
|
||||
/// v |
|
||||
/// one spawned task \------>/
|
||||
///
|
||||
/// (X = cancelled during await)
|
||||
/// ```
|
||||
///
|
||||
/// Implementation is cancel safe. Implementation and internal structure are hurt by the inability
|
||||
/// to just spawn the task, but this is needed for pageserver usage.
|
||||
///
|
||||
/// Implementation exposes a fully decomposed [`SharedRetryable::try_restart`] which requires the
|
||||
/// caller to do the spawning before awaiting for the result. If the caller is dropped while this
|
||||
/// happens, a new attempt will be required, and all concurrent awaiters will see a
|
||||
/// [`RetriedTaskPanicked`] error.
|
||||
///
|
||||
/// There is another "family of APIs" [`SharedRetryable::attempt_spawn`] for infallible futures. It is
|
||||
/// just provided for completeness, and it does not have a fully decomposed version like
|
||||
/// `try_restart`.
|
||||
///
|
||||
/// For `try_restart_*` family of APIs, there is a concept of two leveled results. The inner level
|
||||
/// is returned by the executed future. It needs to be `Clone`. Most errors are not `Clone`, so
|
||||
/// implementation advice is to log the happened error, and not propagate more than a label as the
|
||||
/// "inner error" which will be used to build an outer error. The outer error will also have to be
|
||||
/// convertable from [`RetriedTaskPanicked`] to absorb that case as well.
|
||||
///
|
||||
/// ## Example
|
||||
///
|
||||
/// A shared service value completes the infallible work once, even if called concurrently by
|
||||
/// multiple cancellable tasks.
|
||||
///
|
||||
/// Example moved as a test `service_example`.
|
||||
#[derive(Clone)]
|
||||
pub struct SharedRetryable<V> {
|
||||
inner: Arc<tokio::sync::Mutex<Option<MaybeDone<V>>>>,
|
||||
}
|
||||
|
||||
impl<V> Default for SharedRetryable<V> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Arc::new(tokio::sync::Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine if an error is transient or permanent.
|
||||
pub trait Retryable {
|
||||
fn is_permanent(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
pub trait MakeFuture {
|
||||
type Future: Future<Output = Self::Output> + Send + 'static;
|
||||
type Output: Send + 'static;
|
||||
|
||||
fn make_future(self) -> Self::Future;
|
||||
}
|
||||
|
||||
impl<Fun, Fut, R> MakeFuture for Fun
|
||||
where
|
||||
Fun: FnOnce() -> Fut,
|
||||
Fut: Future<Output = R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
type Future = Fut;
|
||||
type Output = R;
|
||||
|
||||
fn make_future(self) -> Self::Future {
|
||||
self()
|
||||
}
|
||||
}
|
||||
|
||||
/// Retried task panicked, was cancelled, or never spawned (see [`SharedRetryable::try_restart`]).
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct RetriedTaskPanicked;
|
||||
|
||||
impl<T, E1> SharedRetryable<Result<T, E1>>
|
||||
where
|
||||
T: Clone + std::fmt::Debug + Send + 'static,
|
||||
E1: Retryable + Clone + std::fmt::Debug + Send + 'static,
|
||||
{
|
||||
/// Restart a previously failed operation unless it already completed with a terminal result.
|
||||
///
|
||||
/// Many futures can call this function and and get the terminal result from an earlier attempt
|
||||
/// or start a new attempt, or join an existing one.
|
||||
///
|
||||
/// Compared to `Self::try_restart`, this method also spawns the future to run, which would
|
||||
/// otherwise have to be done manually.
|
||||
#[cfg(test)]
|
||||
pub async fn try_restart_spawn<E2>(
|
||||
&self,
|
||||
retry_with: impl MakeFuture<Output = Result<T, E1>>,
|
||||
) -> Result<T, E2>
|
||||
where
|
||||
E2: From<E1> + From<RetriedTaskPanicked> + Send + 'static,
|
||||
{
|
||||
let (recv, maybe_fut) = self.try_restart(retry_with).await;
|
||||
|
||||
if let Some(fut) = maybe_fut {
|
||||
// top level function, we must spawn, pageserver cannot use this
|
||||
tokio::spawn(fut);
|
||||
}
|
||||
|
||||
recv.await
|
||||
}
|
||||
|
||||
/// Restart a previously failed operation unless it already completed with a terminal result.
|
||||
///
|
||||
/// Many futures can call this function and get the terminal result from an earlier attempt or
|
||||
/// start a new attempt, or join an existing one.
|
||||
///
|
||||
/// If a task calling this method is cancelled, at worst, a new attempt which is immediatedly
|
||||
/// deemed as having panicked will happen, but without a panic ever happening.
|
||||
///
|
||||
/// Returns one future for waiting for the result and possibly another which needs to be
|
||||
/// spawned when `Some`. Spawning has to happen before waiting is started, otherwise the first
|
||||
/// future will never make progress.
|
||||
///
|
||||
/// This complication exists because on pageserver we cannot use `tokio::spawn` directly
|
||||
/// at this time.
|
||||
pub async fn try_restart<E2>(
|
||||
&self,
|
||||
retry_with: impl MakeFuture<Output = Result<T, E1>>,
|
||||
) -> (
|
||||
impl Future<Output = Result<T, E2>> + Send + 'static,
|
||||
Option<impl Future<Output = ()> + Send + 'static>,
|
||||
)
|
||||
where
|
||||
E2: From<E1> + From<RetriedTaskPanicked> + Send + 'static,
|
||||
{
|
||||
use futures::future::Either;
|
||||
|
||||
match self.decide_to_retry_or_join(retry_with).await {
|
||||
Ok(terminal) => (Either::Left(async move { terminal }), None),
|
||||
Err((rx, maybe_fut)) => {
|
||||
let recv = Self::make_oneshot_alike_receiver(rx);
|
||||
|
||||
(Either::Right(recv), maybe_fut)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a Ok if the previous attempt had resulted in a terminal result. Err is returned
|
||||
/// when an attempt can be joined and possibly needs to be spawned.
|
||||
async fn decide_to_retry_or_join<E2>(
|
||||
&self,
|
||||
retry_with: impl MakeFuture<Output = Result<T, E1>>,
|
||||
) -> Result<
|
||||
Result<T, E2>,
|
||||
(
|
||||
tokio::sync::broadcast::Receiver<Result<T, E1>>,
|
||||
Option<impl Future<Output = ()> + Send + 'static>,
|
||||
),
|
||||
>
|
||||
where
|
||||
E2: From<E1> + From<RetriedTaskPanicked>,
|
||||
{
|
||||
let mut g = self.inner.lock().await;
|
||||
|
||||
let maybe_rx = match g.as_ref() {
|
||||
Some(MaybeDone::Done(Ok(t))) => return Ok(Ok(t.to_owned())),
|
||||
Some(MaybeDone::Done(Err(e))) if e.is_permanent() => {
|
||||
return Ok(Err(E2::from(e.to_owned())))
|
||||
}
|
||||
Some(MaybeDone::Pending(weak)) => {
|
||||
// failure to upgrade can mean only one thing: there was an unexpected
|
||||
// panic which we consider as a transient retryable error.
|
||||
weak.upgrade()
|
||||
}
|
||||
Some(MaybeDone::Done(Err(_retryable))) => None,
|
||||
None => None,
|
||||
};
|
||||
|
||||
let (strong, maybe_fut) = match maybe_rx {
|
||||
Some(strong) => (strong, None),
|
||||
None => {
|
||||
// new attempt
|
||||
// panic safety: invoke the factory before configuring the pending value
|
||||
let fut = retry_with.make_future();
|
||||
|
||||
let (strong, fut) = self.make_run_and_complete(fut, &mut g);
|
||||
(strong, Some(fut))
|
||||
}
|
||||
};
|
||||
|
||||
// important: the Arc<Receiver> is not held after unlocking
|
||||
// important: we resubscribe before lock is released to be sure to get a message which
|
||||
// is sent once receiver is dropped
|
||||
let rx = strong.resubscribe();
|
||||
drop(strong);
|
||||
Err((rx, maybe_fut))
|
||||
}
|
||||
|
||||
/// Configure a new attempt, but leave spawning it to the caller.
|
||||
///
|
||||
/// Returns an `Arc<Receiver<V>>` which is valid until the attempt completes, and the future
|
||||
/// which will need to run to completion outside the lifecycle of the caller.
|
||||
fn make_run_and_complete(
|
||||
&self,
|
||||
fut: impl Future<Output = Result<T, E1>> + Send + 'static,
|
||||
g: &mut tokio::sync::MutexGuard<'_, Option<MaybeDone<Result<T, E1>>>>,
|
||||
) -> (
|
||||
Arc<tokio::sync::broadcast::Receiver<Result<T, E1>>>,
|
||||
impl Future<Output = ()> + Send + 'static,
|
||||
) {
|
||||
#[cfg(debug_assertions)]
|
||||
match &**g {
|
||||
Some(MaybeDone::Pending(weak)) => {
|
||||
assert!(
|
||||
weak.upgrade().is_none(),
|
||||
"when starting a restart, should no longer have an upgradeable channel"
|
||||
);
|
||||
}
|
||||
Some(MaybeDone::Done(Err(err))) => {
|
||||
assert!(
|
||||
!err.is_permanent(),
|
||||
"when restarting, the err must be transient"
|
||||
);
|
||||
}
|
||||
Some(MaybeDone::Done(Ok(_))) => {
|
||||
panic!("unexpected restart after a completion on MaybeDone");
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
|
||||
self.make_run_and_complete_any(fut, g)
|
||||
}
|
||||
|
||||
/// Oneshot alike as in it's a future which will be consumed by an `await`.
|
||||
///
|
||||
/// Otherwise the caller might think it's beneficial or reasonable to poll the channel multiple
|
||||
/// times.
|
||||
async fn make_oneshot_alike_receiver<E2>(
|
||||
mut rx: tokio::sync::broadcast::Receiver<Result<T, E1>>,
|
||||
) -> Result<T, E2>
|
||||
where
|
||||
E2: From<E1> + From<RetriedTaskPanicked>,
|
||||
{
|
||||
use tokio::sync::broadcast::error::RecvError;
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(Ok(t)) => Ok(t),
|
||||
Ok(Err(e)) => Err(E2::from(e)),
|
||||
Err(RecvError::Closed | RecvError::Lagged(_)) => {
|
||||
// lagged doesn't mean anything with 1 send, but whatever, handle it the same
|
||||
// this case should only ever happen if a panick happened in the `fut`.
|
||||
Err(E2::from(RetriedTaskPanicked))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V> SharedRetryable<V>
|
||||
where
|
||||
V: std::fmt::Debug + Clone + Send + 'static,
|
||||
{
|
||||
/// Attempt to run once a spawned future to completion.
|
||||
///
|
||||
/// Any previous attempt which panicked will be retried, but the `RetriedTaskPanicked` will be
|
||||
/// returned when the most recent attempt panicked.
|
||||
#[cfg(test)]
|
||||
pub async fn attempt_spawn(
|
||||
&self,
|
||||
attempt_with: impl MakeFuture<Output = V>,
|
||||
) -> Result<V, RetriedTaskPanicked> {
|
||||
let (rx, maybe_fut) = {
|
||||
let mut g = self.inner.lock().await;
|
||||
|
||||
let maybe_rx = match g.as_ref() {
|
||||
Some(MaybeDone::Done(v)) => return Ok(v.to_owned()),
|
||||
Some(MaybeDone::Pending(weak)) => {
|
||||
// see comment in try_restart
|
||||
weak.upgrade()
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let (strong, maybe_fut) = match maybe_rx {
|
||||
Some(strong) => (strong, None),
|
||||
None => {
|
||||
let fut = attempt_with.make_future();
|
||||
|
||||
let (strong, fut) = self.make_run_and_complete_any(fut, &mut g);
|
||||
(strong, Some(fut))
|
||||
}
|
||||
};
|
||||
|
||||
// see decide_to_retry_or_join for important notes
|
||||
let rx = strong.resubscribe();
|
||||
drop(strong);
|
||||
(rx, maybe_fut)
|
||||
};
|
||||
|
||||
if let Some(fut) = maybe_fut {
|
||||
// this is a top level function, need to spawn directly
|
||||
// from pageserver one wouldn't use this but more piecewise functions
|
||||
tokio::spawn(fut);
|
||||
}
|
||||
|
||||
let recv = Self::make_oneshot_alike_receiver_any(rx);
|
||||
|
||||
recv.await
|
||||
}
|
||||
|
||||
/// Configure a new attempt, but leave spawning it to the caller.
|
||||
///
|
||||
/// Forgetting the returned future is outside of scope of any correctness guarantees; all of
|
||||
/// the waiters will then be deadlocked, and the MaybeDone will forever be pending. Dropping
|
||||
/// and not running the future will then require a new attempt.
|
||||
///
|
||||
/// Also returns an `Arc<Receiver<V>>` which is valid until the attempt completes.
|
||||
fn make_run_and_complete_any(
|
||||
&self,
|
||||
fut: impl Future<Output = V> + Send + 'static,
|
||||
g: &mut tokio::sync::MutexGuard<'_, Option<MaybeDone<V>>>,
|
||||
) -> (
|
||||
Arc<tokio::sync::broadcast::Receiver<V>>,
|
||||
impl Future<Output = ()> + Send + 'static,
|
||||
) {
|
||||
let (tx, rx) = tokio::sync::broadcast::channel(1);
|
||||
let strong = Arc::new(rx);
|
||||
|
||||
**g = Some(MaybeDone::Pending(Arc::downgrade(&strong)));
|
||||
|
||||
let retry = {
|
||||
let strong = strong.clone();
|
||||
self.clone().run_and_complete(fut, tx, strong)
|
||||
};
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
match &**g {
|
||||
Some(MaybeDone::Pending(weak)) => {
|
||||
let rx = weak.upgrade().expect("holding the weak and strong locally");
|
||||
assert!(Arc::ptr_eq(&strong, &rx));
|
||||
}
|
||||
_ => unreachable!("MaybeDone::pending must be set after spawn_and_run_complete_any"),
|
||||
}
|
||||
|
||||
(strong, retry)
|
||||
}
|
||||
|
||||
/// Run the actual attempt, and communicate the response via both:
|
||||
/// - setting the `MaybeDone::Done`
|
||||
/// - the broadcast channel
|
||||
async fn run_and_complete(
|
||||
self,
|
||||
fut: impl Future<Output = V>,
|
||||
tx: tokio::sync::broadcast::Sender<V>,
|
||||
strong: Arc<tokio::sync::broadcast::Receiver<V>>,
|
||||
) {
|
||||
let res = fut.await;
|
||||
|
||||
{
|
||||
let mut g = self.inner.lock().await;
|
||||
MaybeDone::complete(&mut *g, &strong, res.clone());
|
||||
|
||||
// make the weak un-upgradeable by dropping the final alive
|
||||
// reference to it. it is final Arc because the Arc never escapes
|
||||
// the critical section in `decide_to_retry_or_join` or `attempt_spawn`.
|
||||
Arc::try_unwrap(strong).expect("expected this to be the only Arc<Receiver<V>>");
|
||||
}
|
||||
|
||||
// now no one can get the Pending(weak) value to upgrade and they only see
|
||||
// the Done(res).
|
||||
//
|
||||
// send the result value to listeners, if any
|
||||
drop(tx.send(res));
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn make_oneshot_alike_receiver_any(
|
||||
mut rx: tokio::sync::broadcast::Receiver<V>,
|
||||
) -> Result<V, RetriedTaskPanicked> {
|
||||
use tokio::sync::broadcast::error::RecvError;
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(t) => Ok(t),
|
||||
Err(RecvError::Closed | RecvError::Lagged(_)) => {
|
||||
// lagged doesn't mean anything with 1 send, but whatever, handle it the same
|
||||
// this case should only ever happen if a panick happened in the `fut`.
|
||||
Err(RetriedTaskPanicked)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// MaybeDone handles synchronization for multiple requests and the single actual task.
|
||||
///
|
||||
/// If request handlers witness `Pending` which they are able to upgrade, they are guaranteed a
|
||||
/// useful `recv().await`, where useful means "value" or "disconnect" arrives. If upgrade fails,
|
||||
/// this means that "disconnect" has happened in the past.
|
||||
///
|
||||
/// On successful execution the one executing task will set this to `Done` variant, with the actual
|
||||
/// resulting value.
|
||||
#[derive(Debug)]
|
||||
pub enum MaybeDone<V> {
|
||||
Pending(std::sync::Weak<tokio::sync::broadcast::Receiver<V>>),
|
||||
Done(V),
|
||||
}
|
||||
|
||||
impl<V: std::fmt::Debug> MaybeDone<V> {
|
||||
fn complete(
|
||||
this: &mut Option<MaybeDone<V>>,
|
||||
_strong: &Arc<tokio::sync::broadcast::Receiver<V>>,
|
||||
outcome: V,
|
||||
) {
|
||||
#[cfg(debug_assertions)]
|
||||
match this {
|
||||
Some(MaybeDone::Pending(weak)) => {
|
||||
let same = weak
|
||||
.upgrade()
|
||||
// we don't yet have Receiver::same_channel
|
||||
.map(|rx| Arc::ptr_eq(_strong, &rx))
|
||||
.unwrap_or(false);
|
||||
assert!(same, "different channel had been replaced or dropped");
|
||||
}
|
||||
other => panic!("unexpected MaybeDone: {other:?}"),
|
||||
}
|
||||
|
||||
*this = Some(MaybeDone::Done(outcome));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{RetriedTaskPanicked, Retryable, SharedRetryable};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Debug)]
|
||||
enum OuterError {
|
||||
AttemptPanicked,
|
||||
Unlucky,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
enum InnerError {
|
||||
Unlucky,
|
||||
}
|
||||
|
||||
impl Retryable for InnerError {
|
||||
fn is_permanent(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl From<InnerError> for OuterError {
|
||||
fn from(_: InnerError) -> Self {
|
||||
OuterError::Unlucky
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RetriedTaskPanicked> for OuterError {
|
||||
fn from(_: RetriedTaskPanicked) -> Self {
|
||||
OuterError::AttemptPanicked
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn restartable_until_permanent() {
|
||||
let shr = SharedRetryable::<Result<u8, InnerError>>::default();
|
||||
|
||||
let res = shr
|
||||
.try_restart_spawn(|| async move { panic!("really unlucky") })
|
||||
.await;
|
||||
|
||||
assert!(matches!(res, Err(OuterError::AttemptPanicked)));
|
||||
|
||||
let res = shr
|
||||
.try_restart_spawn(|| async move { Err(InnerError::Unlucky) })
|
||||
.await;
|
||||
|
||||
assert!(matches!(res, Err(OuterError::Unlucky)));
|
||||
|
||||
let res = shr.try_restart_spawn(|| async move { Ok(42) }).await;
|
||||
|
||||
assert!(matches!(res, Ok::<u8, OuterError>(42)));
|
||||
|
||||
let res = shr
|
||||
.try_restart_spawn(|| async move { panic!("rerun should clone Ok(42)") })
|
||||
.await;
|
||||
|
||||
assert!(matches!(res, Ok::<u8, OuterError>(42)));
|
||||
}
|
||||
|
||||
/// Demonstration of the SharedRetryable::attempt
|
||||
#[tokio::test]
|
||||
async fn attemptable_until_no_panic() {
|
||||
let shr = SharedRetryable::<u8>::default();
|
||||
|
||||
let res = shr
|
||||
.attempt_spawn(|| async move { panic!("should not interfere") })
|
||||
.await;
|
||||
|
||||
assert!(matches!(res, Err(RetriedTaskPanicked)), "{res:?}");
|
||||
|
||||
let res = shr.attempt_spawn(|| async move { 42 }).await;
|
||||
|
||||
assert_eq!(res, Ok(42));
|
||||
|
||||
let res = shr
|
||||
.attempt_spawn(|| async move { panic!("should not be called") })
|
||||
.await;
|
||||
|
||||
assert_eq!(res, Ok(42));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancelling_spawner_is_fine() {
|
||||
let shr = SharedRetryable::<Result<u8, InnerError>>::default();
|
||||
|
||||
let (recv1, maybe_fut) = shr
|
||||
.try_restart(|| async move { panic!("should not have been called") })
|
||||
.await;
|
||||
let should_be_spawned = maybe_fut.unwrap();
|
||||
|
||||
let (recv2, maybe_fut) = shr
|
||||
.try_restart(|| async move {
|
||||
panic!("should never be called because waiting on should_be_spawned")
|
||||
})
|
||||
.await;
|
||||
assert!(
|
||||
matches!(maybe_fut, None),
|
||||
"only the first one should had created the future"
|
||||
);
|
||||
|
||||
let mut recv1 = std::pin::pin!(recv1);
|
||||
let mut recv2 = std::pin::pin!(recv2);
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(std::time::Duration::from_millis(100)) => {},
|
||||
_ = &mut recv1 => unreachable!("should not have completed because should_be_spawned not spawned"),
|
||||
_ = &mut recv2 => unreachable!("should not have completed because should_be_spawned not spawned"),
|
||||
}
|
||||
|
||||
drop(should_be_spawned);
|
||||
|
||||
let res = recv1.await;
|
||||
assert!(matches!(res, Err(OuterError::AttemptPanicked)), "{res:?}");
|
||||
|
||||
let res = recv2.await;
|
||||
assert!(matches!(res, Err(OuterError::AttemptPanicked)), "{res:?}");
|
||||
|
||||
// but we can still reach a terminal state if the api is not misused or the
|
||||
// should_be_spawned winner is not cancelled
|
||||
|
||||
let recv1 = shr.try_restart_spawn::<OuterError>(|| async move { Ok(42) });
|
||||
let recv2 = shr.try_restart_spawn::<OuterError>(|| async move { Ok(43) });
|
||||
|
||||
assert_eq!(recv1.await.unwrap(), 42);
|
||||
assert_eq!(recv2.await.unwrap(), 42, "43 should never be returned");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn service_example() {
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum OneLevelError {
|
||||
TaskPanicked,
|
||||
}
|
||||
|
||||
impl Retryable for OneLevelError {
|
||||
fn is_permanent(&self) -> bool {
|
||||
// for a single level errors, this wording is weird
|
||||
!matches!(self, OneLevelError::TaskPanicked)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RetriedTaskPanicked> for OneLevelError {
|
||||
fn from(_: RetriedTaskPanicked) -> Self {
|
||||
OneLevelError::TaskPanicked
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct Service(SharedRetryable<Result<u8, OneLevelError>>);
|
||||
|
||||
impl Service {
|
||||
async fn work(
|
||||
&self,
|
||||
completions: Arc<std::sync::atomic::AtomicUsize>,
|
||||
) -> Result<u8, OneLevelError> {
|
||||
self.0
|
||||
.try_restart_spawn(|| async move {
|
||||
// give time to cancel some of the tasks
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
completions.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
Self::work_once().await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn work_once() -> Result<u8, OneLevelError> {
|
||||
Ok(42)
|
||||
}
|
||||
}
|
||||
|
||||
let svc = Service::default();
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(10 + 1));
|
||||
let completions = Arc::new(std::sync::atomic::AtomicUsize::new(0));
|
||||
|
||||
let handles = (0..10)
|
||||
.map(|_| {
|
||||
js.spawn({
|
||||
let svc = svc.clone();
|
||||
let barrier = barrier.clone();
|
||||
let completions = completions.clone();
|
||||
async move {
|
||||
// make sure all tasks are ready to start at the same time
|
||||
barrier.wait().await;
|
||||
// after successfully starting the work, any of the futures could get cancelled
|
||||
svc.work(completions).await
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
barrier.wait().await;
|
||||
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
|
||||
handles[5].abort();
|
||||
|
||||
let mut cancellations = 0;
|
||||
|
||||
while let Some(res) = js.join_next().await {
|
||||
// all complete with the same result
|
||||
match res {
|
||||
Ok(res) => assert_eq!(res.unwrap(), 42),
|
||||
Err(je) => {
|
||||
// except for the one task we cancelled; it's cancelling
|
||||
// does not interfere with the result
|
||||
assert!(je.is_cancelled());
|
||||
cancellations += 1;
|
||||
assert_eq!(cancellations, 1, "only 6th task was aborted");
|
||||
// however we cannot assert that everytime we get to cancel the 6th task
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// there will be at most one terminal completion
|
||||
assert_eq!(completions.load(std::sync::atomic::Ordering::Relaxed), 1);
|
||||
}
|
||||
}
|
||||
@@ -75,8 +75,6 @@ enum-map.workspace = true
|
||||
enumset.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
# feature "send_guard" markes it so that lock guards are Send
|
||||
parking_lot = { workspace = true, default-features = false, features = [ "send_guard" ] }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
|
||||
@@ -272,6 +272,9 @@ pub enum TaskKind {
|
||||
|
||||
#[cfg(test)]
|
||||
UnitTest,
|
||||
|
||||
/// Task which is the only task to delete this particular timeline
|
||||
DeleteTimeline,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -39,7 +39,8 @@ use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::MutexGuard;
|
||||
use std::sync::{Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::config::TenantConf;
|
||||
@@ -132,7 +133,7 @@ pub struct Tenant {
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
|
||||
tenant_id: TenantId,
|
||||
timelines: parking_lot::Mutex<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
// `timelines` mutex during all GC iteration
|
||||
@@ -183,7 +184,7 @@ impl UninitializedTimeline<'_> {
|
||||
/// The new timeline is initialized in Active state, and its background jobs are
|
||||
/// started
|
||||
pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result<Arc<Timeline>> {
|
||||
let mut timelines = self.owning_tenant.timelines.lock();
|
||||
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
||||
self.initialize_with_lock(ctx, &mut timelines, true, true)
|
||||
}
|
||||
|
||||
@@ -274,7 +275,7 @@ impl UninitializedTimeline<'_> {
|
||||
|
||||
// Initialize without loading the layer map. We started with an empty layer map, and already
|
||||
// updated it for the layers that we created during the import.
|
||||
let mut timelines = self.owning_tenant.timelines.lock();
|
||||
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
||||
self.initialize_with_lock(ctx, &mut timelines, false, true)
|
||||
}
|
||||
|
||||
@@ -455,6 +456,50 @@ pub enum DeleteTimelineError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error, Clone)]
|
||||
enum InnerDeleteTimelineError {
|
||||
#[error("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")]
|
||||
QueueUninitialized,
|
||||
|
||||
#[error("failpoint: {0}")]
|
||||
Failpoint(&'static str),
|
||||
|
||||
#[error("failed to remove local timeline directory")]
|
||||
FailedToRemoveLocalTimelineDirectory,
|
||||
|
||||
#[error("index_part.json upload failed")]
|
||||
UploadFailed,
|
||||
|
||||
#[error("the deleted timeline grew branches while deleting it; tenant should now be broken")]
|
||||
DeletedGrewChildren,
|
||||
}
|
||||
|
||||
impl utils::shared_retryable::Retryable for InnerDeleteTimelineError {
|
||||
fn is_permanent(&self) -> bool {
|
||||
use InnerDeleteTimelineError::*;
|
||||
|
||||
match self {
|
||||
QueueUninitialized => false,
|
||||
Failpoint(_) => false,
|
||||
FailedToRemoveLocalTimelineDirectory => false,
|
||||
UploadFailed => false,
|
||||
DeletedGrewChildren => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<InnerDeleteTimelineError> for DeleteTimelineError {
|
||||
fn from(value: InnerDeleteTimelineError) -> Self {
|
||||
DeleteTimelineError::Other(anyhow::Error::new(value))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<utils::shared_retryable::RetriedTaskPanicked> for DeleteTimelineError {
|
||||
fn from(_: utils::shared_retryable::RetriedTaskPanicked) -> Self {
|
||||
DeleteTimelineError::Other(anyhow::anyhow!("deleting timeline failed, please retry"))
|
||||
}
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
@@ -493,7 +538,7 @@ impl Tenant {
|
||||
|
||||
let timeline = {
|
||||
// avoiding holding it across awaits
|
||||
let mut timelines_accessor = self.timelines.lock();
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
if timelines_accessor.contains_key(&timeline_id) {
|
||||
anyhow::bail!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
@@ -560,6 +605,7 @@ impl Tenant {
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
@@ -804,7 +850,7 @@ impl Tenant {
|
||||
.context("Failed to create new timeline directory")?;
|
||||
|
||||
let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
|
||||
|| {
|
||||
anyhow::anyhow!(
|
||||
@@ -1139,7 +1185,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
active_only: bool,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timelines_accessor = self.timelines.lock();
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let timeline = timelines_accessor.get(&timeline_id).with_context(|| {
|
||||
format!("Timeline {}/{} was not found", self.tenant_id, timeline_id)
|
||||
})?;
|
||||
@@ -1159,7 +1205,12 @@ impl Tenant {
|
||||
/// Lists timelines the tenant contains.
|
||||
/// Up to tenant's implementation to omit certain timelines that ar not considered ready for use.
|
||||
pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
|
||||
self.timelines.lock().values().map(Arc::clone).collect()
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.map(Arc::clone)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// This is used to create the initial 'main' timeline during bootstrapping,
|
||||
@@ -1177,7 +1228,7 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
|
||||
@@ -1309,7 +1360,7 @@ impl Tenant {
|
||||
// compactions. We don't want to block everything else while the
|
||||
// compaction runs.
|
||||
let timelines_to_compact = {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
|
||||
@@ -1338,7 +1389,7 @@ impl Tenant {
|
||||
// flushing. We don't want to block everything else while the
|
||||
// flushing is performed.
|
||||
let timelines_to_flush = {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(_id, timeline)| Arc::clone(timeline))
|
||||
@@ -1354,7 +1405,7 @@ impl Tenant {
|
||||
|
||||
/// Removes timeline-related in-memory data
|
||||
pub async fn delete_timeline(
|
||||
&self,
|
||||
self: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
@@ -1363,7 +1414,7 @@ impl Tenant {
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock();
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
@@ -1387,162 +1438,196 @@ impl Tenant {
|
||||
timeline
|
||||
};
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to
|
||||
// shut down.
|
||||
let span = tracing::Span::current();
|
||||
|
||||
// if we have concurrent requests, we will only execute one version of following future;
|
||||
// initially it did not have any means to be cancelled.
|
||||
//
|
||||
// NB: If you call delete_timeline multiple times concurrently, they will
|
||||
// all go through the motions here. Make sure the code here is idempotent,
|
||||
// and don't error out if some of the shutdown tasks have already been
|
||||
// completed!
|
||||
// NOTE: Unlike "the usual" futures, this one will log any errors instead of just propagating
|
||||
// them to the caller. This is because this one future produces a value, which will need to
|
||||
// be cloneable to everyone interested, and normal `std::error::Error` are not clonable.
|
||||
// Also, it wouldn't make sense to log the same failure multiple times, it would look like
|
||||
// multiple failures to anyone reading the logs.
|
||||
let factory = || {
|
||||
let tenant = self.clone();
|
||||
let tenant_id = self.tenant_id;
|
||||
let timeline = timeline.clone();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
timeline.walreceiver.stop().await;
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
async move {
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to
|
||||
// shut down.
|
||||
//
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
timeline.walreceiver.stop().await;
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
warn!("failed to stop RemoteTimelineClient due to uninitialized queue");
|
||||
return Err(InnerDeleteTimelineError::QueueUninitialized);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
|
||||
|
||||
// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
// during attach or pageserver restart.
|
||||
// See comment in persist_index_part_with_deleted_flag.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
match remote_client.persist_index_part_with_deleted_flag().await {
|
||||
// If we (now, or already) marked it successfully as deleted, we can proceed
|
||||
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
|
||||
// Bail out otherwise
|
||||
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
|
||||
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
|
||||
warn!("upload failed: {e}");
|
||||
return Err(InnerDeleteTimelineError::UploadFailed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// Grab the layer_removal_cs lock, and actually perform the deletion.
|
||||
//
|
||||
// This lock prevents multiple concurrent delete_timeline calls from
|
||||
// stepping on each other's toes, while deleting the files. It also
|
||||
// prevents GC or compaction from running at the same time.
|
||||
//
|
||||
// Note that there are still other race conditions between
|
||||
// GC, compaction and timeline deletion. GC task doesn't
|
||||
// register itself properly with the timeline it's
|
||||
// operating on. See
|
||||
// https://github.com/neondatabase/neon/issues/2671
|
||||
//
|
||||
// No timeout here, GC & Compaction should be responsive to the
|
||||
// `TimelineState::Stopping` change.
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
let _layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by us earlier.
|
||||
|
||||
let local_timeline_directory =
|
||||
tenant.conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(InnerDeleteTimelineError::Failpoint(
|
||||
"failpoint: timeline-delete-before-rm",
|
||||
))
|
||||
});
|
||||
|
||||
// NB: This need not be atomic because the deleted flag in the IndexPart
|
||||
// will be observed during tenant/timeline load. The deletion will be resumed there.
|
||||
//
|
||||
// For configurations without remote storage, we tolerate that we're not crash-safe here.
|
||||
// The timeline may come up Active but with missing layer files, in such setups.
|
||||
// See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
|
||||
match std::fs::remove_dir_all(&local_timeline_directory) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// This can happen if we're called a second time, e.g.,
|
||||
// because of a previous failure/cancellation at/after
|
||||
// failpoint timeline-delete-after-rm.
|
||||
//
|
||||
// It can also happen if we race with tenant detach, because,
|
||||
// it doesn't grab the layer_removal_cs lock.
|
||||
//
|
||||
// For now, log and continue.
|
||||
// warn! level is technically not appropriate for the
|
||||
// first case because we should expect retries to happen.
|
||||
// But the error is so rare, it seems better to get attention if it happens.
|
||||
let tenant_state = tenant.current_state();
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
?tenant_state,
|
||||
"timeline directory not found, proceeding anyway"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to remove local timeline directory {}: {e:#}",
|
||||
local_timeline_directory.display()
|
||||
);
|
||||
return Err(
|
||||
InnerDeleteTimelineError::FailedToRemoveLocalTimelineDirectory,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(InnerDeleteTimelineError::Failpoint(
|
||||
"timeline-delete-after-rm",
|
||||
))
|
||||
});
|
||||
|
||||
// Remove the timeline from the map or poison it if we've grown children.
|
||||
let removed_timeline =
|
||||
std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines.iter().any(|(_, entry)| {
|
||||
entry.get_ancestor_timeline_id() == Some(timeline_id)
|
||||
});
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
timelines.remove(&timeline_id)
|
||||
}));
|
||||
|
||||
match removed_timeline {
|
||||
Ok(Some(_)) => {}
|
||||
Ok(None) => {
|
||||
// with SharedRetryable this should no longer happen
|
||||
warn!("no other task should had dropped the Timeline");
|
||||
}
|
||||
Err(_panic) => return Err(InnerDeleteTimelineError::DeletedGrewChildren),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
// execute in the *winner's* span so we will capture the request id etc.
|
||||
.instrument(span)
|
||||
};
|
||||
|
||||
let (recv, maybe_fut) = timeline.delete_self.try_restart(factory).await;
|
||||
|
||||
if let Some(fut) = maybe_fut {
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::DeleteTimeline,
|
||||
Some(self.tenant_id()),
|
||||
None,
|
||||
&format!("delete_timeline {}", timeline.timeline_id),
|
||||
false,
|
||||
async move {
|
||||
fut.await;
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||
// so, they are not affected by this shutdown_tasks() call.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
|
||||
|
||||
// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
// during attach or pageserver restart.
|
||||
// See comment in persist_index_part_with_deleted_flag.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
match remote_client.persist_index_part_with_deleted_flag().await {
|
||||
// If we (now, or already) marked it successfully as deleted, we can proceed
|
||||
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
|
||||
// Bail out otherwise
|
||||
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
|
||||
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// Grab the layer_removal_cs lock, and actually perform the deletion.
|
||||
//
|
||||
// This lock prevents multiple concurrent delete_timeline calls from
|
||||
// stepping on each other's toes, while deleting the files. It also
|
||||
// prevents GC or compaction from running at the same time.
|
||||
//
|
||||
// Note that there are still other race conditions between
|
||||
// GC, compaction and timeline deletion. GC task doesn't
|
||||
// register itself properly with the timeline it's
|
||||
// operating on. See
|
||||
// https://github.com/neondatabase/neon/issues/2671
|
||||
//
|
||||
// No timeout here, GC & Compaction should be responsive to the
|
||||
// `TimelineState::Stopping` change.
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
|
||||
let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
});
|
||||
|
||||
// NB: This need not be atomic because the deleted flag in the IndexPart
|
||||
// will be observed during tenant/timeline load. The deletion will be resumed there.
|
||||
//
|
||||
// For configurations without remote storage, we tolerate that we're not crash-safe here.
|
||||
// The timeline may come up Active but with missing layer files, in such setups.
|
||||
// See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
|
||||
match std::fs::remove_dir_all(&local_timeline_directory) {
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// This can happen if we're called a second time, e.g.,
|
||||
// because of a previous failure/cancellation at/after
|
||||
// failpoint timeline-delete-after-rm.
|
||||
//
|
||||
// It can also happen if we race with tenant detach, because,
|
||||
// it doesn't grab the layer_removal_cs lock.
|
||||
//
|
||||
// For now, log and continue.
|
||||
// warn! level is technically not appropriate for the
|
||||
// first case because we should expect retries to happen.
|
||||
// But the error is so rare, it seems better to get attention if it happens.
|
||||
let tenant_state = self.current_state();
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
?tenant_state,
|
||||
"timeline directory not found, proceeding anyway"
|
||||
);
|
||||
// continue with the rest of the deletion
|
||||
}
|
||||
res => res.with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?,
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
drop(layer_removal_guard);
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
});
|
||||
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = self.timelines.lock();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
let removed_timeline = timelines.remove(&timeline_id);
|
||||
if removed_timeline.is_none() {
|
||||
// This can legitimately happen if there's a concurrent call to this function.
|
||||
// T1 T2
|
||||
// lock
|
||||
// unlock
|
||||
// lock
|
||||
// unlock
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map
|
||||
// unlock
|
||||
// return
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map observes empty map
|
||||
// unlock
|
||||
// return
|
||||
debug!("concurrent call to this function won the race");
|
||||
}
|
||||
drop(timelines);
|
||||
|
||||
Ok(())
|
||||
recv.await
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TenantState {
|
||||
@@ -1578,7 +1663,7 @@ impl Tenant {
|
||||
|
||||
debug!(tenant_id = %self.tenant_id, "Activating tenant");
|
||||
|
||||
let timelines_accessor = self.timelines.lock();
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
.values()
|
||||
.filter(|timeline| timeline.current_state() != TimelineState::Broken);
|
||||
@@ -1645,7 +1730,7 @@ impl Tenant {
|
||||
// might be created after this. That's harmless, as the Timelines
|
||||
// won't be accessible to anyone, when the Tenant is in Stopping
|
||||
// state.
|
||||
let timelines_accessor = self.timelines.lock();
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
.values()
|
||||
.filter(|timeline| timeline.current_state() != TimelineState::Broken);
|
||||
@@ -1938,7 +2023,7 @@ impl Tenant {
|
||||
// activation times.
|
||||
loading_started_at: Instant::now(),
|
||||
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
|
||||
timelines: parking_lot::Mutex::new(HashMap::new()),
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
@@ -2166,7 +2251,7 @@ impl Tenant {
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let mut all_branchpoints = BTreeSet::new();
|
||||
let timeline_ids = {
|
||||
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
|
||||
@@ -2266,7 +2351,7 @@ impl Tenant {
|
||||
// Create a placeholder for the new branch. This will error
|
||||
// out if the new timeline ID is already in use.
|
||||
let timeline_uninit_mark = {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(dst_id, &timelines)?
|
||||
};
|
||||
|
||||
@@ -2331,7 +2416,7 @@ impl Tenant {
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
let mut timelines = self.timelines.lock();
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let new_timeline = self
|
||||
.prepare_timeline(
|
||||
dst_id,
|
||||
@@ -2368,7 +2453,7 @@ impl Tenant {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_uninit_mark = {
|
||||
let timelines = self.timelines.lock();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(timeline_id, &timelines)?
|
||||
};
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
@@ -2454,7 +2539,7 @@ impl Tenant {
|
||||
// Initialize the timeline without loading the layer map, because we already updated the layer
|
||||
// map above, when we imported the datadir.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock();
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
|
||||
};
|
||||
|
||||
@@ -2501,7 +2586,8 @@ impl Tenant {
|
||||
) {
|
||||
Ok(new_timeline) => {
|
||||
if init_layers {
|
||||
new_timeline.layers.write().next_open_layer_at = Some(new_timeline.initdb_lsn);
|
||||
new_timeline.layers.write().unwrap().next_open_layer_at =
|
||||
Some(new_timeline.initdb_lsn);
|
||||
}
|
||||
debug!(
|
||||
"Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"
|
||||
@@ -2556,7 +2642,7 @@ impl Tenant {
|
||||
fn create_timeline_uninit_mark(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
timelines: &parking_lot::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
) -> anyhow::Result<TimelineUninitMark> {
|
||||
let tenant_id = self.tenant_id;
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ use pageserver_api::models::{
|
||||
};
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
@@ -336,13 +335,6 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) type GetValueReconstructFuture = Pin<
|
||||
Box<
|
||||
dyn Send
|
||||
+ std::future::Future<Output = Result<(ValueReconstructState, ValueReconstructResult)>>,
|
||||
>,
|
||||
>;
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`].
|
||||
///
|
||||
@@ -380,12 +372,12 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture;
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
@@ -494,12 +486,12 @@ impl Layer for LayerDescriptor {
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture {
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -56,8 +56,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
DeltaFileName, GetValueReconstructFuture, Layer, LayerAccessStats, LayerAccessStatsReset,
|
||||
LayerFileName, LayerIter, LayerKeyIter, PathOrConf,
|
||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
|
||||
LayerKeyIter, PathOrConf,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -318,94 +318,89 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture {
|
||||
Box::pin(async move {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.key_range.contains(&key));
|
||||
ensure!(self.key_range.contains(&key));
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
})?;
|
||||
!blob_ref.will_init()
|
||||
})?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
}
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
})
|
||||
}
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -53,7 +53,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::{ImageFileName, LayerFileName};
|
||||
use super::{GetValueReconstructFuture, Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
|
||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -197,45 +197,38 @@ impl Layer for ImageLayer {
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture {
|
||||
Box::pin(async move {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Missing))
|
||||
}
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
})
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
@@ -27,9 +27,9 @@ use utils::{
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, GetValueReconstructFuture, Layer};
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
|
||||
thread_local! {
|
||||
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
|
||||
@@ -191,60 +191,52 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture {
|
||||
Box::pin(async move {
|
||||
// The in-memory layer isn't actually in-memory. It uses EphemeralFile.
|
||||
// So, this does do IO.
|
||||
tokio::task::spawn_blocking(move || {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok((reconstruct_state, ValueReconstructResult::Complete));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
}
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
})
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use anyhow::{bail, Result};
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
@@ -21,8 +21,8 @@ use utils::{
|
||||
use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
use super::image_layer::ImageLayer;
|
||||
use super::{
|
||||
DeltaLayer, GetValueReconstructFuture, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer,
|
||||
DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
LayerResidenceStatus, PersistentLayer,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -83,18 +83,16 @@ impl Layer for RemoteLayer {
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> GetValueReconstructFuture {
|
||||
Box::pin(async move {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
);
|
||||
})
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
);
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
|
||||
@@ -121,7 +121,7 @@ pub struct Timeline {
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
pub(super) layers: parking_lot::RwLock<LayerMap<dyn PersistentLayer>>,
|
||||
pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -227,6 +227,9 @@ pub struct Timeline {
|
||||
state: watch::Sender<TimelineState>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
pub(super) delete_self:
|
||||
utils::shared_retryable::SharedRetryable<Result<(), super::InnerDeleteTimelineError>>,
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
@@ -504,13 +507,12 @@ impl Timeline {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let reconstruct_state = ValueReconstructState {
|
||||
let mut reconstruct_state = ValueReconstructState {
|
||||
records: Vec::new(),
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
let reconstruct_state = self
|
||||
.get_reconstruct_data(key, lsn, reconstruct_state, ctx)
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
|
||||
self.metrics
|
||||
@@ -548,7 +550,7 @@ impl Timeline {
|
||||
/// This method makes no distinction between local and remote layers.
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub fn layer_size_sum(&self) -> u64 {
|
||||
let layer_map = self.layers.read();
|
||||
let layer_map = self.layers.read().unwrap();
|
||||
let mut size = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size();
|
||||
@@ -848,7 +850,7 @@ impl Timeline {
|
||||
/// safekeepers to regard pageserver as caught up and suspend activity.
|
||||
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let open_layer_size = open_layer.size()?;
|
||||
drop(layers);
|
||||
@@ -928,7 +930,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
|
||||
let layer_map = self.layers.read();
|
||||
let layer_map = self.layers.read().unwrap();
|
||||
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
||||
if let Some(open_layer) = &layer_map.open_layer {
|
||||
in_memory_layers.push(open_layer.info());
|
||||
@@ -1048,7 +1050,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// start the batch update
|
||||
let mut layer_map = self.layers.write();
|
||||
let mut layer_map = self.layers.write().unwrap();
|
||||
let mut batch_updates = layer_map.batch_update();
|
||||
|
||||
let mut results = Vec::with_capacity(layers_to_evict.len());
|
||||
@@ -1312,7 +1314,7 @@ impl Timeline {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
layers: parking_lot::RwLock::new(LayerMap::default()),
|
||||
layers: RwLock::new(LayerMap::default()),
|
||||
|
||||
walredo_mgr,
|
||||
walreceiver,
|
||||
@@ -1380,6 +1382,8 @@ impl Timeline {
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
|
||||
delete_self: utils::shared_retryable::SharedRetryable::default(),
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
@@ -1453,7 +1457,7 @@ impl Timeline {
|
||||
/// Returns all timeline-related files that were found and loaded.
|
||||
///
|
||||
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let mut num_layers = 0;
|
||||
|
||||
@@ -1582,7 +1586,7 @@ impl Timeline {
|
||||
|
||||
// We're holding a layer map lock for a while but this
|
||||
// method is only called during init so it's fine.
|
||||
let mut layer_map = self.layers.write();
|
||||
let mut layer_map = self.layers.write().unwrap();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for remote_layer_name in &index_part.timeline_layers {
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
@@ -1735,6 +1739,7 @@ impl Timeline {
|
||||
let local_layers = self
|
||||
.layers
|
||||
.read()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.map(|l| (l.filename(), l))
|
||||
.collect::<HashMap<_, _>>();
|
||||
@@ -2060,7 +2065,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
|
||||
for historic_layer in self.layers.read().iter_historic_layers() {
|
||||
for historic_layer in self.layers.read().unwrap().iter_historic_layers() {
|
||||
let historic_layer_name = historic_layer.filename().file_name();
|
||||
if layer_file_name == historic_layer_name {
|
||||
return Some(historic_layer);
|
||||
@@ -2143,31 +2148,13 @@ impl Timeline {
|
||||
///
|
||||
/// This function takes the current timeline's locked LayerMap as an argument,
|
||||
/// so callers can avoid potential race conditions.
|
||||
///
|
||||
// TODO: find a way to not hold the Timeline::layers lock during get_value_reconstruct_data calls.
|
||||
//
|
||||
// Since these calls do local disk IO, they'll be reasonably fast, until come disk IOPS bound.
|
||||
// We have lots of headroom on current pageservers, so, it's going to be fine for now.
|
||||
//
|
||||
// We can't use tokio::sync::RwLock that easily because its guard is not Send, but,
|
||||
// many tasks that access Timeline::layers run inside task_mgr tasks, which are required
|
||||
// to be Send. It has been tried in origin/problame/asyncify-get-reconstruct-data--tokio-sync.
|
||||
//
|
||||
// The solution will probably be to have an immutable + multi-versioned layer map, allowing
|
||||
// us to grab a snapshot of the layer map once and execute this function on the snapshot.
|
||||
//
|
||||
// Or, we could invest time to figure out whether we can drop the layer map lock after
|
||||
// we grabbed the layer, do the IO, re-aquire, and continue the traversal.
|
||||
//
|
||||
// (Why is this allow() not inside the function? Because clippy doesn't respect it then).
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
async fn get_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructState, PageReconstructError> {
|
||||
) -> Result<(), PageReconstructError> {
|
||||
// Start from the current timeline.
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
@@ -2194,12 +2181,12 @@ impl Timeline {
|
||||
// The function should have updated 'state'
|
||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||
match result {
|
||||
ValueReconstructResult::Complete => return Ok(reconstruct_state),
|
||||
ValueReconstructResult::Complete => return Ok(()),
|
||||
ValueReconstructResult::Continue => {
|
||||
// If we reached an earlier cached page image, we're done.
|
||||
if cont_lsn == cached_lsn + 1 {
|
||||
self.metrics.materialized_page_cache_hit_counter.inc_by(1);
|
||||
return Ok(reconstruct_state);
|
||||
return Ok(());
|
||||
}
|
||||
if prev_lsn <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
@@ -2245,7 +2232,7 @@ impl Timeline {
|
||||
#[allow(clippy::never_loop)] // see comment at bottom of this loop
|
||||
'layer_map_search: loop {
|
||||
let remote_layer = {
|
||||
let layers = timeline.layers.read();
|
||||
let layers = timeline.layers.read().unwrap();
|
||||
|
||||
// Check the open and frozen in-memory layers first, in order from newest
|
||||
// to oldest.
|
||||
@@ -2256,19 +2243,13 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match Arc::clone(open_layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match open_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -2288,19 +2269,13 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match Arc::clone(frozen_layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match frozen_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -2328,19 +2303,13 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
result = match Arc::clone(&layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -2443,7 +2412,7 @@ impl Timeline {
|
||||
/// Get a handle to the latest layer for appending.
|
||||
///
|
||||
fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
@@ -2516,7 +2485,7 @@ impl Timeline {
|
||||
} else {
|
||||
Some(self.write_lock.lock().unwrap())
|
||||
};
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
@@ -2554,7 +2523,7 @@ impl Timeline {
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let result = loop {
|
||||
let layer_to_flush = {
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
layers.frozen_layers.front().cloned()
|
||||
// drop 'layers' lock to allow concurrent reads and writes
|
||||
};
|
||||
@@ -2655,7 +2624,7 @@ impl Timeline {
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
// in-memory layer from the map now.
|
||||
{
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let l = layers.frozen_layers.pop_front();
|
||||
|
||||
// Only one thread may call this function at a time (for this
|
||||
@@ -2773,7 +2742,7 @@ impl Timeline {
|
||||
|
||||
// Add it to the layer map
|
||||
let l = Arc::new(new_delta);
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut batch_updates = layers.batch_update();
|
||||
l.access_stats().record_residence_event(
|
||||
&batch_updates,
|
||||
@@ -2828,7 +2797,7 @@ impl Timeline {
|
||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_deltas = 0;
|
||||
|
||||
@@ -2971,7 +2940,7 @@ impl Timeline {
|
||||
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
for l in image_layers {
|
||||
@@ -3038,7 +3007,7 @@ impl Timeline {
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||
drop(layers);
|
||||
|
||||
@@ -3161,7 +3130,7 @@ impl Timeline {
|
||||
// Determine N largest holes where N is number of compacted layers.
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read(); // Is'n it better to hold original layers lock till here?
|
||||
let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here?
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
|
||||
@@ -3398,7 +3367,7 @@ impl Timeline {
|
||||
.context("wait for layer upload ops to complete")?;
|
||||
}
|
||||
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||
for l in new_layers {
|
||||
@@ -3657,7 +3626,7 @@ impl Timeline {
|
||||
// 4. newer on-disk image layers cover the layer's whole key range
|
||||
//
|
||||
// TODO holding a write lock is too agressive and avoidable
|
||||
let mut layers = self.layers.write();
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
|
||||
@@ -3940,7 +3909,7 @@ impl Timeline {
|
||||
|
||||
// Download complete. Replace the RemoteLayer with the corresponding
|
||||
// Delta- or ImageLayer in the layer map.
|
||||
let mut layers = self_clone.layers.write();
|
||||
let mut layers = self_clone.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
|
||||
{
|
||||
@@ -4098,7 +4067,7 @@ impl Timeline {
|
||||
) {
|
||||
let mut downloads = Vec::new();
|
||||
{
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.filter_map(|l| l.downcast_remote_layer())
|
||||
@@ -4201,7 +4170,7 @@ impl LocalLayerInfoForDiskUsageEviction {
|
||||
|
||||
impl Timeline {
|
||||
pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
let mut resident_layers = Vec::new();
|
||||
|
||||
@@ -178,7 +178,7 @@ impl Timeline {
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let layers = self.layers.read();
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
if hist_layer.is_remote_layer() {
|
||||
|
||||
@@ -346,20 +346,31 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
|
||||
failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
|
||||
ps_http.configure_failpoints((failpoint_name, "pause"))
|
||||
|
||||
def first_call(result_queue):
|
||||
def delete_timeline_call(name, result_queue, barrier):
|
||||
if barrier:
|
||||
barrier.wait()
|
||||
try:
|
||||
log.info("first call start")
|
||||
log.info(f"{name} call start")
|
||||
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
|
||||
log.info("first call success")
|
||||
log.info(f"{name} call success")
|
||||
result_queue.put("success")
|
||||
except Exception:
|
||||
log.exception("first call failed")
|
||||
log.exception(f"{name} call failed")
|
||||
result_queue.put("failure, see log for stack trace")
|
||||
|
||||
first_call_result: queue.Queue[str] = queue.Queue()
|
||||
first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
|
||||
delete_results: queue.Queue[str] = queue.Queue()
|
||||
first_call_thread = threading.Thread(
|
||||
target=delete_timeline_call,
|
||||
args=(
|
||||
"1st",
|
||||
delete_results,
|
||||
None,
|
||||
),
|
||||
)
|
||||
first_call_thread.start()
|
||||
|
||||
second_call_thread = None
|
||||
|
||||
try:
|
||||
|
||||
def first_call_hit_failpoint():
|
||||
@@ -369,30 +380,42 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
|
||||
|
||||
wait_until(50, 0.1, first_call_hit_failpoint)
|
||||
|
||||
# make the second call and assert behavior
|
||||
log.info("second call start")
|
||||
error_msg_re = "another task is already setting the deleted_flag, started at"
|
||||
with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
|
||||
ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
|
||||
assert second_call_err.value.status_code == 500
|
||||
env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
|
||||
# the second call will try to transition the timeline into Stopping state as well
|
||||
barrier = threading.Barrier(2)
|
||||
second_call_thread = threading.Thread(
|
||||
target=delete_timeline_call,
|
||||
args=(
|
||||
"2nd",
|
||||
delete_results,
|
||||
barrier,
|
||||
),
|
||||
)
|
||||
second_call_thread.start()
|
||||
|
||||
barrier.wait()
|
||||
|
||||
# release the pause
|
||||
ps_http.configure_failpoints((failpoint_name, "off"))
|
||||
|
||||
# both should had succeeded: the second call will coalesce with the already-ongoing first call
|
||||
result = delete_results.get()
|
||||
assert result == "success"
|
||||
result = delete_results.get()
|
||||
assert result == "success"
|
||||
|
||||
# the second call will try to transition the timeline into Stopping state, but it's already in that state
|
||||
# (the transition to Stopping state is not part of the request coalescing, because Tenant and Timeline states are a mess already)
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
|
||||
)
|
||||
log.info("second call failed as expected")
|
||||
|
||||
# by now we know that the second call failed, let's ensure the first call will finish
|
||||
ps_http.configure_failpoints((failpoint_name, "off"))
|
||||
|
||||
result = first_call_result.get()
|
||||
assert result == "success"
|
||||
|
||||
finally:
|
||||
log.info("joining first call thread")
|
||||
log.info("joining 1st thread")
|
||||
# in any case, make sure the lifetime of the thread is bounded to this test
|
||||
first_call_thread.join()
|
||||
|
||||
if second_call_thread:
|
||||
log.info("joining 2nd thread")
|
||||
second_call_thread.join()
|
||||
|
||||
|
||||
def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
@@ -440,9 +463,16 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
|
||||
# ok, retry without failpoint, it should succeed
|
||||
ps_http.configure_failpoints((failpoint_name, "off"))
|
||||
|
||||
# this should succeed
|
||||
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
|
||||
# the second call will try to transition the timeline into Stopping state, but it's already in that state
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
|
||||
)
|
||||
try:
|
||||
ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
|
||||
)
|
||||
except PageserverApiException as e:
|
||||
if e.status_code != 404:
|
||||
raise e
|
||||
else:
|
||||
# mock_s3 was fast enough to delete before we got the request in
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*{child_timeline_id}.*Error processing HTTP request: NotFound: timeline not found"
|
||||
)
|
||||
|
||||
@@ -36,7 +36,6 @@ nom = { version = "7" }
|
||||
num-bigint = { version = "0.4" }
|
||||
num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128"] }
|
||||
parking_lot = { version = "0.12", features = ["send_guard"] }
|
||||
prost = { version = "0.11" }
|
||||
rand = { version = "0.8", features = ["small_rng"] }
|
||||
regex = { version = "1" }
|
||||
|
||||
Reference in New Issue
Block a user