mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 12:30:38 +00:00
pageserver: deletion queue & generation validation for deletions (#5207)
## Problem Pageservers must not delete objects or advertise updates to remote_consistent_lsn without checking that they hold the latest generation for the tenant in question (see [the RFC]( https://github.com/neondatabase/neon/blob/main/docs/rfcs/025-generation-numbers.md)) In this PR: - A new "deletion queue" subsystem is introduced, through which deletions flow - `RemoteTimelineClient` is modified to send deletions through the deletion queue: - For GC & compaction, deletions flow through the full generation verifying process - For timeline deletions, deletions take a fast path that bypasses generation verification - The `last_uploaded_consistent_lsn` value in `UploadQueue` is replaced with a mechanism that maintains a "projected" lsn (equivalent to the previous property), and a "visible" LSN (which is the one that we may share with safekeepers). - Until `control_plane_api` is set, all deletions skip generation validation - Tests are introduced for the new functionality in `test_pageserver_generations.py` Once this lands, if a pageserver is configured with the `control_plane_api` configuration added in https://github.com/neondatabase/neon/pull/5163, it becomes safe to attach a tenant to multiple pageservers concurrently. --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech> Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
@@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
if attach_req.pageserver_id.is_some() {
|
||||
tenant_state.generation += 1;
|
||||
}
|
||||
tenant_state.pageserver = attach_req.pageserver_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -363,8 +363,15 @@ pub struct TimelineInfo {
|
||||
pub latest_gc_cutoff_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we have succesfully uploaded to remote storage
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we are advertizing to safekeepers
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn_visible: Lsn,
|
||||
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
/// Sum of the size of all layer files.
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::{
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
/// As defined in S3 docs
|
||||
pub const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
/// Path on the remote storage, relative to some inner prefix.
|
||||
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl Serialize for RemotePath {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for RemotePath {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let str = String::deserialize(deserializer)?;
|
||||
Ok(Self(PathBuf::from(&str)))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
@@ -88,6 +111,10 @@ impl RemotePath {
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()?.to_str()
|
||||
}
|
||||
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
|
||||
@@ -33,11 +33,10 @@ use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
|
||||
|
||||
pub(super) mod metrics;
|
||||
|
||||
use self::metrics::{AttemptOutcome, RequestKind};
|
||||
@@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket {
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
|
||||
@@ -89,6 +89,22 @@ impl Generation {
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next(&self) -> Generation {
|
||||
match self {
|
||||
Self::Valid(n) => Self::Valid(*n + 1),
|
||||
Self::None => Self::Valid(1),
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into(self) -> Option<u32> {
|
||||
if let Self::Valid(v) = self {
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Generation {
|
||||
|
||||
@@ -24,6 +24,9 @@ pub enum ApiError {
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(Box<str>),
|
||||
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -52,6 +55,10 @@ impl ApiError {
|
||||
self.to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
),
|
||||
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
|
||||
"Shutting down".to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
deletion_queue::DeletionQueue,
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
@@ -346,9 +348,22 @@ fn start_pageserver(
|
||||
}
|
||||
};
|
||||
|
||||
// Top-level cancellation token for the process
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
remote_storage.clone(),
|
||||
ControlPlaneClient::new(conf, &shutdown_pageserver),
|
||||
conf,
|
||||
);
|
||||
if let Some(deletion_workers) = deletion_workers {
|
||||
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
|
||||
}
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
@@ -379,13 +394,13 @@ fn start_pageserver(
|
||||
};
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
conf,
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
@@ -481,9 +496,10 @@ fn start_pageserver(
|
||||
http::routes::State::new(
|
||||
conf,
|
||||
http_auth.clone(),
|
||||
remote_storage,
|
||||
remote_storage.clone(),
|
||||
broker_client.clone(),
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue.new_client(),
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
@@ -611,7 +627,12 @@ fn start_pageserver(
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
@@ -475,8 +475,8 @@ impl PageServerConfigBuilder {
|
||||
self.background_task_maximum_delay = BuilderValue::Set(delay);
|
||||
}
|
||||
|
||||
pub fn control_plane_api(&mut self, api: Url) {
|
||||
self.control_plane_api = BuilderValue::Set(Some(api))
|
||||
pub fn control_plane_api(&mut self, api: Option<Url>) {
|
||||
self.control_plane_api = BuilderValue::Set(api)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
@@ -580,6 +580,27 @@ impl PageServerConf {
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
pub fn deletion_prefix(&self) -> PathBuf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix()
|
||||
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
|
||||
}
|
||||
|
||||
pub fn deletion_header_path(&self) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
|
||||
}
|
||||
|
||||
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
@@ -747,7 +768,14 @@ impl PageServerConf {
|
||||
},
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
|
||||
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
|
||||
"control_plane_api" => {
|
||||
let parsed = parse_toml_string(key, item)?;
|
||||
if parsed.is_empty() {
|
||||
builder.control_plane_api(None)
|
||||
} else {
|
||||
builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
|
||||
}
|
||||
},
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
|
||||
use pageserver_api::control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
@@ -12,25 +14,34 @@ use utils::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
// Backoffs when control plane requests do not succeed: compromise between reducing load
|
||||
// on control plane, and retrying frequently when we are blocked on a control plane
|
||||
// response to make progress.
|
||||
const BACKOFF_INCREMENT: f64 = 0.1;
|
||||
const BACKOFF_MAX: f64 = 10.0;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
pub(crate) struct ControlPlaneClient {
|
||||
pub struct ControlPlaneClient {
|
||||
http_client: reqwest::Client,
|
||||
base_url: Url,
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// Represent operations which internally retry on all errors other than
|
||||
/// cancellation token firing: the only way they can fail is ShuttingDown.
|
||||
pub enum RetryForeverError {
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ControlPlaneGenerationsApi {
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantId, Generation)>,
|
||||
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
|
||||
}
|
||||
|
||||
impl ControlPlaneClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
let mut url = match conf.control_plane_api.as_ref() {
|
||||
Some(u) => u.clone(),
|
||||
None => return None,
|
||||
@@ -54,27 +65,62 @@ impl ControlPlaneClient {
|
||||
})
|
||||
}
|
||||
|
||||
async fn try_re_attach(
|
||||
async fn retry_http_forever<R, T>(
|
||||
&self,
|
||||
url: Url,
|
||||
request: &ReAttachRequest,
|
||||
) -> anyhow::Result<ReAttachResponse> {
|
||||
match self.http_client.post(url).json(request).send().await {
|
||||
Err(e) => Err(anyhow::Error::from(e)),
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
r.json::<ReAttachResponse>()
|
||||
.await
|
||||
.map_err(anyhow::Error::from)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
|
||||
}
|
||||
url: &url::Url,
|
||||
request: R,
|
||||
) -> Result<T, RetryForeverError>
|
||||
where
|
||||
R: Serialize,
|
||||
T: DeserializeOwned,
|
||||
{
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum RemoteAttemptError {
|
||||
#[error("shutdown")]
|
||||
Shutdown,
|
||||
#[error("remote: {0}")]
|
||||
Remote(reqwest::Error),
|
||||
}
|
||||
|
||||
match backoff::retry(
|
||||
|| async {
|
||||
let response = self
|
||||
.http_client
|
||||
.post(url.clone())
|
||||
.json(&request)
|
||||
.send()
|
||||
.await
|
||||
.map_err(RemoteAttemptError::Remote)?;
|
||||
|
||||
response
|
||||
.error_for_status_ref()
|
||||
.map_err(RemoteAttemptError::Remote)?;
|
||||
response
|
||||
.json::<T>()
|
||||
.await
|
||||
.map_err(RemoteAttemptError::Remote)
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"calling control plane generation validation API",
|
||||
backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
|
||||
Err(RemoteAttemptError::Remote(_)) => {
|
||||
panic!("We retry forever, this should never be reached");
|
||||
}
|
||||
Ok(r) => Ok(r),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Block until we get a successful response
|
||||
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
|
||||
#[async_trait::async_trait]
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
@@ -83,37 +129,47 @@ impl ControlPlaneClient {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
|
||||
match result {
|
||||
Ok(res) => {
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
res.tenants.len()
|
||||
);
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
response.tenants.len()
|
||||
);
|
||||
|
||||
return Ok(res
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.collect::<HashMap<_, _>>());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
|
||||
backoff::exponential_backoff(
|
||||
attempt,
|
||||
BACKOFF_INCREMENT,
|
||||
BACKOFF_MAX,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(anyhow::anyhow!("Shutting down"));
|
||||
}
|
||||
attempt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantId, Generation)>,
|
||||
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
|
||||
let request = ValidateRequest {
|
||||
tenants: tenants
|
||||
.into_iter()
|
||||
.map(|(id, gen)| ValidateRequestTenant {
|
||||
id,
|
||||
gen: gen
|
||||
.into()
|
||||
.expect("Generation should always be valid for a Tenant doing deletions"),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|rt| (rt.id, rt.valid))
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
1312
pageserver/src/deletion_queue.rs
Normal file
1312
pageserver/src/deletion_queue.rs
Normal file
File diff suppressed because it is too large
Load Diff
156
pageserver/src/deletion_queue/deleter.rs
Normal file
156
pageserver/src/deletion_queue/deleter.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
//! The deleter is the final stage in the deletion queue. It accumulates remote
|
||||
//! paths to delete, and periodically executes them in batches of up to 1000
|
||||
//! using the DeleteObjects request.
|
||||
//!
|
||||
//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
|
||||
//! number of full-sized DeleteObjects requests, rather than a larger number of
|
||||
//! smaller requests.
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::metrics;
|
||||
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
|
||||
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
pub(super) enum DeleterMessage {
|
||||
Delete(Vec<RemotePath>),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
|
||||
/// Non-persistent deletion queue, for coalescing multiple object deletes into
|
||||
/// larger DeleteObjects requests.
|
||||
pub(super) struct Deleter {
|
||||
// Accumulate up to 1000 keys for the next deletion operation
|
||||
accumulator: Vec<RemotePath>,
|
||||
|
||||
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
impl Deleter {
|
||||
pub(super) fn new(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
remote_storage,
|
||||
rx,
|
||||
cancel,
|
||||
accumulator: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap the remote `delete_objects` with a failpoint
|
||||
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||
info!("Skipping execution, failpoint set");
|
||||
metrics::DELETION_QUEUE
|
||||
.remote_errors
|
||||
.with_label_values(&["failpoint"])
|
||||
.inc();
|
||||
Err(anyhow::anyhow!("failpoint hit"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
}
|
||||
|
||||
/// Block until everything in accumulator has been executed
|
||||
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
|
||||
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
|
||||
match self.remote_delete().await {
|
||||
Ok(()) => {
|
||||
// Note: we assume that the remote storage layer returns Ok(()) if some
|
||||
// or all of the deleted objects were already gone.
|
||||
metrics::DELETION_QUEUE
|
||||
.keys_executed
|
||||
.inc_by(self.accumulator.len() as u64);
|
||||
info!(
|
||||
"Executed deletion batch {}..{}",
|
||||
self.accumulator
|
||||
.first()
|
||||
.expect("accumulator should be non-empty"),
|
||||
self.accumulator
|
||||
.last()
|
||||
.expect("accumulator should be non-empty"),
|
||||
);
|
||||
self.accumulator.clear();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("DeleteObjects request failed: {e:#}, will retry");
|
||||
metrics::DELETION_QUEUE
|
||||
.remote_errors
|
||||
.with_label_values(&["execute"])
|
||||
.inc();
|
||||
}
|
||||
};
|
||||
}
|
||||
if self.cancel.is_cancelled() {
|
||||
// Expose an error because we may not have actually flushed everything
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
|
||||
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
|
||||
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending
|
||||
self.flush().await?;
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
DeleterMessage::Delete(mut list) => {
|
||||
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
self.flush().await?;
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
assert_eq!(self.accumulator.len(), 0);
|
||||
}
|
||||
|
||||
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
|
||||
let take_count = std::cmp::min(available_slots, list.len());
|
||||
for path in list.drain(list.len() - take_count..) {
|
||||
self.accumulator.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
DeleterMessage::Flush(flush_op) => {
|
||||
// If flush() errors, we drop the flush_op and the caller will get
|
||||
// an error recv()'ing their oneshot channel.
|
||||
self.flush().await?;
|
||||
flush_op.notify();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
487
pageserver/src/deletion_queue/list_writer.rs
Normal file
487
pageserver/src/deletion_queue/list_writer.rs
Normal file
@@ -0,0 +1,487 @@
|
||||
//! The list writer is the first stage in the deletion queue. It accumulates
|
||||
//! layers to delete, and periodically writes out these layers into a persistent
|
||||
//! DeletionList.
|
||||
//!
|
||||
//! The purpose of writing DeletionLists is to decouple the decision to
|
||||
//! delete an object from the validation required to execute it: even if
|
||||
//! validation is not possible, e.g. due to a control plane outage, we can
|
||||
//! still persist our intent to delete an object, in a way that would
|
||||
//! survive a restart.
|
||||
//!
|
||||
//! DeletionLists are passed onwards to the Validator.
|
||||
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::FlushOp;
|
||||
use super::ValidatorQueueMessage;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs::create_dir_all;
|
||||
use std::time::Duration;
|
||||
|
||||
use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::deletion_queue::TEMP_SUFFIX;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to deliver objects of the order
|
||||
// of magnitude 1MB when we are under heavy delete load.
|
||||
const DELETION_LIST_TARGET_SIZE: usize = 16384;
|
||||
|
||||
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
|
||||
// which we might leak objects from not flushing a DeletionList after
|
||||
// the objects are already unlinked from timeline metadata.
|
||||
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
|
||||
|
||||
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
|
||||
// more objects before doing the flush.
|
||||
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct DeletionOp {
|
||||
pub(super) tenant_id: TenantId,
|
||||
pub(super) timeline_id: TimelineId,
|
||||
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
||||
// have a config object handy to project it to a remote key, and need the consuming worker
|
||||
// to do it for you.
|
||||
pub(super) layers: Vec<(LayerFileName, Generation)>,
|
||||
pub(super) objects: Vec<RemotePath>,
|
||||
|
||||
/// The _current_ generation of the Tenant attachment in which we are enqueuing
|
||||
/// this deletion.
|
||||
pub(super) generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct RecoverOp {
|
||||
pub(super) attached_tenants: HashMap<TenantId, Generation>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum ListWriterQueueMessage {
|
||||
Delete(DeletionOp),
|
||||
// Wait until all prior deletions make it into a persistent DeletionList
|
||||
Flush(FlushOp),
|
||||
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
|
||||
FlushExecute(FlushOp),
|
||||
// Call once after re-attaching to control plane, to notify the deletion queue about
|
||||
// latest attached generations & load any saved deletion lists from disk.
|
||||
Recover(RecoverOp),
|
||||
}
|
||||
|
||||
pub(super) struct ListWriter {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
// Incoming frontend requests to delete some keys
|
||||
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
|
||||
|
||||
// Outbound requests to the backend to execute deletion lists we have composed.
|
||||
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
|
||||
|
||||
// The list we are currently building, contains a buffer of keys to delete
|
||||
// and our next sequence number
|
||||
pending: DeletionList,
|
||||
|
||||
// These FlushOps should notify the next time we flush
|
||||
pending_flushes: Vec<FlushOp>,
|
||||
|
||||
// Worker loop is torn down when this fires.
|
||||
cancel: CancellationToken,
|
||||
|
||||
// Safety guard to do recovery exactly once
|
||||
recovered: bool,
|
||||
}
|
||||
|
||||
impl ListWriter {
|
||||
// Initially DeletionHeader.validated_sequence is zero. The place we start our
|
||||
// sequence numbers must be higher than that.
|
||||
const BASE_SEQUENCE: u64 = 1;
|
||||
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
pending: DeletionList::new(Self::BASE_SEQUENCE),
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
pending_flushes: Vec::new(),
|
||||
cancel,
|
||||
recovered: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to flush `list` to persistent storage
|
||||
///
|
||||
/// This does not return errors, because on failure to flush we do not lose
|
||||
/// any state: flushing will be retried implicitly on the next deadline
|
||||
async fn flush(&mut self) {
|
||||
if self.pending.is_empty() {
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.notify();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
match self.pending.save(self.conf).await {
|
||||
Ok(_) => {
|
||||
info!(sequence = self.pending.sequence, "Stored deletion list");
|
||||
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.notify();
|
||||
}
|
||||
|
||||
// Take the list we've accumulated, replace it with a fresh list for the next sequence
|
||||
let next_list = DeletionList::new(self.pending.sequence + 1);
|
||||
let list = std::mem::replace(&mut self.pending, next_list);
|
||||
|
||||
if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
|
||||
// This is allowed to fail: it will only happen if the backend worker is shut down,
|
||||
// so we can just drop this on the floor.
|
||||
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
warn!(
|
||||
sequence = self.pending.sequence,
|
||||
"Failed to write deletion list, will retry later ({e:#})"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Load the header, to learn the sequence number up to which deletions
|
||||
/// have been validated. We will apply validated=true to DeletionLists
|
||||
/// <= this sequence when loading them.
|
||||
///
|
||||
/// It is not an error for the header to not exist: we return None, and
|
||||
/// the caller should act as if validated_sequence is 0
|
||||
async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
match tokio::fs::read(&header_path).await {
|
||||
Ok(header_bytes) => {
|
||||
match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
|
||||
Ok(h) => Ok(Some(h.validated_sequence)),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to deserialize deletion header, ignoring {}: {e:#}",
|
||||
header_path.display()
|
||||
);
|
||||
// This should never happen unless we make a mistake with our serialization.
|
||||
// Ignoring a deletion header is not consequential for correctnes because all deletions
|
||||
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
debug!(
|
||||
"Deletion header {} not found, first start?",
|
||||
header_path.display()
|
||||
);
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(anyhow::anyhow!(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover(
|
||||
&mut self,
|
||||
attached_tenants: HashMap<TenantId, Generation>,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
debug!(
|
||||
"recovering with {} attached tenants",
|
||||
attached_tenants.len()
|
||||
);
|
||||
|
||||
// Load the header
|
||||
let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
|
||||
|
||||
self.pending.sequence = validated_sequence + 1;
|
||||
|
||||
let deletion_directory = self.conf.deletion_prefix();
|
||||
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to open deletion list directory {}: {e:#}",
|
||||
deletion_directory.display(),
|
||||
);
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
let list_name_pattern =
|
||||
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
|
||||
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
let file_name = dentry.file_name();
|
||||
let dentry_str = file_name.to_string_lossy();
|
||||
|
||||
if Some(file_name.as_os_str()) == header_path.file_name() {
|
||||
// Don't try and parse the header's name like a list
|
||||
continue;
|
||||
}
|
||||
|
||||
if dentry_str.ends_with(TEMP_SUFFIX) {
|
||||
info!("Cleaning up temporary file {dentry_str}");
|
||||
let absolute_path = deletion_directory.join(dentry.file_name());
|
||||
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
|
||||
// Non-fatal error: we will just leave the file behind but not
|
||||
// try and load it.
|
||||
warn!(
|
||||
"Failed to clean up temporary file {}: {e:#}",
|
||||
absolute_path.display()
|
||||
);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_name = dentry.file_name().to_owned();
|
||||
let basename = file_name.to_string_lossy();
|
||||
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
|
||||
m.name("sequence")
|
||||
.expect("Non optional group should be present")
|
||||
.as_str()
|
||||
} else {
|
||||
warn!("Unexpected key in deletion queue: {basename}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
continue;
|
||||
};
|
||||
|
||||
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
warn!("Malformed key '{basename}': {e}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
seqs.push(seq);
|
||||
}
|
||||
seqs.sort();
|
||||
|
||||
// Start our next deletion list from after the last location validated by
|
||||
// previous process lifetime, or after the last location found (it is updated
|
||||
// below after enumerating the deletion lists)
|
||||
self.pending.sequence = validated_sequence + 1;
|
||||
if let Some(max_list_seq) = seqs.last() {
|
||||
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
|
||||
}
|
||||
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
|
||||
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
// Drop the list on the floor: any objects it referenced will be left behind
|
||||
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
|
||||
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if deletion_list.sequence <= validated_sequence {
|
||||
// If the deletion list falls below valid_seq, we may assume that it was
|
||||
// already validated the last time this pageserver ran. Otherwise, we still
|
||||
// load it, as it may still contain content valid in this generation.
|
||||
deletion_list.validated = true;
|
||||
} else {
|
||||
// Special case optimization: if a tenant is still attached, and no other
|
||||
// generation was issued to another node in the interval while we restarted,
|
||||
// then we may treat deletion lists from the previous generation as if they
|
||||
// belong to our currently attached generation, and proceed to validate & execute.
|
||||
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
|
||||
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
|
||||
if attached_gen.previous() == tenant_list.generation {
|
||||
tenant_list.generation = *attached_gen;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
validated = deletion_list.validated,
|
||||
sequence = deletion_list.sequence,
|
||||
"Recovered deletion list"
|
||||
);
|
||||
|
||||
// We will drop out of recovery if this fails: it indicates that we are shutting down
|
||||
// or the backend has panicked
|
||||
metrics::DELETION_QUEUE
|
||||
.keys_submitted
|
||||
.inc_by(deletion_list.len() as u64);
|
||||
self.tx
|
||||
.send(ValidatorQueueMessage::Delete(deletion_list))
|
||||
.await?;
|
||||
}
|
||||
|
||||
info!(next_sequence = self.pending.sequence, "Replay complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
|
||||
/// and write them out, for later validation by the backend and execution by the executor.
|
||||
pub(super) async fn background(&mut self) {
|
||||
info!("Started deletion frontend worker");
|
||||
|
||||
// Synchronous, but we only do it once per process lifetime so it's tolerable
|
||||
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
|
||||
tracing::error!(
|
||||
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
|
||||
self.conf.deletion_prefix().display()
|
||||
);
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
while !self.cancel.is_cancelled() {
|
||||
let timeout = if self.pending_flushes.is_empty() {
|
||||
FRONTEND_DEFAULT_TIMEOUT
|
||||
} else {
|
||||
FRONTEND_FLUSHING_TIMEOUT
|
||||
};
|
||||
|
||||
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
|
||||
Ok(Some(msg)) => msg,
|
||||
Ok(None) => {
|
||||
// Queue sender destroyed, shutting down
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Hit deadline, flush.
|
||||
self.flush().await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
ListWriterQueueMessage::Delete(op) => {
|
||||
assert!(
|
||||
self.recovered,
|
||||
"Cannot process deletions before recovery. This is a bug."
|
||||
);
|
||||
|
||||
debug!(
|
||||
"Delete: ingesting {} layers, {} other objects",
|
||||
op.layers.len(),
|
||||
op.objects.len()
|
||||
);
|
||||
|
||||
let mut layer_paths = Vec::new();
|
||||
for (layer, generation) in op.layers {
|
||||
layer_paths.push(remote_layer_path(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
&layer,
|
||||
generation,
|
||||
));
|
||||
}
|
||||
layer_paths.extend(op.objects);
|
||||
|
||||
if !self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
) {
|
||||
self.flush().await;
|
||||
let retry_succeeded = self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
);
|
||||
if !retry_succeeded {
|
||||
// Unexpected: after we flush, we should have
|
||||
// drained self.pending, so a conflict on
|
||||
// generation numbers should be impossible.
|
||||
tracing::error!(
|
||||
"Failed to enqueue deletions, leaking objects. This is a bug."
|
||||
);
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
ListWriterQueueMessage::Flush(op) => {
|
||||
if self.pending.is_empty() {
|
||||
// Execute immediately
|
||||
debug!("Flush: No pending objects, flushing immediately");
|
||||
op.notify()
|
||||
} else {
|
||||
// Execute next time we flush
|
||||
debug!("Flush: adding to pending flush list for next deadline flush");
|
||||
self.pending_flushes.push(op);
|
||||
}
|
||||
}
|
||||
ListWriterQueueMessage::FlushExecute(op) => {
|
||||
debug!("FlushExecute: passing through to backend");
|
||||
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
|
||||
if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
|
||||
info!("Can't flush, shutting down ({e})");
|
||||
// Caller will get error when their oneshot sender was dropped.
|
||||
}
|
||||
}
|
||||
ListWriterQueueMessage::Recover(op) => {
|
||||
if self.recovered {
|
||||
tracing::error!(
|
||||
"Deletion queue recovery called more than once. This is a bug."
|
||||
);
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
// Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Err(e) = self.recover(op.attached_tenants).await {
|
||||
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
|
||||
// queue receiver has been dropped, or something is critically broken with
|
||||
// the local filesystem holding deletion lists.
|
||||
info!(
|
||||
"Deletion queue recover aborted, deletion queue will not proceed ({e})"
|
||||
);
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
return;
|
||||
} else {
|
||||
self.recovered = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
|
||||
self.flush().await;
|
||||
}
|
||||
}
|
||||
info!("Deletion queue shut down.");
|
||||
}
|
||||
}
|
||||
414
pageserver/src/deletion_queue/validator.rs
Normal file
414
pageserver/src/deletion_queue/validator.rs
Normal file
@@ -0,0 +1,414 @@
|
||||
//! The validator is responsible for validating DeletionLists for execution,
|
||||
//! based on whethe the generation in the DeletionList is still the latest
|
||||
//! generation for a tenant.
|
||||
//!
|
||||
//! The purpose of validation is to ensure split-brain safety in the cluster
|
||||
//! of pageservers: a deletion may only be executed if the tenant generation
|
||||
//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
|
||||
//! The purpose of accumulating lists before validating them is to reduce load
|
||||
//! on the control plane API by issuing fewer, larger requests.
|
||||
//!
|
||||
//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
|
||||
//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
|
||||
//! to decide when old
|
||||
//!
|
||||
//! Deletions are passed onward to the Deleter.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
|
||||
use super::deleter::DeleterMessage;
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
use super::VisibleLsnUpdates;
|
||||
|
||||
// After this length of time, do any validation work that is pending,
|
||||
// even if we haven't accumulated many keys to delete.
|
||||
//
|
||||
// This also causes updates to remote_consistent_lsn to be validated, even
|
||||
// if there were no deletions enqueued.
|
||||
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
const AUTOFLUSH_KEY_COUNT: usize = 16384;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum ValidatorQueueMessage {
|
||||
Delete(DeletionList),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
pub(super) struct Validator<C>
|
||||
where
|
||||
C: ControlPlaneGenerationsApi,
|
||||
{
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
// Client for calling into control plane API for validation of deletes
|
||||
control_plane_client: Option<C>,
|
||||
|
||||
// DeletionLists which are waiting generation validation. Not safe to
|
||||
// execute until [`validate`] has processed them.
|
||||
pending_lists: Vec<DeletionList>,
|
||||
|
||||
// DeletionLists which have passed validation and are ready to execute.
|
||||
validated_lists: Vec<DeletionList>,
|
||||
|
||||
// Sum of all the lengths of lists in pending_lists
|
||||
pending_key_count: usize,
|
||||
|
||||
// Lsn validation state: we read projected LSNs and write back visible LSNs
|
||||
// after validation. This is the LSN equivalent of `pending_validation_lists`:
|
||||
// it is drained in [`validate`]
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
|
||||
// If we failed to rewrite a deletion list due to local filesystem I/O failure,
|
||||
// we must remember that and refuse to advance our persistent validated sequence
|
||||
// number past the failure.
|
||||
list_write_failed: Option<u64>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl<C> Validator<C>
|
||||
where
|
||||
C: ControlPlaneGenerationsApi,
|
||||
{
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
control_plane_client: Option<C>,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
control_plane_client,
|
||||
lsn_table,
|
||||
pending_lists: Vec::new(),
|
||||
validated_lists: Vec::new(),
|
||||
pending_key_count: 0,
|
||||
list_write_failed: None,
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
/// Process any outstanding validations of generations of pending LSN updates or pending
|
||||
/// DeletionLists.
|
||||
///
|
||||
/// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
|
||||
/// go into the queue of ready-to-execute lists.
|
||||
async fn validate(&mut self) -> Result<(), DeletionQueueError> {
|
||||
let mut tenant_generations = HashMap::new();
|
||||
for list in &self.pending_lists {
|
||||
for (tenant_id, tenant_list) in &list.tenants {
|
||||
// Note: DeletionLists are in logical time order, so generation always
|
||||
// goes up. By doing a simple insert() we will always end up with
|
||||
// the latest generation seen for a tenant.
|
||||
tenant_generations.insert(*tenant_id, tenant_list.generation);
|
||||
}
|
||||
}
|
||||
|
||||
let pending_lsn_updates = {
|
||||
let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
|
||||
std::mem::take(&mut *lsn_table)
|
||||
};
|
||||
for (tenant_id, update) in &pending_lsn_updates.tenants {
|
||||
let entry = tenant_generations
|
||||
.entry(*tenant_id)
|
||||
.or_insert(update.generation);
|
||||
if update.generation > *entry {
|
||||
*entry = update.generation;
|
||||
}
|
||||
}
|
||||
|
||||
if tenant_generations.is_empty() {
|
||||
// No work to do
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
|
||||
match control_plane_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
// The only way a validation call returns an error is when the cancellation token fires
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Control plane API disabled. In legacy mode we consider everything valid.
|
||||
tenant_generations.keys().map(|k| (*k, true)).collect()
|
||||
};
|
||||
|
||||
let mut validated_sequence: Option<u64> = None;
|
||||
|
||||
// Apply the validation results to the pending LSN updates
|
||||
for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
|
||||
let validated_generation = tenant_generations
|
||||
.get(&tenant_id)
|
||||
.expect("Map was built from the same keys we're reading");
|
||||
|
||||
let valid = tenants_valid
|
||||
.get(&tenant_id)
|
||||
.copied()
|
||||
// If the tenant was missing from the validation response, it has been deleted.
|
||||
// The Timeline that requested the LSN update is probably already torn down,
|
||||
// or will be torn down soon. In this case, drop the update by setting valid=false.
|
||||
.unwrap_or(false);
|
||||
|
||||
if valid && *validated_generation == tenant_lsn_state.generation {
|
||||
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
|
||||
pending_lsn.result_slot.store(pending_lsn.projected);
|
||||
}
|
||||
} else {
|
||||
// If we failed validation, then do not apply any of the projected updates
|
||||
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
|
||||
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// Apply the validation results to the pending deletion lists
|
||||
for list in &mut self.pending_lists {
|
||||
// Filter the list based on whether the server responded valid: true.
|
||||
// If a tenant is omitted in the response, it has been deleted, and we should
|
||||
// proceed with deletion.
|
||||
let mut mutated = false;
|
||||
list.tenants.retain(|tenant_id, tenant| {
|
||||
let validated_generation = tenant_generations
|
||||
.get(tenant_id)
|
||||
.expect("Map was built from the same keys we're reading");
|
||||
|
||||
// If the tenant was missing from the validation response, it has been deleted.
|
||||
// This means that a deletion is valid, but also redundant since the tenant's
|
||||
// objects should have already been deleted. Treat it as invalid to drop the
|
||||
// redundant deletion.
|
||||
let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
|
||||
|
||||
// A list is valid if it comes from the current _or previous_ generation.
|
||||
// - The previous generation case is permitted due to how we store deletion lists locally:
|
||||
// if we see the immediately previous generation in a locally stored deletion list,
|
||||
// it proves that this node's disk was used for both current & previous generations,
|
||||
// and therefore no other node was involved in between: the two generations may be
|
||||
// logically treated as the same.
|
||||
// - In that previous generation case, we rewrote it to the current generation
|
||||
// in recover(), so the comparison here is simply an equality.
|
||||
|
||||
let this_list_valid = valid
|
||||
&& (tenant.generation == *validated_generation);
|
||||
|
||||
if !this_list_valid {
|
||||
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
|
||||
mutated = true;
|
||||
}
|
||||
this_list_valid
|
||||
});
|
||||
list.validated = true;
|
||||
|
||||
if mutated {
|
||||
// Save the deletion list if we had to make changes due to stale generations. The
|
||||
// saved list is valid for execution.
|
||||
if let Err(e) = list.save(self.conf).await {
|
||||
// Highly unexpected. Could happen if e.g. disk full.
|
||||
// If we didn't save the trimmed list, it is _not_ valid to execute.
|
||||
warn!("Failed to save modified deletion list {list}: {e:#}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
|
||||
// Rather than have a complex retry process, just drop it and leak the objects,
|
||||
// scrubber will clean up eventually.
|
||||
list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
|
||||
|
||||
// We must remember this failure, to prevent later writing out a header that
|
||||
// would imply the unwritable list was valid on disk.
|
||||
if self.list_write_failed.is_none() {
|
||||
self.list_write_failed = Some(list.sequence);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
validated_sequence = Some(list.sequence);
|
||||
}
|
||||
|
||||
if let Some(validated_sequence) = validated_sequence {
|
||||
if let Some(list_write_failed) = self.list_write_failed {
|
||||
// Rare error case: we failed to write out a deletion list to excise invalid
|
||||
// entries, so we cannot advance the header's valid sequence number past that point.
|
||||
//
|
||||
// In this state we will continue to validate, execute and delete deletion lists,
|
||||
// we just cannot update the header. It should be noticed and fixed by a human due to
|
||||
// the nonzero value of our unexpected_errors metric.
|
||||
warn!(
|
||||
sequence_number = list_write_failed,
|
||||
"Cannot write header because writing a deletion list failed earlier",
|
||||
);
|
||||
} else {
|
||||
// Write the queue header to record how far validation progressed. This avoids having
|
||||
// to rewrite each DeletionList to set validated=true in it.
|
||||
let header = DeletionHeader::new(validated_sequence);
|
||||
|
||||
// Drop result because the validated_sequence is an optimization. If we fail to save it,
|
||||
// then restart, we will drop some deletion lists, creating work for scrubber.
|
||||
// The save() function logs a warning on error.
|
||||
if let Err(e) = header.save(self.conf).await {
|
||||
warn!("Failed to write deletion queue header: {e:#}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Transfer the validated lists to the validated queue, for eventual execution
|
||||
self.validated_lists.append(&mut self.pending_lists);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
|
||||
for list_path in list_paths {
|
||||
debug!("Removing deletion list {}", list_path.display());
|
||||
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files. We will leave the file behind. Subsequent
|
||||
// pageservers will try and load it again: hopefully whatever storage
|
||||
// issue (probably permissions) has been fixed by then.
|
||||
tracing::error!("Failed to delete {}: {e:#}", list_path.display());
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
|
||||
tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
|
||||
|
||||
// Issue any required generation validation calls to the control plane
|
||||
self.validate().await?;
|
||||
|
||||
// After successful validation, nothing is pending: any lists that
|
||||
// made it through validation will be in validated_lists.
|
||||
assert!(self.pending_lists.is_empty());
|
||||
self.pending_key_count = 0;
|
||||
|
||||
tracing::debug!(
|
||||
"Validation complete, have {} validated lists",
|
||||
self.validated_lists.len()
|
||||
);
|
||||
|
||||
// Return quickly if we have no validated lists to execute. This avoids flushing the
|
||||
// executor when an idle backend hits its autoflush interval
|
||||
if self.validated_lists.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Drain `validated_lists` into the executor
|
||||
let mut executing_lists = Vec::new();
|
||||
for list in self.validated_lists.drain(..) {
|
||||
let list_path = self.conf.deletion_list_path(list.sequence);
|
||||
let objects = list.into_remote_paths();
|
||||
self.tx
|
||||
.send(DeleterMessage::Delete(objects))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)?;
|
||||
executing_lists.push(list_path);
|
||||
}
|
||||
|
||||
self.flush_executor().await?;
|
||||
|
||||
// Erase the deletion lists whose keys have all be deleted from remote storage
|
||||
self.cleanup_lists(executing_lists).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
|
||||
// Flush the executor, so that all the keys referenced by these deletion lists
|
||||
// are actually removed from remote storage. This is a precondition to deleting
|
||||
// the deletion lists themselves.
|
||||
let (flush_op, rx) = FlushOp::new();
|
||||
self.tx
|
||||
.send(DeleterMessage::Flush(flush_op))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)?;
|
||||
|
||||
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
|
||||
pub(super) async fn background(&mut self) {
|
||||
tracing::info!("Started deletion backend worker");
|
||||
|
||||
while !self.cancel.is_cancelled() {
|
||||
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending.
|
||||
match self.flush().await {
|
||||
Ok(()) => {}
|
||||
Err(DeletionQueueError::ShuttingDown) => {
|
||||
// If we are shutting down, then auto-flush can safely be skipped
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
ValidatorQueueMessage::Delete(list) => {
|
||||
if list.validated {
|
||||
// A pre-validated list may only be seen during recovery, if we are recovering
|
||||
// a DeletionList whose on-disk state has validated=true
|
||||
self.validated_lists.push(list)
|
||||
} else {
|
||||
self.pending_key_count += list.len();
|
||||
self.pending_lists.push(list);
|
||||
}
|
||||
|
||||
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
|
||||
match self.flush().await {
|
||||
Ok(()) => {}
|
||||
Err(DeletionQueueError::ShuttingDown) => {
|
||||
// If we are shutting down, then auto-flush can safely be skipped
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ValidatorQueueMessage::Flush(op) => {
|
||||
match self.flush().await {
|
||||
Ok(()) => {
|
||||
op.notify();
|
||||
}
|
||||
Err(DeletionQueueError::ShuttingDown) => {
|
||||
// If we fail due to shutting down, we will just drop `op` to propagate that status.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1093,6 +1093,9 @@ components:
|
||||
remote_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
remote_consistent_lsn_visible:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::TryFutureExt;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
@@ -24,6 +25,7 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -34,7 +36,7 @@ use crate::tenant::mgr::{
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use utils::{
|
||||
@@ -61,6 +63,7 @@ pub struct State {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -70,6 +73,7 @@ impl State {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
@@ -82,8 +86,17 @@ impl State {
|
||||
remote_storage,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue_client,
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_resources(&self) -> TenantSharedResources {
|
||||
TenantSharedResources {
|
||||
broker_client: self.broker_client.clone(),
|
||||
remote_storage: self.remote_storage.clone(),
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -283,7 +296,12 @@ async fn build_timeline_info_common(
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
.get_remote_consistent_lsn_projected()
|
||||
.unwrap_or(Lsn(0));
|
||||
let remote_consistent_lsn_visible = timeline
|
||||
.get_remote_consistent_lsn_visible()
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
let walreceiver_status = timeline.walreceiver_status();
|
||||
|
||||
@@ -293,7 +311,8 @@ async fn build_timeline_info_common(
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
remote_consistent_lsn,
|
||||
remote_consistent_lsn: remote_consistent_lsn_projected,
|
||||
remote_consistent_lsn_visible,
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
@@ -492,24 +511,23 @@ async fn tenant_attach_handler(
|
||||
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
tenant_conf,
|
||||
state.broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
.await?;
|
||||
} else {
|
||||
if state.remote_storage.is_none() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
tenant_conf,
|
||||
state.tenant_resources(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
@@ -570,6 +588,7 @@ async fn tenant_load_handler(
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
state.deletion_queue_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
@@ -911,8 +930,7 @@ async fn tenant_create_handler(
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
state.tenant_resources(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
@@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
async fn deletion_queue_flush(
|
||||
r: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&r);
|
||||
|
||||
if state.remote_storage.is_none() {
|
||||
// Nothing to do if remote storage is disabled.
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
|
||||
|
||||
let flush = async {
|
||||
if execute {
|
||||
state.deletion_queue_client.flush_execute().await
|
||||
} else {
|
||||
state.deletion_queue_client.flush().await
|
||||
}
|
||||
}
|
||||
// DeletionQueueError's only case is shutting down.
|
||||
.map_err(|_| ApiError::ShuttingDown);
|
||||
|
||||
tokio::select! {
|
||||
res = flush => {
|
||||
res.map(|()| json_response(StatusCode::OK, ()))?
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
Err(ApiError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn active_timeline_of_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -1463,6 +1514,9 @@ pub fn make_router(
|
||||
.put("/v1/disk_usage_eviction/run", |r| {
|
||||
api_handler(r, disk_usage_eviction_run)
|
||||
})
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -3,7 +3,8 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
mod control_plane_client;
|
||||
pub mod control_plane_client;
|
||||
pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
@@ -27,6 +28,7 @@ pub mod failpoint_support;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument]
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
)
|
||||
.await;
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
if let Some(mut deletion_queue) = deletion_queue {
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
}
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
|
||||
@@ -887,6 +887,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct DeletionQueueMetrics {
|
||||
pub(crate) keys_submitted: IntCounter,
|
||||
pub(crate) keys_dropped: IntCounter,
|
||||
pub(crate) keys_executed: IntCounter,
|
||||
pub(crate) dropped_lsn_updates: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
pub(crate) remote_errors: IntCounterVec,
|
||||
}
|
||||
pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
DeletionQueueMetrics{
|
||||
|
||||
keys_submitted: register_int_counter!(
|
||||
"pageserver_deletion_queue_submitted_total",
|
||||
"Number of objects submitted for deletion"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
keys_dropped: register_int_counter!(
|
||||
"pageserver_deletion_queue_dropped_total",
|
||||
"Number of object deletions dropped due to stale generation."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
keys_executed: register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
dropped_lsn_updates: register_int_counter!(
|
||||
"pageserver_deletion_queue_dropped_lsn_updates_total",
|
||||
"Updates to remote_consistent_lsn dropped due to stale generation number."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
unexpected_errors: register_int_counter!(
|
||||
"pageserver_deletion_queue_unexpected_errors_total",
|
||||
"Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
remote_errors: register_int_counter_vec!(
|
||||
"pageserver_deletion_queue_remote_errors_total",
|
||||
"Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
|
||||
&["op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
}
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -1675,6 +1723,9 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// Deletion queue stats
|
||||
Lazy::force(&DELETION_QUEUE);
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -37,7 +37,7 @@ impl Key {
|
||||
| self.field6 as i128
|
||||
}
|
||||
|
||||
pub fn from_i128(x: i128) -> Self {
|
||||
pub const fn from_i128(x: i128) -> Self {
|
||||
Key {
|
||||
field1: ((x >> 120) & 0xf) as u8,
|
||||
field2: ((x >> 104) & 0xFFFF) as u32,
|
||||
|
||||
@@ -456,7 +456,7 @@ async fn task_finish(
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
shutdown_pageserver(1).await;
|
||||
shutdown_pageserver(None, 1).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
@@ -117,7 +118,7 @@ mod span;
|
||||
|
||||
pub mod metadata;
|
||||
mod par_fsync;
|
||||
mod remote_timeline_client;
|
||||
pub mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub remote_storage: Option<GenericRemoteStorage>,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -197,6 +199,9 @@ pub struct Tenant {
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
pub(crate) remote_storage: Option<GenericRemoteStorage>,
|
||||
|
||||
// Access to global deletion queue for when this tenant wants to schedule a deletion
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
@@ -523,15 +528,20 @@ impl Tenant {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
resources: TenantSharedResources,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO dedup with spawn_load
|
||||
let tenant_conf =
|
||||
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
|
||||
|
||||
let TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
} = resources;
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Attaching,
|
||||
@@ -540,7 +550,8 @@ impl Tenant {
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
generation,
|
||||
Some(remote_storage.clone()),
|
||||
remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
));
|
||||
|
||||
// Do all the hard work in the background
|
||||
@@ -571,7 +582,7 @@ impl Tenant {
|
||||
let pending_deletion = {
|
||||
match DeleteTenantFlow::should_resume_deletion(
|
||||
conf,
|
||||
Some(&remote_storage),
|
||||
remote_storage.as_ref(),
|
||||
&tenant_clone,
|
||||
)
|
||||
.await
|
||||
@@ -660,6 +671,7 @@ impl Tenant {
|
||||
for timeline_id in remote_timeline_ids {
|
||||
let client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.deletion_queue_client.clone(),
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
@@ -726,6 +738,7 @@ impl Tenant {
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client: Some(remote_client),
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
@@ -750,6 +763,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&index_part.metadata,
|
||||
Some(remote_timeline_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -851,6 +865,7 @@ impl Tenant {
|
||||
tenant_id,
|
||||
Generation::broken(),
|
||||
None,
|
||||
DeletionQueueClient::broken(),
|
||||
))
|
||||
}
|
||||
|
||||
@@ -895,6 +910,7 @@ impl Tenant {
|
||||
tenant_id,
|
||||
generation,
|
||||
remote_storage.clone(),
|
||||
resources.deletion_queue_client.clone(),
|
||||
);
|
||||
let tenant = Arc::new(tenant);
|
||||
|
||||
@@ -1302,6 +1318,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
Some(remote_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -1351,6 +1368,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
None,
|
||||
self.deletion_queue_client.clone(),
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -2242,6 +2260,9 @@ impl Tenant {
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
// Allow too_many_arguments because a constructor's argument list naturally grows with the
|
||||
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn new(
|
||||
state: TenantState,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -2250,6 +2271,7 @@ impl Tenant {
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
) -> Tenant {
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
@@ -2317,6 +2339,7 @@ impl Tenant {
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
@@ -2856,6 +2879,7 @@ impl Tenant {
|
||||
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
|
||||
let remote_client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.deletion_queue_client.clone(),
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
@@ -2866,7 +2890,10 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
TimelineResources { remote_client }
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates intermediate timeline structure and its files.
|
||||
@@ -3322,6 +3349,7 @@ pub mod harness {
|
||||
use utils::logging;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
repository::Key,
|
||||
@@ -3383,6 +3411,7 @@ pub mod harness {
|
||||
pub generation: Generation,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub remote_fs_dir: PathBuf,
|
||||
pub deletion_queue: MockDeletionQueue,
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
@@ -3431,6 +3460,7 @@ pub mod harness {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
@@ -3439,6 +3469,7 @@ pub mod harness {
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
remote_storage,
|
||||
remote_fs_dir,
|
||||
deletion_queue,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3463,6 +3494,7 @@ pub mod harness {
|
||||
self.tenant_id,
|
||||
self.generation,
|
||||
Some(self.remote_storage.clone()),
|
||||
self.deletion_queue.new_client(),
|
||||
));
|
||||
tenant
|
||||
.load(None, ctx)
|
||||
@@ -4193,7 +4225,8 @@ mod tests {
|
||||
//
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
|
||||
let harness = TenantHarness::create("test_bulk_insert")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4240,7 +4273,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
|
||||
let harness = TenantHarness::create("test_random_updates")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
@@ -20,7 +20,10 @@ use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::control_plane_client::ControlPlaneClient;
|
||||
use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
@@ -116,7 +119,23 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
|
||||
Some(client.re_attach().await?)
|
||||
let result = match client.re_attach().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
}
|
||||
};
|
||||
|
||||
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
|
||||
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
|
||||
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
|
||||
// are processed, even though we don't block on recovery completing here.
|
||||
resources
|
||||
.deletion_queue_client
|
||||
.recover(result.clone())
|
||||
.await?;
|
||||
|
||||
Some(result)
|
||||
} else {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
None
|
||||
@@ -285,29 +304,21 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = resources.remote_storage {
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
resources.broker_client,
|
||||
tenants,
|
||||
remote_storage,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if resources.remote_storage.is_none() {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(
|
||||
conf,
|
||||
tenant_id,
|
||||
"attaching mark file present but no remote storage configured".to_string(),
|
||||
)
|
||||
} else {
|
||||
match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
@@ -438,8 +449,7 @@ pub async fn create_tenant(
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
resources: TenantSharedResources,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
@@ -450,13 +460,9 @@ pub async fn create_tenant(
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let tenant_resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
};
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
|
||||
generation, tenant_resources, None, &TENANTS, ctx)?;
|
||||
generation, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -622,6 +628,7 @@ pub async fn load_tenant(
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
@@ -635,6 +642,7 @@ pub async fn load_tenant(
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client
|
||||
};
|
||||
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
@@ -702,8 +710,7 @@ pub async fn attach_tenant(
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
tenant_conf: TenantConfOpt,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
resources: TenantSharedResources,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
@@ -718,10 +725,7 @@ pub async fn attach_tenant(
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage: Some(remote_storage),
|
||||
};
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -116,8 +116,12 @@
|
||||
//! # Completion
|
||||
//!
|
||||
//! Once an operation has completed, we update
|
||||
//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
|
||||
//! to safekeepers that they can delete the WAL up to that LSN.
|
||||
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
|
||||
//! and submit a request through the DeletionQueue to update
|
||||
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
|
||||
//! validated that our generation is not stale. It is this visible value
|
||||
//! that is advertized to safekeepers as a signal that that they can
|
||||
//! delete the WAL up to that LSN.
|
||||
//!
|
||||
//! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
|
||||
//! for all pending operations to complete. It does not prevent more
|
||||
@@ -200,7 +204,6 @@
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
@@ -324,6 +328,8 @@ pub struct RemoteTimelineClient {
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
|
||||
storage_impl: GenericRemoteStorage,
|
||||
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
@@ -335,6 +341,7 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
pub fn new(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -352,6 +359,7 @@ impl RemoteTimelineClient {
|
||||
timeline_id,
|
||||
generation,
|
||||
storage_impl: remote_storage,
|
||||
deletion_queue_client,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||
}
|
||||
@@ -413,13 +421,24 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
|
||||
match &*self.upload_queue.lock().unwrap() {
|
||||
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
|
||||
UploadQueue::Stopped(q) => {
|
||||
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
|
||||
}
|
||||
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
|
||||
UploadQueue::Stopped(q) => q
|
||||
.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_projected(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
|
||||
UploadQueue::Stopped(q) => Some(
|
||||
q.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_visible(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -643,7 +662,7 @@ impl RemoteTimelineClient {
|
||||
/// successfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
names: Vec<LayerFileName>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
@@ -663,10 +682,10 @@ impl RemoteTimelineClient {
|
||||
// Decorate our list of names with each name's generation, dropping
|
||||
// makes that are unexpectedly missing from our metadata.
|
||||
let with_generations: Vec<_> = names
|
||||
.iter()
|
||||
.into_iter()
|
||||
.filter_map(|name| {
|
||||
// Remove from latest_files, learning the file's remote generation in the process
|
||||
let meta = upload_queue.latest_files.remove(name);
|
||||
let meta = upload_queue.latest_files.remove(&name);
|
||||
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
@@ -688,19 +707,17 @@ impl RemoteTimelineClient {
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
for (name, generation) in with_generations {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: false,
|
||||
generation,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {name}");
|
||||
for (name, gen) in &with_generations {
|
||||
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_generations,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
};
|
||||
@@ -833,9 +850,7 @@ impl RemoteTimelineClient {
|
||||
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (mut receiver, deletions_queued) = {
|
||||
let mut deletions_queued = 0;
|
||||
|
||||
let layers: Vec<RemotePath> = {
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
let stopped = locked.stopped_mut()?;
|
||||
|
||||
@@ -847,42 +862,30 @@ impl RemoteTimelineClient {
|
||||
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
|
||||
|
||||
// schedule the actual deletions
|
||||
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: true,
|
||||
generation: meta.generation,
|
||||
});
|
||||
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.push_back(op);
|
||||
|
||||
info!("scheduled layer file deletion {name}");
|
||||
deletions_queued += 1;
|
||||
}
|
||||
|
||||
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
|
||||
|
||||
(
|
||||
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
|
||||
deletions_queued,
|
||||
)
|
||||
.latest_files
|
||||
.drain()
|
||||
.map(|(file_name, meta)| {
|
||||
remote_layer_path(
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&file_name,
|
||||
meta.generation,
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
let layer_deletion_count = layers.len();
|
||||
self.deletion_queue_client.push_immediate(layers).await?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
self.deletion_queue_client.flush_immediate().await?;
|
||||
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
@@ -910,17 +913,9 @@ impl RemoteTimelineClient {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let not_referenced_count = remaining.len();
|
||||
if !remaining.is_empty() {
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
self.deletion_queue_client.push_immediate(remaining).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -931,18 +926,14 @@ impl RemoteTimelineClient {
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
debug!("enqueuing index part deletion");
|
||||
self.deletion_queue_client
|
||||
.push_immediate([index_file_path].to_vec())
|
||||
.await?;
|
||||
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
|
||||
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
||||
self.deletion_queue_client.flush_immediate().await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -950,7 +941,7 @@ impl RemoteTimelineClient {
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1140,21 +1131,16 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
res
|
||||
}
|
||||
UploadOp::Delete(delete) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(delete.layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::Delete(delete) => self
|
||||
.deletion_queue_client
|
||||
.push_layers(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
delete.layers.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e)),
|
||||
UploadOp::Barrier(_) => {
|
||||
// unreachable. Barrier operations are handled synchronously in
|
||||
// launch_queued_tasks
|
||||
@@ -1210,18 +1196,12 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
{
|
||||
let lsn_update = {
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
|
||||
UploadQueue::Stopped(stopped) => {
|
||||
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
|
||||
// then stop() took care of it so we just return.
|
||||
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
|
||||
match &task.op {
|
||||
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
|
||||
_ => None
|
||||
}
|
||||
UploadQueue::Stopped(_stopped) => {
|
||||
None
|
||||
},
|
||||
UploadQueue::Initialized(qi) => { Some(qi) }
|
||||
};
|
||||
@@ -1236,23 +1216,51 @@ impl RemoteTimelineClient {
|
||||
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
match task.op {
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
|
||||
// XXX monotonicity check?
|
||||
|
||||
upload_queue.projected_remote_consistent_lsn = Some(lsn);
|
||||
if self.generation.is_none() {
|
||||
// Legacy mode: skip validating generation
|
||||
upload_queue.visible_remote_consistent_lsn.store(lsn);
|
||||
None
|
||||
} else {
|
||||
Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
|
||||
}
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::Barrier(_) => unreachable!(),
|
||||
};
|
||||
|
||||
// Launch any queued tasks that were unblocked by this one.
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
lsn_update
|
||||
};
|
||||
|
||||
if let Some((lsn, slot)) = lsn_update {
|
||||
// Updates to the remote_consistent_lsn we advertise to pageservers
|
||||
// are all routed through the DeletionQueue, to enforce important
|
||||
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
|
||||
self.deletion_queue_client
|
||||
.update_remote_consistent_lsn(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
lsn,
|
||||
slot,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
}
|
||||
|
||||
@@ -1278,8 +1286,8 @@ impl RemoteTimelineClient {
|
||||
reason: "metadata uploads are tiny",
|
||||
},
|
||||
),
|
||||
UploadOp::Delete(delete) => (
|
||||
delete.file_kind,
|
||||
UploadOp::Delete(_delete) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Delete,
|
||||
DontTrackSize {
|
||||
reason: "should we track deletes? positive or negative sign?",
|
||||
@@ -1341,7 +1349,10 @@ impl RemoteTimelineClient {
|
||||
latest_files: initialized.latest_files.clone(),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: initialized.latest_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
|
||||
projected_remote_consistent_lsn: None,
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
@@ -1405,13 +1416,13 @@ pub fn remote_layer_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_meta: &LayerFileMetadata,
|
||||
generation: Generation,
|
||||
) -> RemotePath {
|
||||
// Generation-aware key format
|
||||
let path = format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||
layer_file_name.file_name(),
|
||||
layer_meta.generation.get_suffix()
|
||||
generation.get_suffix()
|
||||
);
|
||||
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
@@ -1554,7 +1565,6 @@ mod tests {
|
||||
|
||||
impl TestSetup {
|
||||
async fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
// Use a current-thread runtime in the test
|
||||
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
@@ -1580,6 +1590,7 @@ mod tests {
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: self.harness.remote_storage.clone(),
|
||||
deletion_queue_client: self.harness.deletion_queue.new_client(),
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&self.harness.tenant_id,
|
||||
@@ -1749,7 +1760,7 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
client
|
||||
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
|
||||
.schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
|
||||
.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
@@ -1775,6 +1786,7 @@ mod tests {
|
||||
|
||||
// Finish them
|
||||
client.wait_completion().await.unwrap();
|
||||
harness.deletion_queue.pump().await;
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::{remote_timeline_client::remote_path, Generation},
|
||||
};
|
||||
|
||||
pub(super) async fn delete_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
generation: Generation,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
|
||||
|
||||
// We don't want to print an error if the delete failed if the file has
|
||||
// already been deleted. Thankfully, in this situation S3 already
|
||||
// does not yield an error. While OS-provided local file system APIs do yield
|
||||
// errors, we avoid them in the `LocalFs` wrapper.
|
||||
storage
|
||||
.delete(&path_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
|
||||
}
|
||||
@@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>(
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata.generation,
|
||||
);
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
|
||||
@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -143,6 +144,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: Option<RemoteTimelineClient>,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -521,9 +523,23 @@ impl Timeline {
|
||||
self.disk_consistent_lsn.load()
|
||||
}
|
||||
|
||||
pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
|
||||
/// remote_consistent_lsn from the perspective of the tenant's current generation,
|
||||
/// not validated with control plane yet.
|
||||
/// See [`Self::get_remote_consistent_lsn_visible`].
|
||||
pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.last_uploaded_consistent_lsn()
|
||||
remote_client.remote_consistent_lsn_projected()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
|
||||
/// i.e. a value of remote_consistent_lsn_projected which has undergone
|
||||
/// generation validation in the deletion queue.
|
||||
pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.remote_consistent_lsn_visible()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -1820,7 +1836,7 @@ impl Timeline {
|
||||
for (layer, m) in needs_upload {
|
||||
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
|
||||
}
|
||||
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
|
||||
rtc.schedule_layer_file_deletion(needs_cleanup)?;
|
||||
rtc.schedule_index_upload_for_file_changes()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
@@ -3875,7 +3891,7 @@ impl Timeline {
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4210,7 +4226,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
apply.flush();
|
||||
|
||||
@@ -14,6 +14,7 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
@@ -407,6 +408,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
@@ -416,7 +418,10 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources { remote_client },
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
},
|
||||
init_order,
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
|
||||
@@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
})?;
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn =
|
||||
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
let timeline_remote_consistent_lsn = timeline
|
||||
.get_remote_consistent_lsn_visible()
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let last_received_lsn = last_lsn;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::Generation;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
@@ -11,6 +9,7 @@ use std::fmt::Debug;
|
||||
use chrono::NaiveDateTime;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::lsn::AtomicLsn;
|
||||
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
/// Safekeeper can rely on it to make decisions for WAL storage.
|
||||
pub(crate) last_uploaded_consistent_lsn: Lsn,
|
||||
///
|
||||
/// visible_remote_consistent_lsn is only updated after our generation has been validated with
|
||||
/// the control plane (unlesss a timeline's generation is None, in which case
|
||||
/// we skip validation)
|
||||
pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
|
||||
pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
|
||||
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
pub(crate) num_inprogress_layer_uploads: usize,
|
||||
@@ -81,6 +85,14 @@ impl UploadQueueInitialized {
|
||||
pub(super) fn no_pending_work(&self) -> bool {
|
||||
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
|
||||
}
|
||||
|
||||
pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
|
||||
self.visible_remote_consistent_lsn.load()
|
||||
}
|
||||
|
||||
pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
|
||||
self.projected_remote_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -114,9 +126,8 @@ impl UploadQueue {
|
||||
latest_files: HashMap::new(),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: metadata.clone(),
|
||||
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
|
||||
// safekeepers from garbage-collecting anything.
|
||||
last_uploaded_consistent_lsn: Lsn(0),
|
||||
projected_remote_consistent_lsn: None,
|
||||
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
@@ -158,7 +169,10 @@ impl UploadQueue {
|
||||
latest_files: files,
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: index_part.metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
|
||||
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
|
||||
visible_remote_consistent_lsn: Arc::new(
|
||||
index_part.metadata.disk_consistent_lsn().into(),
|
||||
),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
@@ -201,12 +215,11 @@ pub(crate) struct UploadTask {
|
||||
pub(crate) op: UploadOp,
|
||||
}
|
||||
|
||||
/// A deletion of some layers within the lifetime of a timeline. This is not used
|
||||
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Delete {
|
||||
pub(crate) file_kind: RemoteOpFileKind,
|
||||
pub(crate) layer_file_name: LayerFileName,
|
||||
pub(crate) scheduled_from_timeline_delete: bool,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) layers: Vec<(LayerFileName, Generation)>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -217,7 +230,7 @@ pub(crate) enum UploadOp {
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a layer file
|
||||
/// Delete layer files
|
||||
Delete(Delete),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
@@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp {
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
write!(f, "UploadMetadata(lsn: {})", lsn)
|
||||
}
|
||||
UploadOp::Delete(delete) => write!(
|
||||
f,
|
||||
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
|
||||
delete.layer_file_name.file_name(),
|
||||
delete.scheduled_from_timeline_delete,
|
||||
delete.generation
|
||||
),
|
||||
UploadOp::Delete(delete) => {
|
||||
write!(f, "Delete({} layers)", delete.layers.len(),)
|
||||
}
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1481,6 +1481,16 @@ class NeonAttachmentService:
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
|
||||
response = requests.post(
|
||||
f"{self.env.control_plane_api}/attach_hook",
|
||||
json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
|
||||
)
|
||||
response.raise_for_status()
|
||||
gen = response.json()["gen"]
|
||||
assert isinstance(gen, int)
|
||||
return gen
|
||||
|
||||
def __enter__(self) -> "NeonAttachmentService":
|
||||
return self
|
||||
|
||||
@@ -1689,12 +1699,7 @@ class NeonPageserver(PgProtocol):
|
||||
to call into the pageserver HTTP client.
|
||||
"""
|
||||
if self.env.attachment_service is not None:
|
||||
response = requests.post(
|
||||
f"{self.env.control_plane_api}/attach_hook",
|
||||
json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
|
||||
)
|
||||
response.raise_for_status()
|
||||
generation = response.json()["gen"]
|
||||
generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
|
||||
else:
|
||||
generation = None
|
||||
|
||||
|
||||
@@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session):
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def deletion_queue_flush(self, execute: bool = False):
|
||||
self.put(
|
||||
f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
|
||||
).raise_for_status()
|
||||
|
||||
@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
|
||||
|
||||
|
||||
def list_prefix(
|
||||
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
|
||||
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
|
||||
) -> ListObjectsV2OutputTypeDef:
|
||||
"""
|
||||
Note that this function takes into account prefix_in_bucket.
|
||||
@@ -287,7 +287,7 @@ def list_prefix(
|
||||
|
||||
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
|
||||
response = remote.client.list_objects_v2(
|
||||
Delimiter="/",
|
||||
Delimiter=delimiter,
|
||||
Bucket=remote.bucket_name,
|
||||
Prefix=prefix,
|
||||
)
|
||||
|
||||
352
test_runner/regress/test_pageserver_generations.py
Normal file
352
test_runner/regress/test_pageserver_generations.py
Normal file
@@ -0,0 +1,352 @@
|
||||
"""
|
||||
|
||||
Tests in this module exercise the pageserver's behavior around generation numbers,
|
||||
as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require
|
||||
of the pageserver are:
|
||||
- Do not start a tenant without a generation number if control_plane_api is set
|
||||
- Remote objects must be suffixed with generation
|
||||
- Deletions may only be executed after validating generation
|
||||
- Updates to remote_consistent_lsn may only be made visible after validating generation
|
||||
"""
|
||||
|
||||
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.utils import list_prefix
|
||||
from fixtures.remote_storage import (
|
||||
RemoteStorageKind,
|
||||
)
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import print_gc_result, wait_until
|
||||
|
||||
# A tenant configuration that is convenient for generating uploads and deletions
|
||||
# without a large amount of postgres traffic.
|
||||
TENANT_CONF = {
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
"compaction_target_size": f"{128 * 1024}",
|
||||
# no PITR horizon, we specify the horizon when we request on-demand GC
|
||||
"pitr_interval": "0s",
|
||||
# disable background compaction and GC. We invoke it manually when we want it to happen.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# create image layers eagerly, so that GC can remove some layers
|
||||
"image_creation_threshold": "1",
|
||||
}
|
||||
|
||||
|
||||
def generate_uploads_and_deletions(
|
||||
env: NeonEnv,
|
||||
*,
|
||||
init: bool = True,
|
||||
tenant_id: Optional[TenantId] = None,
|
||||
timeline_id: Optional[TimelineId] = None,
|
||||
data: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Using the environment's default tenant + timeline, generate a load pattern
|
||||
that results in some uploads and some deletions to remote storage.
|
||||
"""
|
||||
|
||||
if tenant_id is None:
|
||||
tenant_id = env.initial_tenant
|
||||
assert tenant_id is not None
|
||||
|
||||
if timeline_id is None:
|
||||
timeline_id = env.initial_timeline
|
||||
assert timeline_id is not None
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
if init:
|
||||
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
def churn(data):
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
f"""
|
||||
INSERT INTO foo (id, val)
|
||||
SELECT g, '{data}'
|
||||
FROM generate_series(1, 20000) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
# to ensure that GC can actually remove some layers
|
||||
"VACUUM foo",
|
||||
]
|
||||
)
|
||||
assert tenant_id is not None
|
||||
assert timeline_id is not None
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Compaction should generate some GC-elegible layers
|
||||
for i in range(0, 2):
|
||||
churn(f"{i if data is None else data}")
|
||||
|
||||
gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
|
||||
print_gc_result(gc_result)
|
||||
assert gc_result["layers_removed"] > 0
|
||||
|
||||
|
||||
def get_metric_or_0(ps_http, metric: str) -> int:
|
||||
v = ps_http.get_metric_value(metric)
|
||||
return 0 if v is None else int(v)
|
||||
|
||||
|
||||
def get_deletion_queue_executed(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
|
||||
|
||||
|
||||
def get_deletion_queue_submitted(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
|
||||
|
||||
|
||||
def get_deletion_queue_dropped(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
|
||||
|
||||
|
||||
def get_deletion_queue_unexpected_errors(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
|
||||
|
||||
|
||||
def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
|
||||
|
||||
|
||||
def get_deletion_queue_depth(ps_http) -> int:
|
||||
"""
|
||||
Queue depth if at least one deletion has been submitted, else None
|
||||
"""
|
||||
submitted = get_deletion_queue_submitted(ps_http)
|
||||
executed = get_deletion_queue_executed(ps_http)
|
||||
dropped = get_deletion_queue_dropped(ps_http)
|
||||
depth = submitted - executed - dropped
|
||||
log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
|
||||
|
||||
assert depth >= 0
|
||||
return int(depth)
|
||||
|
||||
|
||||
def assert_deletion_queue(ps_http, size_fn) -> None:
|
||||
v = get_deletion_queue_depth(ps_http)
|
||||
assert v is not None
|
||||
assert size_fn(v) is True
|
||||
|
||||
|
||||
def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Validate behavior when a pageserver is run without generation support enabled,
|
||||
then started again after activating it:
|
||||
- Before upgrade, no objects should have generation suffixes
|
||||
- After upgrade, the bucket should contain a mixture.
|
||||
- In both cases, postgres I/O should work.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.broker.try_start()
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
assert env.attachment_service is not None
|
||||
env.attachment_service.start()
|
||||
|
||||
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
|
||||
)
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
def parse_generation_suffix(key):
|
||||
m = re.match(".+-([0-9a-zA-Z]{8})$", key)
|
||||
if m is None:
|
||||
return None
|
||||
else:
|
||||
log.info(f"match: {m}")
|
||||
log.info(f"group: {m.group(1)}")
|
||||
return int(m.group(1), 16)
|
||||
|
||||
pre_upgrade_keys = list(
|
||||
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
|
||||
)
|
||||
for key in pre_upgrade_keys:
|
||||
assert parse_generation_suffix(key) is None
|
||||
|
||||
env.pageserver.stop()
|
||||
|
||||
# Starting without the override that disabled control_plane_api
|
||||
env.pageserver.start()
|
||||
|
||||
generate_uploads_and_deletions(env, init=False)
|
||||
|
||||
legacy_objects: list[str] = []
|
||||
suffixed_objects = []
|
||||
post_upgrade_keys = list(
|
||||
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
|
||||
)
|
||||
for key in post_upgrade_keys:
|
||||
log.info(f"post-upgrade key: {key}")
|
||||
if parse_generation_suffix(key) is not None:
|
||||
suffixed_objects.append(key)
|
||||
else:
|
||||
legacy_objects.append(key)
|
||||
|
||||
# Bucket now contains a mixture of suffixed and non-suffixed objects
|
||||
assert len(suffixed_objects) > 0
|
||||
assert len(legacy_objects) > 0
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
|
||||
|
||||
|
||||
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
assert env.attachment_service is not None
|
||||
|
||||
some_other_pageserver = 1234
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
# Flush: pending deletions should all complete
|
||||
assert_deletion_queue(ps_http, lambda n: n > 0)
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert_deletion_queue(ps_http, lambda n: n == 0)
|
||||
assert get_deletion_queue_dropped(ps_http) == 0
|
||||
|
||||
# Our visible remote_consistent_lsn should match projected
|
||||
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
|
||||
)
|
||||
|
||||
# Now advance the generation in the control plane: subsequent validations
|
||||
# from the running pageserver will fail. No more deletions should happen.
|
||||
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
|
||||
generate_uploads_and_deletions(env, init=False)
|
||||
|
||||
assert_deletion_queue(ps_http, lambda n: n > 0)
|
||||
queue_depth_before = get_deletion_queue_depth(ps_http)
|
||||
executed_before = get_deletion_queue_executed(ps_http)
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
|
||||
# Queue drains to zero because we dropped deletions
|
||||
assert_deletion_queue(ps_http, lambda n: n == 0)
|
||||
# The executed counter has not incremented
|
||||
assert get_deletion_queue_executed(ps_http) == executed_before
|
||||
# The dropped counter has incremented to consume all of the deletions that were previously enqueued
|
||||
assert get_deletion_queue_dropped(ps_http) == queue_depth_before
|
||||
|
||||
# Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
|
||||
# because generation validation fails.
|
||||
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
|
||||
|
||||
# TODO: list bucket and confirm all objects have a generation suffix.
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_attachment", [True, False])
|
||||
def test_deletion_queue_recovery(
|
||||
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
|
||||
):
|
||||
"""
|
||||
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
|
||||
node took the attachment while we were restarting.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# Prevent deletion lists from being executed, to build up some backlog of deletions
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
("deletion-queue-before-execute", "return"),
|
||||
]
|
||||
)
|
||||
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
# There should be entries in the deletion queue
|
||||
assert_deletion_queue(ps_http, lambda n: n > 0)
|
||||
ps_http.deletion_queue_flush()
|
||||
before_restart_depth = get_deletion_queue_depth(ps_http)
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
if not keep_attachment:
|
||||
some_other_pageserver = 101010
|
||||
assert env.attachment_service is not None
|
||||
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
def assert_deletions_submitted(n: int):
|
||||
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
|
||||
|
||||
# After restart, issue a flush to kick the deletion frontend to do recovery.
|
||||
# It should recover all the operations we submitted before the restart.
|
||||
ps_http.deletion_queue_flush(execute=False)
|
||||
wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
|
||||
|
||||
# The queue should drain through completely if we flush it
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
|
||||
|
||||
if keep_attachment:
|
||||
# If we kept the attachment, then our pre-restart deletions should have executed
|
||||
# successfully
|
||||
assert get_deletion_queue_executed(ps_http) == before_restart_depth
|
||||
else:
|
||||
# If we lost the attachment, we should have dropped our pre-restart deletions.
|
||||
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
|
||||
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
# Restart again
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.pageserver.start()
|
||||
|
||||
# No deletion lists should be recovered: this demonstrates that deletion lists
|
||||
# were cleaned up after being executed or dropped in the previous process lifetime.
|
||||
time.sleep(1)
|
||||
assert_deletion_queue(ps_http, lambda n: n == 0)
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
@@ -43,6 +43,12 @@ def test_tenant_delete_smoke(
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# The deletion queue will complain when it encounters simulated S3 errors
|
||||
".*deletion executor: DeleteObjects request failed.*",
|
||||
]
|
||||
)
|
||||
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
env.pageserver.allowed_errors.append(
|
||||
@@ -195,6 +201,14 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
]
|
||||
)
|
||||
|
||||
if simulate_failures:
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# The deletion queue will complain when it encounters simulated S3 errors
|
||||
".*deletion executor: DeleteObjects request failed.*",
|
||||
]
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
|
||||
@@ -383,6 +397,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
assert not tenant_path.exists()
|
||||
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
|
||||
@@ -807,6 +807,8 @@ def test_delete_orphaned_objects(
|
||||
reason = timeline_info["state"]["Broken"]["reason"]
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
|
||||
for orphan in orphans:
|
||||
assert not orphan.exists()
|
||||
assert env.pageserver.log_contains(
|
||||
|
||||
Reference in New Issue
Block a user