mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 00:20:37 +00:00
This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). The motivation is to avoid locking `Tenant::timelines` in places that can't be `async`, because in #4333 we want to convert Tenant::timelines from `std::sync::Mutex` to `tokio::sync::Mutex`. But, the changes here are useful in general because they clean up & document tenant state transitions. That also paves the way for #4350, which is an alternative to #4333 that refactors the pageserver code so that we can keep the `Tenant::timelines` mutex sync. This patch consists of the following core insights and changes: * spawn_load and spawn_attach own the tenant state until they're done * once load()/attach() calls are done ... * if they failed, transition them to Broken directly (we know that there's no background activity because we didn't call activate yet) * if they succeed, call activate. We can make it infallible. How? Later. * set_broken() and set_stopping() are changed to wait for spawn_load() / spawn_attach() to finish. * This sounds scary because it might hinder detach or shutdown, but actually, concurrent attach+detach, or attach+shutdown, or load+shutdown, or attach+shutdown were just racy before this PR. So, with this change, they're not anymore. In the future, we can add a `CancellationToken` stored in Tenant to cancel `load` and `attach` faster, i.e., make `spawn_load` / `spawn_attach` transition them to Broken state sooner. See the doc comments on TenantState for the state transitions that are now possible. It might seem scary, but actually, this patch reduces the possible state transitions. We introduce a new state `TenantState::Activating` to avoid grabbing the `Tenant::timelines` lock inside the `send_modify` closure. These were the humble beginnings of this PR (see Motivation section), and I think it's still the right thing to have this `Activating` state, even if we decide against async `Tenant::timelines` mutex. The reason is that `send_modify` locks internally, and by moving locking of Tenant::timelines out of the closure, the internal locking of `send_modify` becomes a leaf of the lock graph, and so, we eliminate deadlock risk. Co-authored-by: Joonas Koivunen <joonas@neon.tech>
171 lines
5.8 KiB
Rust
171 lines
5.8 KiB
Rust
mod auth;
|
|
pub mod basebackup;
|
|
pub mod config;
|
|
pub mod consumption_metrics;
|
|
pub mod context;
|
|
pub mod disk_usage_eviction_task;
|
|
pub mod http;
|
|
pub mod import_datadir;
|
|
pub mod keyspace;
|
|
pub(crate) mod metrics;
|
|
pub mod page_cache;
|
|
pub mod page_service;
|
|
pub mod pgdatadir_mapping;
|
|
pub mod repository;
|
|
pub(crate) mod statvfs;
|
|
pub mod task_mgr;
|
|
pub mod tenant;
|
|
pub mod trace;
|
|
pub mod virtual_file;
|
|
pub mod walingest;
|
|
pub mod walrecord;
|
|
pub mod walredo;
|
|
|
|
use std::path::Path;
|
|
|
|
use crate::task_mgr::TaskKind;
|
|
use tracing::info;
|
|
|
|
/// Current storage format version
|
|
///
|
|
/// This is embedded in the header of all the layer files.
|
|
/// If you make any backwards-incompatible changes to the storage
|
|
/// format, bump this!
|
|
/// Note that TimelineMetadata uses its own version number to track
|
|
/// backwards-compatible changes to the metadata format.
|
|
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
|
|
|
pub const DEFAULT_PG_VERSION: u32 = 15;
|
|
|
|
// Magic constants used to identify different kinds of files
|
|
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
|
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
|
|
|
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
|
|
|
pub use crate::metrics::preinitialize_metrics;
|
|
|
|
#[tracing::instrument]
|
|
pub async fn shutdown_pageserver(exit_code: i32) {
|
|
// Shut down the libpq endpoint task. This prevents new connections from
|
|
// being accepted.
|
|
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
|
|
|
// Shut down any page service tasks.
|
|
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
|
|
|
// Shut down all the tenants. This flushes everything to disk and kills
|
|
// the checkpoint and GC tasks.
|
|
tenant::mgr::shutdown_all_tenants().await;
|
|
|
|
// Stop syncing with remote storage.
|
|
//
|
|
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
|
// Should it?
|
|
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
|
|
|
|
// Shut down the HTTP endpoint last, so that you can still check the server's
|
|
// status while it's shutting down.
|
|
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
|
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
|
|
|
// There should be nothing left, but let's be sure
|
|
task_mgr::shutdown_tasks(None, None, None).await;
|
|
info!("Shut down successfully completed");
|
|
std::process::exit(exit_code);
|
|
}
|
|
|
|
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
|
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
|
|
|
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
|
let backoff_duration_seconds =
|
|
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
|
if backoff_duration_seconds > 0.0 {
|
|
info!(
|
|
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
|
);
|
|
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
|
}
|
|
}
|
|
|
|
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
|
if n == 0 {
|
|
0.0
|
|
} else {
|
|
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
|
}
|
|
}
|
|
|
|
/// The name of the metadata file pageserver creates per timeline.
|
|
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
|
pub const METADATA_FILE_NAME: &str = "metadata";
|
|
|
|
/// Per-tenant configuration file.
|
|
/// Full path: `tenants/<tenant_id>/config`.
|
|
pub const TENANT_CONFIG_NAME: &str = "config";
|
|
|
|
/// A suffix used for various temporary files. Any temporary files found in the
|
|
/// data directory at pageserver startup can be automatically removed.
|
|
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
|
|
|
/// A marker file to mark that a timeline directory was not fully initialized.
|
|
/// If a timeline directory with this marker is encountered at pageserver startup,
|
|
/// the timeline directory and the marker file are both removed.
|
|
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
|
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
|
|
|
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
|
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
|
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
|
/// into pageserver's memory before being ignored.
|
|
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
|
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
|
|
|
pub fn is_temporary(path: &Path) -> bool {
|
|
match path.file_name() {
|
|
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
|
|
None => false,
|
|
}
|
|
}
|
|
|
|
pub fn is_uninit_mark(path: &Path) -> bool {
|
|
match path.file_name() {
|
|
Some(name) => name
|
|
.to_string_lossy()
|
|
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
|
|
None => false,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod backoff_defaults_tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn backoff_defaults_produce_growing_backoff_sequence() {
|
|
let mut current_backoff_value = None;
|
|
|
|
for i in 0..10_000 {
|
|
let new_backoff_value = exponential_backoff_duration_seconds(
|
|
i,
|
|
DEFAULT_BASE_BACKOFF_SECONDS,
|
|
DEFAULT_MAX_BACKOFF_SECONDS,
|
|
);
|
|
|
|
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
|
assert!(
|
|
old_backoff_value <= new_backoff_value,
|
|
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
|
)
|
|
}
|
|
}
|
|
|
|
assert_eq!(
|
|
current_backoff_value.expect("Should have produced backoff values to compare"),
|
|
DEFAULT_MAX_BACKOFF_SECONDS,
|
|
"Given big enough of retries, backoff should reach its allowed max value"
|
|
);
|
|
}
|
|
}
|