mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 21:50:37 +00:00
Part of https://github.com/neondatabase/neon/issues/7462 ## Summary of changes Tenant config is not persisted unless it's attached on the storage controller. In this pull request, we persist the aux file policy flag in the `index_part.json`. Admins can set `switch_aux_file_policy` in the storage controller or using the page server API. Upon the first aux file gets written, the write path will compare the aux file policy target with the current policy. If it is switch-able, we will do the switch. Otherwise, the original policy will be used. The test cases show what the admins can do / cannot do. The `last_aux_file_policy` is stored in `IndexPart`. Updates to the persisted policy are done via `schedule_index_upload_for_aux_file_policy_update`. On the write path, the writer will update the field. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Joonas Koivunen <joonas@neon.tech>
353 lines
12 KiB
Rust
353 lines
12 KiB
Rust
//! A helper tool to manage pageserver binary files.
|
|
//! Accepts a file as an argument, attempts to parse it with all ways possible
|
|
//! and prints its interpreted context.
|
|
//!
|
|
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
|
|
|
mod draw_timeline_dir;
|
|
mod index_part;
|
|
mod layer_map_analyzer;
|
|
mod layers;
|
|
|
|
use std::{
|
|
str::FromStr,
|
|
time::{Duration, SystemTime},
|
|
};
|
|
|
|
use camino::{Utf8Path, Utf8PathBuf};
|
|
use clap::{Parser, Subcommand};
|
|
use index_part::IndexPartCmd;
|
|
use layers::LayerCmd;
|
|
use pageserver::{
|
|
context::{DownloadBehavior, RequestContext},
|
|
page_cache,
|
|
task_mgr::TaskKind,
|
|
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
|
virtual_file,
|
|
};
|
|
use pageserver_api::shard::TenantShardId;
|
|
use postgres_ffi::ControlFileData;
|
|
use remote_storage::{RemotePath, RemoteStorageConfig};
|
|
use tokio_util::sync::CancellationToken;
|
|
use utils::{
|
|
id::TimelineId,
|
|
logging::{self, LogFormat, TracingErrorLayerEnablement},
|
|
lsn::Lsn,
|
|
project_git_version,
|
|
};
|
|
|
|
project_git_version!(GIT_VERSION);
|
|
|
|
#[derive(Parser)]
|
|
#[command(
|
|
version = GIT_VERSION,
|
|
about = "Neon Pageserver binutils",
|
|
long_about = "Reads pageserver (and related) binary files management utility"
|
|
)]
|
|
#[command(propagate_version = true)]
|
|
struct CliOpts {
|
|
#[command(subcommand)]
|
|
command: Commands,
|
|
}
|
|
|
|
#[derive(Subcommand)]
|
|
enum Commands {
|
|
Metadata(MetadataCmd),
|
|
#[command(subcommand)]
|
|
IndexPart(IndexPartCmd),
|
|
PrintLayerFile(PrintLayerFileCmd),
|
|
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
|
|
DrawTimeline {},
|
|
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
|
#[command(subcommand)]
|
|
Layer(LayerCmd),
|
|
}
|
|
|
|
/// Read and update pageserver metadata file
|
|
#[derive(Parser)]
|
|
struct MetadataCmd {
|
|
/// Input metadata file path
|
|
metadata_path: Utf8PathBuf,
|
|
/// Replace disk consistent Lsn
|
|
disk_consistent_lsn: Option<Lsn>,
|
|
/// Replace previous record Lsn
|
|
prev_record_lsn: Option<Lsn>,
|
|
/// Replace latest gc cuttoff
|
|
latest_gc_cuttoff: Option<Lsn>,
|
|
}
|
|
|
|
#[derive(Parser)]
|
|
struct PrintLayerFileCmd {
|
|
/// Pageserver data path
|
|
path: Utf8PathBuf,
|
|
}
|
|
|
|
/// Roll back the time for the specified prefix using S3 history.
|
|
///
|
|
/// The command is fairly low level and powerful. Validation is only very light,
|
|
/// so it is more powerful, and thus potentially more dangerous.
|
|
#[derive(Parser)]
|
|
struct TimeTravelRemotePrefixCmd {
|
|
/// A configuration string for the remote_storage configuration.
|
|
///
|
|
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
|
|
config_toml_str: String,
|
|
/// remote prefix to time travel recover. For safety reasons, we require it to contain
|
|
/// a timeline or tenant ID in the prefix.
|
|
prefix: String,
|
|
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
|
|
travel_to: String,
|
|
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
|
|
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
|
|
done_if_after: Option<String>,
|
|
}
|
|
|
|
#[derive(Parser)]
|
|
struct AnalyzeLayerMapCmd {
|
|
/// Pageserver data path
|
|
path: Utf8PathBuf,
|
|
/// Max holes
|
|
max_holes: Option<usize>,
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> anyhow::Result<()> {
|
|
logging::init(
|
|
LogFormat::Plain,
|
|
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
|
logging::Output::Stdout,
|
|
)?;
|
|
|
|
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
|
|
|
let cli = CliOpts::parse();
|
|
|
|
match cli.command {
|
|
Commands::Layer(cmd) => {
|
|
layers::main(&cmd).await?;
|
|
}
|
|
Commands::Metadata(cmd) => {
|
|
handle_metadata(&cmd)?;
|
|
}
|
|
Commands::IndexPart(cmd) => {
|
|
index_part::main(&cmd).await?;
|
|
}
|
|
Commands::DrawTimeline {} => {
|
|
draw_timeline_dir::main()?;
|
|
}
|
|
Commands::AnalyzeLayerMap(cmd) => {
|
|
layer_map_analyzer::main(&cmd).await?;
|
|
}
|
|
Commands::PrintLayerFile(cmd) => {
|
|
if let Err(e) = read_pg_control_file(&cmd.path) {
|
|
println!(
|
|
"Failed to read input file as a pg control one: {e:#}\n\
|
|
Attempting to read it as layer file"
|
|
);
|
|
print_layerfile(&cmd.path).await?;
|
|
}
|
|
}
|
|
Commands::TimeTravelRemotePrefix(cmd) => {
|
|
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
|
|
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
|
|
|
|
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
|
|
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
|
|
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
|
|
})?
|
|
} else {
|
|
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
|
|
tokio::time::sleep(SAFETY_MARGIN).await;
|
|
// Convert to string representation and back to get rid of sub-second values
|
|
let done_if_after = SystemTime::now();
|
|
tokio::time::sleep(SAFETY_MARGIN).await;
|
|
done_if_after
|
|
};
|
|
|
|
let timestamp = strip_subsecond(timestamp);
|
|
let done_if_after = strip_subsecond(done_if_after);
|
|
|
|
let Some(prefix) = validate_prefix(&cmd.prefix) else {
|
|
println!("specified prefix '{}' failed validation", cmd.prefix);
|
|
return Ok(());
|
|
};
|
|
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
|
|
let toml_item = toml_document
|
|
.get("remote_storage")
|
|
.expect("need remote_storage");
|
|
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
|
|
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
|
let cancel = CancellationToken::new();
|
|
storage
|
|
.unwrap()
|
|
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
|
.await?;
|
|
}
|
|
};
|
|
Ok(())
|
|
}
|
|
|
|
fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
|
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
|
|
println!("{control_file:?}");
|
|
let control_file_initdb = Lsn(control_file.checkPoint);
|
|
println!(
|
|
"pg_initdb_lsn: {}, aligned: {}",
|
|
control_file_initdb,
|
|
control_file_initdb.align()
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
|
// Basic initialization of things that don't change after startup
|
|
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
|
page_cache::init(100);
|
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
|
dump_layerfile_from_path(path, true, &ctx).await
|
|
}
|
|
|
|
fn handle_metadata(
|
|
MetadataCmd {
|
|
metadata_path: path,
|
|
disk_consistent_lsn,
|
|
prev_record_lsn,
|
|
latest_gc_cuttoff,
|
|
}: &MetadataCmd,
|
|
) -> Result<(), anyhow::Error> {
|
|
let metadata_bytes = std::fs::read(path)?;
|
|
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
|
println!("Current metadata:\n{meta:?}");
|
|
let mut update_meta = false;
|
|
// TODO: simplify this part
|
|
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
|
meta = TimelineMetadata::new(
|
|
*disk_consistent_lsn,
|
|
meta.prev_record_lsn(),
|
|
meta.ancestor_timeline(),
|
|
meta.ancestor_lsn(),
|
|
meta.latest_gc_cutoff_lsn(),
|
|
meta.initdb_lsn(),
|
|
meta.pg_version(),
|
|
);
|
|
update_meta = true;
|
|
}
|
|
if let Some(prev_record_lsn) = prev_record_lsn {
|
|
meta = TimelineMetadata::new(
|
|
meta.disk_consistent_lsn(),
|
|
Some(*prev_record_lsn),
|
|
meta.ancestor_timeline(),
|
|
meta.ancestor_lsn(),
|
|
meta.latest_gc_cutoff_lsn(),
|
|
meta.initdb_lsn(),
|
|
meta.pg_version(),
|
|
);
|
|
update_meta = true;
|
|
}
|
|
if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
|
|
meta = TimelineMetadata::new(
|
|
meta.disk_consistent_lsn(),
|
|
meta.prev_record_lsn(),
|
|
meta.ancestor_timeline(),
|
|
meta.ancestor_lsn(),
|
|
*latest_gc_cuttoff,
|
|
meta.initdb_lsn(),
|
|
meta.pg_version(),
|
|
);
|
|
update_meta = true;
|
|
}
|
|
|
|
if update_meta {
|
|
let metadata_bytes = meta.to_bytes()?;
|
|
std::fs::write(path, metadata_bytes)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Ensures that the given S3 prefix is sufficiently constrained.
|
|
/// The command is very risky already and we don't want to expose something
|
|
/// that allows usually unintentional and quite catastrophic time travel of
|
|
/// an entire bucket, which would be a major catastrophy and away
|
|
/// by only one character change (similar to "rm -r /home /username/foobar").
|
|
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
|
|
if prefix.is_empty() {
|
|
// Empty prefix means we want to specify the *whole* bucket
|
|
return None;
|
|
}
|
|
let components = prefix.split('/').collect::<Vec<_>>();
|
|
let (last, components) = {
|
|
let last = components.last()?;
|
|
if last.is_empty() {
|
|
(
|
|
components.iter().nth_back(1)?,
|
|
&components[..(components.len() - 1)],
|
|
)
|
|
} else {
|
|
(last, &components[..])
|
|
}
|
|
};
|
|
'valid: {
|
|
if let Ok(_timeline_id) = TimelineId::from_str(last) {
|
|
// Ends in either a tenant or timeline ID
|
|
break 'valid;
|
|
}
|
|
if *last == "timelines" {
|
|
if let Some(before_last) = components.iter().nth_back(1) {
|
|
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
|
|
// Has a valid tenant id
|
|
break 'valid;
|
|
}
|
|
}
|
|
}
|
|
|
|
return None;
|
|
}
|
|
RemotePath::from_string(prefix).ok()
|
|
}
|
|
|
|
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
|
|
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
|
|
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_validate_prefix() {
|
|
assert_eq!(validate_prefix(""), None);
|
|
assert_eq!(validate_prefix("/"), None);
|
|
#[track_caller]
|
|
fn assert_valid(prefix: &str) {
|
|
let remote_path = RemotePath::from_string(prefix).unwrap();
|
|
assert_eq!(validate_prefix(prefix), Some(remote_path));
|
|
}
|
|
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
|
|
// Path is not relative but absolute
|
|
assert_eq!(
|
|
validate_prefix(
|
|
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
|
|
),
|
|
None
|
|
);
|
|
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
|
|
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
|
|
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
|
|
assert_eq!(validate_prefix("wal"), None);
|
|
assert_eq!(validate_prefix("/wal/"), None);
|
|
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
|
|
// Partial tenant ID
|
|
assert_eq!(
|
|
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
|
|
None
|
|
);
|
|
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
|
|
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
|
|
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
|
|
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
|
|
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
|
|
}
|
|
}
|