Files
neon/storage_scrubber/src/checks.rs
Arpad Müller 920040e402 Update storage components to edition 2024 (#10919)
Updates storage components to edition 2024. We like to stay on the
latest edition if possible. There is no functional changes, however some
code changes had to be done to accommodate the edition's breaking
changes.

The PR has two commits:

* the first commit updates storage crates to edition 2024 and appeases
`cargo clippy` by changing code. i have accidentially ran the formatter
on some files that had other edits.
* the second commit performs a `cargo fmt`

I would recommend a closer review of the first commit and a less close
review of the second one (as it just runs `cargo fmt`).

part of https://github.com/neondatabase/neon/issues/10918
2025-02-25 23:51:37 +00:00

682 lines
26 KiB
Rust

use std::collections::{HashMap, HashSet};
use std::time::SystemTime;
use futures_util::StreamExt;
use itertools::Itertools;
use pageserver::tenant::IndexPart;
use pageserver::tenant::checks::check_valid_layermap;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::manifest::TenantManifest;
use pageserver::tenant::remote_timeline_client::{
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
};
use pageserver::tenant::storage_layer::LayerName;
use pageserver_api::shard::ShardIndex;
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
use tokio_util::sync::CancellationToken;
use tracing::{info, warn};
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::shard::TenantShardId;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries};
pub(crate) struct TimelineAnalysis {
/// Anomalies detected
pub(crate) errors: Vec<String>,
/// Healthy-but-noteworthy, like old-versioned structures that are readable but
/// worth reporting for awareness that we must not remove that old version decoding
/// yet.
pub(crate) warnings: Vec<String>,
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices, and not initdb archive.
pub(crate) unknown_keys: Vec<String>,
}
impl TimelineAnalysis {
fn new() -> Self {
Self {
errors: Vec::new(),
warnings: Vec::new(),
unknown_keys: Vec::new(),
}
}
/// Whether a timeline is healthy.
pub(crate) fn is_healthy(&self) -> bool {
self.errors.is_empty() && self.warnings.is_empty()
}
}
pub(crate) async fn branch_cleanup_and_check_errors(
remote_client: &GenericRemoteStorage,
id: &TenantShardTimelineId,
tenant_objects: &mut TenantObjectListing,
s3_active_branch: Option<&BranchData>,
console_branch: Option<BranchData>,
s3_data: Option<RemoteTimelineBlobData>,
) -> TimelineAnalysis {
let mut result = TimelineAnalysis::new();
info!("Checking timeline");
if let Some(s3_active_branch) = s3_active_branch {
info!(
"Checking console status for timeline for branch {:?}/{:?}",
s3_active_branch.project_id, s3_active_branch.id
);
match console_branch {
Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
s3_active_branch.id, s3_active_branch.project_id))
},
None => {
result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
s3_active_branch.id, s3_active_branch.project_id))
}
};
}
match s3_data {
Some(s3_data) => {
result
.unknown_keys
.extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation: _,
s3_layers: _,
index_part_last_modified_time,
index_part_snapshot_time,
} => {
// Ignore missing file error if index_part downloaded is different from the one when listing the layer files.
let ignore_error = index_part_snapshot_time < index_part_last_modified_time
&& !cfg!(debug_assertions);
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
result
.errors
.push(format!("index_part.json version: {}", index_part.version()))
}
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
if !newest_versions.any(|ip| ip == &index_part.version()) {
info!(
"index_part.json version is not latest: {}",
index_part.version()
);
}
if index_part.metadata.disk_consistent_lsn()
!= index_part.duplicated_disk_consistent_lsn()
{
// Tech debt: let's get rid of one of these, they are redundant
// https://github.com/neondatabase/neon/issues/8343
result.errors.push(format!(
"Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
index_part.metadata.disk_consistent_lsn(),
index_part.duplicated_disk_consistent_lsn(),
))
}
if index_part.layer_metadata.is_empty() {
if index_part.metadata.ancestor_timeline().is_none() {
// The initial timeline with no ancestor should ALWAYS have layers.
result.errors.push(
"index_part.json has no layers (ancestor_timeline=None)"
.to_string(),
);
} else {
// Not an error, can happen for branches with zero writes, but notice that
info!("index_part.json has no layers (ancestor_timeline exists)");
}
}
let layer_names = index_part.layer_metadata.keys().cloned().collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
result.warnings.push(format!(
"index_part.json contains invalid layer map structure: {err}"
));
}
for (layer, metadata) in index_part.layer_metadata {
if metadata.file_size == 0 {
result.errors.push(format!(
"index_part.json contains a layer {} that has 0 size in its layer metadata", layer,
))
}
if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
let path = remote_layer_path(
&id.tenant_shard_id.tenant_id,
&id.timeline_id,
metadata.shard,
&layer,
metadata.generation,
);
// HEAD request used here to address a race condition when an index was uploaded concurrently
// with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
let response = remote_client
.head_object(&path, &CancellationToken::new())
.await;
if response.is_err() {
// Object is not present.
let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
let msg = format!(
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
layer,
metadata.generation.get_suffix(),
metadata.shard,
is_l0,
);
if is_l0 || ignore_error {
result.warnings.push(msg);
} else {
result.errors.push(msg);
}
}
}
}
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect {
errors,
s3_layers: _,
} => result.errors.extend(
errors
.into_iter()
.map(|error| format!("parse error: {error}")),
),
}
}
None => result
.errors
.push("Timeline has no data on S3 at all".to_string()),
}
if result.errors.is_empty() {
info!("No check errors found");
} else {
warn!("Timeline metadata errors: {0:?}", result.errors);
}
if !result.warnings.is_empty() {
warn!("Timeline metadata warnings: {0:?}", result.warnings);
}
if !result.unknown_keys.is_empty() {
warn!(
"The following keys are not recognized: {0:?}",
result.unknown_keys
)
}
result
}
#[derive(Default)]
pub(crate) struct LayerRef {
ref_count: usize,
}
/// Top-level index of objects in a tenant. This may be used by any shard-timeline within
/// the tenant to query whether an object exists.
#[derive(Default)]
pub(crate) struct TenantObjectListing {
shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>,
}
impl TenantObjectListing {
/// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall
/// list of layer keys for the Tenant.
pub(crate) fn push(
&mut self,
ttid: TenantShardTimelineId,
layers: HashSet<(LayerName, Generation)>,
) {
let shard_index = ShardIndex::new(
ttid.tenant_shard_id.shard_number,
ttid.tenant_shard_id.shard_count,
);
let replaced = self.shard_timelines.insert(
(shard_index, ttid.timeline_id),
layers
.into_iter()
.map(|l| (l, LayerRef::default()))
.collect(),
);
assert!(
replaced.is_none(),
"Built from an S3 object listing, which should never repeat a key"
);
}
/// Having loaded a timeline index, check if a layer referenced by the index exists. If it does,
/// the layer's refcount will be incremented. Later, after calling this for all references in all indices
/// in a tenant, orphan layers may be detected by their zero refcounts.
///
/// Returns true if the layer exists
pub(crate) fn check_ref(
&mut self,
timeline_id: TimelineId,
layer_file: &LayerName,
metadata: &LayerFileMetadata,
) -> bool {
let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
return false;
};
let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
return false;
};
layer_ref.ref_count += 1;
true
}
pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> {
let mut result = Vec::new();
for ((shard_index, timeline_id), layers) in &self.shard_timelines {
for ((layer_file, generation), layer_ref) in layers {
if layer_ref.ref_count == 0 {
result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
}
}
}
result
}
}
#[derive(Debug)]
pub(crate) struct RemoteTimelineBlobData {
pub(crate) blob_data: BlobDataParseResult,
/// Index objects that were not used when loading `blob_data`, e.g. those from old generations
pub(crate) unused_index_keys: Vec<ListingObject>,
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices
pub(crate) unknown_keys: Vec<ListingObject>,
}
#[derive(Debug)]
pub(crate) enum BlobDataParseResult {
Parsed {
index_part: Box<IndexPart>,
index_part_generation: Generation,
index_part_last_modified_time: SystemTime,
index_part_snapshot_time: SystemTime,
s3_layers: HashSet<(LayerName, Generation)>,
},
/// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index)
Relic,
Incorrect {
errors: Vec<String>,
s3_layers: HashSet<(LayerName, Generation)>,
},
}
pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
match name.rsplit_once('-') {
// FIXME: this is gross, just use a regex?
Some((layer_filename, gen_)) if gen_.len() == 8 => {
let layer = layer_filename.parse::<LayerName>()?;
let gen_ =
Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?;
Ok((layer, gen_))
}
_ => Ok((name.parse::<LayerName>()?, Generation::none())),
}
}
/// Note (<https://github.com/neondatabase/neon/issues/8872>):
/// Since we do not gurantee the order of the listing, we could list layer keys right before
/// pageserver `RemoteTimelineClient` deletes the layer files and then the index.
/// In the rare case, this would give back a transient error where the index key is missing.
///
/// To avoid generating false positive, we try streaming the listing for a second time.
pub(crate) async fn list_timeline_blobs(
remote_client: &GenericRemoteStorage,
id: TenantShardTimelineId,
root_target: &RootTarget,
) -> anyhow::Result<RemoteTimelineBlobData> {
let res = list_timeline_blobs_impl(remote_client, id, root_target).await?;
match res {
ListTimelineBlobsResult::Ready(data) => Ok(data),
ListTimelineBlobsResult::MissingIndexPart(_) => {
// Retry if listing raced with removal of an index
let data = list_timeline_blobs_impl(remote_client, id, root_target)
.await?
.into_data();
Ok(data)
}
}
}
enum ListTimelineBlobsResult {
/// Blob data is ready to be intepreted.
Ready(RemoteTimelineBlobData),
/// The listing contained an index but when we tried to fetch it, we couldn't
MissingIndexPart(RemoteTimelineBlobData),
}
impl ListTimelineBlobsResult {
/// Get the inner blob data regardless the status.
pub fn into_data(self) -> RemoteTimelineBlobData {
match self {
ListTimelineBlobsResult::Ready(data) => data,
ListTimelineBlobsResult::MissingIndexPart(data) => data,
}
}
}
/// Returns [`ListTimelineBlobsResult::MissingIndexPart`] if blob data has layer files
/// but is missing [`IndexPart`], otherwise returns [`ListTimelineBlobsResult::Ready`].
async fn list_timeline_blobs_impl(
remote_client: &GenericRemoteStorage,
id: TenantShardTimelineId,
root_target: &RootTarget,
) -> anyhow::Result<ListTimelineBlobsResult> {
let mut s3_layers = HashSet::new();
let mut errors = Vec::new();
let mut unknown_keys = Vec::new();
let mut timeline_dir_target = root_target.timeline_root(&id);
timeline_dir_target.delimiter = String::new();
let mut index_part_keys: Vec<ListingObject> = Vec::new();
let mut initdb_archive: bool = false;
let prefix_str = &timeline_dir_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&timeline_dir_target.prefix_in_bucket);
let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
while let Some(obj) = stream.next().await {
let (key, Some(obj)) = obj? else {
panic!("ListingObject not specified");
};
let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
match blob_name {
Some(name) if name.starts_with("index_part.json") => {
tracing::debug!("Index key {key}");
index_part_keys.push(obj)
}
Some("initdb.tar.zst") => {
tracing::debug!("initdb archive {key}");
initdb_archive = true;
}
Some("initdb-preserved.tar.zst") => {
tracing::info!("initdb archive preserved {key}");
}
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
Ok((new_layer, gen_)) => {
tracing::debug!("Parsed layer key: {new_layer} {gen_:?}");
s3_layers.insert((new_layer, gen_));
}
Err(e) => {
tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}");
unknown_keys.push(obj);
}
},
None => {
tracing::info!("S3 listed an unknown key: {key}");
unknown_keys.push(obj);
}
}
}
if index_part_keys.is_empty() && s3_layers.is_empty() {
tracing::debug!("Timeline is empty: expected post-deletion state.");
if initdb_archive {
tracing::info!("Timeline is post deletion but initdb archive is still present.");
}
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Relic,
unused_index_keys: index_part_keys,
unknown_keys,
}));
}
// Choose the index_part with the highest generation
let (index_part_object, index_part_generation) = match index_part_keys
.iter()
.filter_map(|key| {
// Stripping the index key to the last part, because RemotePath doesn't
// like absolute paths, and depending on prefix_in_bucket it's possible
// for the keys we read back to start with a slash.
let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
})
.max_by_key(|i| i.1)
.map(|(k, g)| (k.clone(), g))
{
Some((key, gen_)) => (Some::<ListingObject>(key.to_owned()), gen_),
None => {
// Legacy/missing case: one or zero index parts, which did not have a generation
(index_part_keys.pop(), Generation::none())
}
};
match index_part_object.as_ref() {
Some(selected) => index_part_keys.retain(|k| k != selected),
None => {
// This case does not indicate corruption, but it should be very unusual. It can
// happen if:
// - timeline creation is in progress (first layer is written before index is written)
// - timeline deletion happened while a stale pageserver was still attached, it might upload
// a layer after the deletion is done.
tracing::info!(
"S3 list response got no index_part.json file but still has layer files"
);
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Relic,
unused_index_keys: index_part_keys,
unknown_keys,
}));
}
}
if let Some(index_part_object_key) = index_part_object.as_ref() {
let (index_part_bytes, index_part_last_modified_time) =
match download_object_with_retries(remote_client, &index_part_object_key.key).await {
Ok(data) => data,
Err(e) => {
// It is possible that the branch gets deleted in-between we list the objects
// and we download the index part file.
errors.push(format!("failed to download index_part.json: {e}"));
return Ok(ListTimelineBlobsResult::MissingIndexPart(
RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
},
));
}
};
let index_part_snapshot_time = index_part_object_key.last_modified;
match serde_json::from_slice(&index_part_bytes) {
Ok(index_part) => {
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Parsed {
index_part: Box::new(index_part),
index_part_generation,
s3_layers,
index_part_last_modified_time,
index_part_snapshot_time,
},
unused_index_keys: index_part_keys,
unknown_keys,
}));
}
Err(index_parse_error) => errors.push(format!(
"index_part.json body parsing error: {index_parse_error}"
)),
}
}
if errors.is_empty() {
errors.push(
"Unexpected: no errors did not lead to a successfully parsed blob return".to_string(),
);
}
Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
}))
}
pub(crate) struct RemoteTenantManifestInfo {
pub(crate) generation: Generation,
pub(crate) manifest: TenantManifest,
pub(crate) listing_object: ListingObject,
}
pub(crate) enum ListTenantManifestResult {
WithErrors {
errors: Vec<(String, String)>,
#[allow(dead_code)]
unknown_keys: Vec<ListingObject>,
},
NoErrors {
latest_generation: Option<RemoteTenantManifestInfo>,
manifests: Vec<(Generation, ListingObject)>,
},
}
/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
pub(crate) async fn list_tenant_manifests(
remote_client: &GenericRemoteStorage,
tenant_id: TenantShardId,
root_target: &RootTarget,
) -> anyhow::Result<ListTenantManifestResult> {
let mut errors = Vec::new();
let mut unknown_keys = Vec::new();
let mut tenant_root_target = root_target.tenant_root(&tenant_id);
let original_prefix = tenant_root_target.prefix_in_bucket.clone();
const TENANT_MANIFEST_STEM: &str = "tenant-manifest";
tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM;
tenant_root_target.delimiter = String::new();
let mut manifests: Vec<(Generation, ListingObject)> = Vec::new();
let prefix_str = &original_prefix
.strip_prefix("/")
.unwrap_or(&original_prefix);
let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target));
'outer: while let Some(obj) = stream.next().await {
let (key, Some(obj)) = obj? else {
panic!("ListingObject not specified");
};
'err: {
// TODO a let chain would be nicer here.
let Some(name) = key.object_name() else {
break 'err;
};
if !name.starts_with(TENANT_MANIFEST_STEM) {
break 'err;
}
let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else {
break 'err;
};
tracing::debug!("tenant manifest {key}");
manifests.push((generation, obj));
continue 'outer;
}
tracing::info!("Listed an unknown key: {key}");
unknown_keys.push(obj);
}
if !unknown_keys.is_empty() {
errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));
return Ok(ListTenantManifestResult::WithErrors {
errors,
unknown_keys,
});
}
if manifests.is_empty() {
tracing::debug!("No manifest for timeline.");
return Ok(ListTenantManifestResult::NoErrors {
latest_generation: None,
manifests,
});
}
// Find the manifest with the highest generation
let (latest_generation, latest_listing_object) = manifests
.iter()
.max_by_key(|i| i.0)
.map(|(g, obj)| (*g, obj.clone()))
.unwrap();
manifests.retain(|(gen_, _obj)| gen_ != &latest_generation);
let manifest_bytes =
match download_object_with_retries(remote_client, &latest_listing_object.key).await {
Ok((bytes, _)) => bytes,
Err(e) => {
// It is possible that the tenant gets deleted in-between we list the objects
// and we download the manifest file.
errors.push((
latest_listing_object.key.get_path().as_str().to_owned(),
format!("failed to download tenant-manifest.json: {e}"),
));
return Ok(ListTenantManifestResult::WithErrors {
errors,
unknown_keys,
});
}
};
match TenantManifest::from_json_bytes(&manifest_bytes) {
Ok(manifest) => {
return Ok(ListTenantManifestResult::NoErrors {
latest_generation: Some(RemoteTenantManifestInfo {
generation: latest_generation,
manifest,
listing_object: latest_listing_object,
}),
manifests,
});
}
Err(parse_error) => errors.push((
latest_listing_object.key.get_path().as_str().to_owned(),
format!("tenant-manifest.json body parsing error: {parse_error}"),
)),
}
if errors.is_empty() {
errors.push((
(*prefix_str).to_owned(),
"Unexpected: no errors did not lead to a successfully parsed blob return".to_string(),
));
}
Ok(ListTenantManifestResult::WithErrors {
errors,
unknown_keys,
})
}