From ed9ffb9af2ce30eff88e9c6fcfe0c315d69e025b Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 18 Jun 2024 13:44:30 +0100
Subject: [PATCH] pageserver: eliminate CalculateSyntheticSizeError::LsnNotFound (`test_metric_collection` flake) (#8065)

## Problem

```
ERROR synthetic_size_worker: failed to calculate synthetic size for tenant ae449af30216ac56d2c1173f894b1122: Could not find size at 0/218CA70 in timeline d8da32b5e3e0bf18cfdb560f9de29638\n')
```

e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/main/9518948590/index.html#/testresult/30a6d1e2471d2775

This test had allow lists but was disrupted by https://github.com/neondatabase/neon/pull/8051. In that PR, I had kept an error path in fill_logical_sizes that covered the case where we couldn't find sizes for some of the segments, but that path could only be hit in the case that some Timeline was shut down concurrently with a synthetic size calculation, so it makes sense to just leave the segment's size None in this case: the subsequent size calculations do not assume it is Some.

## Summary of changes

- Remove `CalculateSyntheticSizeError::LsnNotFound` and just proceed in the case where we used to return it
- Remove defunct allow list entries in `test_metric_collection`

--- pageserver/src/tenant/size.rs | 11 ++--------- .../regress/test_pageserver_metric_collection.py | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index cdd5b0cbe7..b2338b620e 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -60,10 +60,6 @@ pub(crate) enum CalculateSyntheticSizeError { #[error(transparent)] Fatal(anyhow::Error), - /// The LSN we are trying to calculate a size at no longer exists at the point we query it - #[error("Could not find size at {lsn} in timeline {timeline_id}")] - LsnNotFound { timeline_id: TimelineId, lsn: Lsn }, - /// Tenant shut down while calculating size #[error("Cancelled")] Cancelled, @@ -375,9 +371,8 @@ pub(super) async fn gather_inputs( /// Augment 'segments' with logical sizes /// -/// this will probably conflict with on-demand downloaded layers, or at least force them all -/// to be downloaded -/// +/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently +/// (i.e. we cannot read its logical size at a particular LSN). 
async fn fill_logical_sizes( timelines: &[Arc], segments: &mut [SegmentMeta], @@ -498,8 +493,6 @@ async fn fill_logical_sizes( if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); - } else { - return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn }); } } Ok(()) diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index b0465f2a96..cea35a6acb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -75,9 +75,6 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -238,9 +235,6 @@ def test_metric_collection_cleans_up_tempfile( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ] )