mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
remote_storage: don't count 404s as errors (#6201)
## Problem Currently a chart of S3 error rate is misleading: it can show errors any time we are attaching a tenant (probing for index_part generation, checking for remote delete marker). Considering 404 successful isn't perfectly elegant, but it enables the error rate to be used a a more meaningful alert signal: it would indicate if we were having auth issues, sending bad requests, getting throttled ,etc. ## Summary of changes Track 404 requests in the AttemptOutcome::Ok bucket instead of the AttemptOutcome::Err bucket.
This commit is contained in:
@@ -218,14 +218,6 @@ impl S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
@@ -241,11 +233,27 @@ impl S3Bucket {
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
ps_http, env.initial_tenant, timeline_id, iterations=iterations
|
||||
)
|
||||
|
||||
if failpoint == "timeline-delete-after-index-delete":
|
||||
m = ps_http.get_metrics()
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "ok"},
|
||||
).value
|
||||
== 1 # index part for initial timeline
|
||||
)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
# this also checks that delete can be retried even when timeline is in Broken state
|
||||
|
||||
Reference in New Issue
Block a user