mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 23:20:40 +00:00
Remote storage metrics (#4892)
We don't know how our s3 remote_storage is performing, or if it's blocking the shutdown. Well, for sampling reasons, we will not really know even after this PR. Add metrics: - align remote_storage metrics towards #4813 goals - histogram `remote_storage_s3_request_seconds{request_type=(get_object|put_object|delete_object|list_objects), result=(ok|err|cancelled)}` - histogram `remote_storage_s3_wait_seconds{request_type=(same kinds)}` - counter `remote_storage_s3_cancelled_waits_total{request_type=(same kinds)}` Follow-up work: - After release, remove the old metrics, migrate dashboards Histogram buckets are rough guesses, need to be tuned. In pageserver we have a download timeout of 120s, so I think the 100s bucket is quite nice.
This commit is contained in:
@@ -476,7 +476,7 @@ class NeonEnvBuilder:
|
||||
|
||||
# Prepare the default branch to start the postgres on later.
|
||||
# Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
|
||||
log.info(
|
||||
log.debug(
|
||||
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
|
||||
)
|
||||
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
|
||||
|
||||
@@ -272,6 +272,23 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
wait_timeline_detail_404(
|
||||
ps_http, env.initial_tenant, timeline_id, iterations=iterations
|
||||
)
|
||||
|
||||
if failpoint == "timeline-delete-after-index-delete":
|
||||
m = ps_http.get_metrics()
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "err"},
|
||||
).value
|
||||
== 1
|
||||
)
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "ok"},
|
||||
).value
|
||||
== 1
|
||||
)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
# this also checks that delete can be retried even when timeline is in Broken state
|
||||
|
||||
Reference in New Issue
Block a user