Fix extreme metrics bloat in storage sync (#2506)

* Fix extreme metrics bloat in storage sync

From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per
(timeline, tenant) pair, plus another 117 metrics in a global histogram that
replaces the previous per-timeline histogram.

* Drop image sync operation metric series when dropping TimelineMetrics.
This commit is contained in:
MMeent
2022-09-23 14:35:36 +02:00
committed by GitHub
parent 3e65209a06
commit bc3ba23e0a
2 changed files with 69 additions and 32 deletions

View File

@@ -1,8 +1,9 @@
use metrics::core::{AtomicU64, GenericCounter};
use metrics::{
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
UIntGaugeVec,
};
use once_cell::sync::Lazy;
use utils::id::{TenantId, TimelineId};
@@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
.expect("failed to register pageserver remote storage remaining sync items int gauge")
});
pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_remote_storage_image_sync_duration",
"Time spent to synchronize (up/download) a whole pageserver image",
&["tenant_id", "timeline_id"],
)
.expect("failed to register per-timeline pageserver image sync time vec")
});
pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_storage_image_sync_count",
"Number of synchronization operations executed for pageserver images. \
Grouped by tenant, timeline, operation_kind and status",
&["tenant_id", "timeline_id", "operation_kind", "status"]
)
.expect("failed to register pageserver image sync count vec")
});
pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_storage_image_sync_seconds",
"Time took to synchronize (download or upload) a whole pageserver image. \
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
&["tenant_id", "timeline_id", "operation_kind", "status"],
Grouped by operation_kind and status",
&["operation_kind", "status"],
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
)
.expect("failed to register pageserver image sync time histogram vec")
@@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets {
() => {
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
]
};
}
@@ -411,6 +434,14 @@ impl Drop for TimelineMetrics {
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in IMAGE_SYNC_OPERATION_KINDS {
for status in IMAGE_SYNC_STATUS {
let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
}
}
let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
}
}

View File

@@ -178,6 +178,7 @@ use crate::{
TenantTimelineValues,
};
use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use self::download::download_index_parts;
@@ -835,7 +836,6 @@ async fn process_sync_task_batch(
sync_id,
upload_data,
sync_start,
"upload",
)
.await
}
@@ -879,7 +879,6 @@ async fn process_sync_task_batch(
sync_id,
download_data,
sync_start,
"download",
)
.await;
}
@@ -911,7 +910,6 @@ async fn process_sync_task_batch(
sync_id,
delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
@@ -948,8 +946,9 @@ async fn download_timeline_data(
sync_id: TenantTimelineId,
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadStatus {
static TASK_NAME: &str = "download";
match download_timeline_layers(
conf,
storage,
@@ -961,19 +960,19 @@ async fn download_timeline_data(
.await
{
DownloadedTimeline::Abort => {
register_sync_status(sync_id, sync_start, task_name, None);
register_sync_status(sync_id, sync_start, TASK_NAME, None);
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
}
}
DownloadedTimeline::FailedAndRescheduled => {
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
}
DownloadedTimeline::Successful(mut download_data) => {
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
return DownloadStatus::Downloaded;
}
Err(e) => {
@@ -984,7 +983,7 @@ async fn download_timeline_data(
error!("Failed to update local timeline metadata: {e:?}");
download_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Download(download_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
}
}
}
@@ -1060,8 +1059,9 @@ async fn delete_timeline_data(
sync_id: TenantTimelineId,
mut new_delete_data: SyncData<LayersDeletion>,
sync_start: Instant,
task_name: &str,
) {
static TASK_NAME: &str = "delete";
let timeline_delete = &mut new_delete_data.data;
if !timeline_delete.deletion_registered {
@@ -1077,14 +1077,14 @@ async fn delete_timeline_data(
error!("Failed to update remote timeline {sync_id}: {e:?}");
new_delete_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
return;
}
}
timeline_delete.deletion_registered = true;
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status));
}
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
@@ -1103,8 +1103,8 @@ async fn upload_timeline_data(
sync_id: TenantTimelineId,
new_upload_data: SyncData<LayersUpload>,
sync_start: Instant,
task_name: &str,
) -> UploadStatus {
static TASK_NAME: &str = "upload";
let mut uploaded_data = match upload_timeline_layers(
storage,
sync_queue,
@@ -1115,7 +1115,7 @@ async fn upload_timeline_data(
.await
{
UploadedTimeline::FailedAndRescheduled(e) => {
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
return UploadStatus::Failed(e);
}
UploadedTimeline::Successful(upload_data) => upload_data,
@@ -1134,14 +1134,14 @@ async fn upload_timeline_data(
.await
{
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
UploadStatus::Uploaded
}
Err(e) => {
error!("Failed to update remote timeline {sync_id}: {e:?}");
uploaded_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
UploadStatus::Failed(e)
}
}
@@ -1391,16 +1391,22 @@ fn register_sync_status(
let tenant_id = sync_id.tenant_id.to_string();
let timeline_id = sync_id.timeline_id.to_string();
match sync_status {
Some(true) => {
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
}
Some(false) => {
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
}
None => return,
}
.observe(secs_elapsed)
let sync_status = match sync_status {
Some(true) => "success",
Some(false) => "failure",
None => "abort",
};
IMAGE_SYNC_TIME_HISTOGRAM
.with_label_values(&[sync_name, sync_status])
.observe(secs_elapsed);
IMAGE_SYNC_TIME
.with_label_values(&[&tenant_id, &timeline_id])
.add(secs_elapsed);
IMAGE_SYNC_COUNT
.with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status])
.inc();
}
#[cfg(test)]