mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
Fix extreme metrics bloat in storage sync (#2506)
* Fix extreme metrics bloat in storage sync From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per (timeline, tenant) pair, plus another 117 metrics in a global histogram that replaces the previous per-timeline histogram. * Drop image sync operation metric series when dropping TimelineMetrics.
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use metrics::{
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
|
||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
|
||||
UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_remote_storage_image_sync_duration",
|
||||
"Time spent to synchronize (up/download) a whole pageserver image",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register per-timeline pageserver image sync time vec")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
|
||||
pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
|
||||
|
||||
pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_storage_image_sync_count",
|
||||
"Number of synchronization operations executed for pageserver images. \
|
||||
Grouped by tenant, timeline, operation_kind and status",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"]
|
||||
)
|
||||
.expect("failed to register pageserver image sync count vec")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_storage_image_sync_seconds",
|
||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"],
|
||||
Grouped by operation_kind and status",
|
||||
&["operation_kind", "status"],
|
||||
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
|
||||
)
|
||||
.expect("failed to register pageserver image sync time histogram vec")
|
||||
@@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets {
|
||||
() => {
|
||||
vec![
|
||||
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
|
||||
]
|
||||
};
|
||||
}
|
||||
@@ -411,6 +434,14 @@ impl Drop for TimelineMetrics {
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in IMAGE_SYNC_OPERATION_KINDS {
|
||||
for status in IMAGE_SYNC_STATUS {
|
||||
let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
|
||||
}
|
||||
}
|
||||
|
||||
let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -178,6 +178,7 @@ use crate::{
|
||||
TenantTimelineValues,
|
||||
};
|
||||
|
||||
use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use self::download::download_index_parts;
|
||||
@@ -835,7 +836,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
upload_data,
|
||||
sync_start,
|
||||
"upload",
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -879,7 +879,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
download_data,
|
||||
sync_start,
|
||||
"download",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -911,7 +910,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
delete_data,
|
||||
sync_start,
|
||||
"delete",
|
||||
)
|
||||
.instrument(info_span!("delete_timeline_data"))
|
||||
.await;
|
||||
@@ -948,8 +946,9 @@ async fn download_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
new_download_data: SyncData<LayersDownload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> DownloadStatus {
|
||||
static TASK_NAME: &str = "download";
|
||||
|
||||
match download_timeline_layers(
|
||||
conf,
|
||||
storage,
|
||||
@@ -961,19 +960,19 @@ async fn download_timeline_data(
|
||||
.await
|
||||
{
|
||||
DownloadedTimeline::Abort => {
|
||||
register_sync_status(sync_id, sync_start, task_name, None);
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, None);
|
||||
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
|
||||
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
|
||||
}
|
||||
}
|
||||
DownloadedTimeline::FailedAndRescheduled => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
}
|
||||
DownloadedTimeline::Successful(mut download_data) => {
|
||||
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
|
||||
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
return DownloadStatus::Downloaded;
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -984,7 +983,7 @@ async fn download_timeline_data(
|
||||
error!("Failed to update local timeline metadata: {e:?}");
|
||||
download_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1060,8 +1059,9 @@ async fn delete_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
mut new_delete_data: SyncData<LayersDeletion>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) {
|
||||
static TASK_NAME: &str = "delete";
|
||||
|
||||
let timeline_delete = &mut new_delete_data.data;
|
||||
|
||||
if !timeline_delete.deletion_registered {
|
||||
@@ -1077,14 +1077,14 @@ async fn delete_timeline_data(
|
||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||
new_delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
return;
|
||||
}
|
||||
}
|
||||
timeline_delete.deletion_registered = true;
|
||||
|
||||
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status));
|
||||
}
|
||||
|
||||
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
|
||||
@@ -1103,8 +1103,8 @@ async fn upload_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
new_upload_data: SyncData<LayersUpload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> UploadStatus {
|
||||
static TASK_NAME: &str = "upload";
|
||||
let mut uploaded_data = match upload_timeline_layers(
|
||||
storage,
|
||||
sync_queue,
|
||||
@@ -1115,7 +1115,7 @@ async fn upload_timeline_data(
|
||||
.await
|
||||
{
|
||||
UploadedTimeline::FailedAndRescheduled(e) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
return UploadStatus::Failed(e);
|
||||
}
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
@@ -1134,14 +1134,14 @@ async fn upload_timeline_data(
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
UploadStatus::Uploaded
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||
uploaded_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
UploadStatus::Failed(e)
|
||||
}
|
||||
}
|
||||
@@ -1391,16 +1391,22 @@ fn register_sync_status(
|
||||
|
||||
let tenant_id = sync_id.tenant_id.to_string();
|
||||
let timeline_id = sync_id.timeline_id.to_string();
|
||||
match sync_status {
|
||||
Some(true) => {
|
||||
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
|
||||
}
|
||||
Some(false) => {
|
||||
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
|
||||
}
|
||||
None => return,
|
||||
}
|
||||
.observe(secs_elapsed)
|
||||
|
||||
let sync_status = match sync_status {
|
||||
Some(true) => "success",
|
||||
Some(false) => "failure",
|
||||
None => "abort",
|
||||
};
|
||||
|
||||
IMAGE_SYNC_TIME_HISTOGRAM
|
||||
.with_label_values(&[sync_name, sync_status])
|
||||
.observe(secs_elapsed);
|
||||
IMAGE_SYNC_TIME
|
||||
.with_label_values(&[&tenant_id, &timeline_id])
|
||||
.add(secs_elapsed);
|
||||
IMAGE_SYNC_COUNT
|
||||
.with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status])
|
||||
.inc();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user