mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
pageserver: exclude un-read layers from short residence statistic (#8396)
## Problem The `evictions_with_low_residence_duration` is used as an indicator of cache thrashing. However, there are situations where it is quite legitimate to only have a short residence during compaction, where a delta is downloaded, used to generate an image layer, and then discarded. This can lead to false positive alerts. ## Summary of changes - Only track low residence duration for layers that have been accessed at least once (compaction doesn't count as an access). This will give us a metric that indicates thrashing on layers that the _user_ is using, rather than those we're downloading for housekeeping purposes. Once we add "layer visibility" as an explicit property of layers, this can also be used as a cleaner condition (residence of non-visible layers should never be alertable)
This commit is contained in:
@@ -676,6 +676,26 @@ impl LayerAccessStats {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
|
||||
///
|
||||
/// This indicates whether the layer has been used for some purpose that would motivate
|
||||
/// us to keep it on disk, such as for serving a getpage request.
|
||||
fn accessed(&self) -> bool {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
|
||||
// Consider it accessed if the most recent access is more recent than
|
||||
// the most recent change in residence status.
|
||||
match (
|
||||
inner.last_accesses.recent(),
|
||||
inner.last_residence_changes.recent(),
|
||||
) {
|
||||
(None, _) => false,
|
||||
(Some(_), None) => true,
|
||||
(Some(a), Some(r)) => a.when >= r.timestamp,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
|
||||
@@ -1469,14 +1469,22 @@ impl LayerInner {
|
||||
let duration = SystemTime::now().duration_since(local_layer_mtime);
|
||||
match duration {
|
||||
Ok(elapsed) => {
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
let accessed = self.access_stats.accessed();
|
||||
if accessed {
|
||||
// Only layers used for reads contribute to our "low residence" metric that is used
|
||||
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
|
||||
// to be rapidly evicted without contributing to this metric.
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
residence_millis = elapsed.as_millis(),
|
||||
accessed,
|
||||
"evicted layer after known residence period"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ from fixtures.neon_fixtures import (
|
||||
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
|
||||
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -265,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
(tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# When we evict/download layers, we will use this Workload to generate getpage requests
|
||||
# that touch some layers, as otherwise the pageserver doesn't report totally unused layers
|
||||
# as problems when they have short residence duration.
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(100)
|
||||
|
||||
def get_metric():
|
||||
metrics = ps_http.get_metrics()
|
||||
metric = metrics.query_one(
|
||||
@@ -285,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
assert default_value == "1day"
|
||||
|
||||
ps_http.download_all_layers(tenant_id, timeline_id)
|
||||
workload.validate()
|
||||
ps_http.evict_all_layers(tenant_id, timeline_id)
|
||||
metric = get_metric()
|
||||
assert int(metric.value) > 0, "metric is updated"
|
||||
@@ -305,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
assert int(metric.value) == 0
|
||||
|
||||
ps_http.download_all_layers(tenant_id, timeline_id)
|
||||
workload.validate()
|
||||
ps_http.evict_all_layers(tenant_id, timeline_id)
|
||||
metric = get_metric()
|
||||
assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60
|
||||
@@ -318,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
assert int(metric.value) == 0, "value resets if label changes"
|
||||
|
||||
ps_http.download_all_layers(tenant_id, timeline_id)
|
||||
workload.validate()
|
||||
ps_http.evict_all_layers(tenant_id, timeline_id)
|
||||
metric = get_metric()
|
||||
assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60
|
||||
|
||||
Reference in New Issue
Block a user