mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
libs: tweak PageserverUtilization::is_overloaded (#8946)
## Problem Having run in production for a while, we see that nodes are generally safely oversubscribed by about a factor of 2. ## Summary of changes Tweak the is_overloaded method to check for utililzation over 200% rather than over 100%
This commit is contained in:
@@ -89,8 +89,19 @@ impl PageserverUtilization {
|
||||
|
||||
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
||||
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
||||
///
|
||||
/// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
|
||||
/// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
|
||||
pub fn is_overloaded(score: RawScore) -> bool {
|
||||
score >= Self::UTILIZATION_FULL
|
||||
// Why the factor of two? This is unscientific but reflects behavior of real systems:
|
||||
// - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
|
||||
// startup and housekeeping jobs nice and responsive. We can go to double this limit if needed
|
||||
// until some more nodes are deployed.
|
||||
// - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
|
||||
// hold its biggest timeline fully on disk, which is tends to be an over estimate when
|
||||
// some tenants are very idle and have dropped layers from disk. In practice going up to
|
||||
// double is generally better than giving up and scheduling in a sub-optimal AZ.
|
||||
score >= 2 * Self::UTILIZATION_FULL
|
||||
}
|
||||
|
||||
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
||||
|
||||
Reference in New Issue
Block a user