## Problem

Previously, the controller used only shard counts for scheduling. This works well when hosting only many-sharded tenants, but much less well when hosting single-sharded tenants, whose size-per-shard varies far more.

Closes: https://github.com/neondatabase/neon/issues/7798

## Summary of changes

- Instead of `UtilizationScore`, carry the full `PageserverUtilization` through into the `Scheduler`.
- Use `PageserverUtilization::score()` instead of shard count when ordering nodes in scheduling (see the sketch below).

Q: Why did `test_sharding_split_smoke` need updating in this PR?

A: There's an interesting side effect during shard splits: because we do not decrement the shard count in the utilization when we de-schedule the pre-split shards, the controller will now prefer to pick _different_ nodes for shards than the ones that held secondaries before the split. We could use our knowledge of splitting to fix up the utilizations more actively in this situation, but I'm leaning toward leaving the code simpler, since in practical systems the impact of one shard on a node's utilization should be fairly low (single-digit percent).
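The ordering change is the core of this PR. Below is a minimal before/after sketch, not the controller's actual code: `Node` and its fields are simplified stand-ins, whereas the real `Scheduler` keeps a full `PageserverUtilization` per node and orders by its `score()` method.

```rust
/// Simplified stand-in for the scheduler's per-node state (hypothetical; the
/// real controller stores a full PageserverUtilization here).
struct Node {
    id: u64,
    shard_count: u32,
    /// Composite score derived from disk usage as well as shard count.
    utilization_score: u64,
}

/// Pick the least-loaded node as the target for the next shard.
fn pick_target(nodes: &mut [Node]) -> Option<u64> {
    // Before this PR: candidates were ordered by raw shard count alone, which
    // treats a huge single-sharded tenant the same as a tiny one.
    // nodes.sort_by_key(|n| n.shard_count);

    // After: order by the composite utilization score, so nodes holding a few
    // very large single-sharded tenants stop looking emptier than they are.
    nodes.sort_by_key(|n| n.utilization_score);
    nodes.first().map(|n| n.id)
}
```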
71 lines · 2.3 KiB · Rust
```rust
//! A utilization metric used to decide which pageserver to put the next tenant on.
//!
//! The metric is exposed via `GET /v1/utilization`. Refer to and maintain its OpenAPI
//! spec as the source of truth.

use anyhow::Context;
use std::path::Path;
use utils::serde_percent::Percent;

use pageserver_api::models::PageserverUtilization;

use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};

pub(crate) fn regenerate(
    conf: &PageServerConf,
    tenants_path: &Path,
    tenant_manager: &TenantManager,
) -> anyhow::Result<PageserverUtilization> {
    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
        .map_err(std::io::Error::from)
        .context("statvfs tenants directory")?;

    // Use the fragment size (f_frsize) when it is set: POSIX specifies that the
    // block counts below are in f_frsize units, not f_bsize.
    // https://unix.stackexchange.com/a/703650
    let blocksz = if statvfs.fragment_size() > 0 {
        statvfs.fragment_size()
    } else {
        statvfs.block_size()
    };

    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let free = statvfs.blocks_available() as u64 * blocksz;

    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let used = statvfs
        .blocks()
        // use blocks_free instead of blocks_available here to match df, in case someone compares
        .saturating_sub(statvfs.blocks_free()) as u64
        * blocksz;
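
    // Worked example of the arithmetic above, with illustrative numbers (not
    // from any real system): fragment_size = 4096, blocks = 10_000_000,
    // blocks_free = 4_000_000, blocks_available = 3_750_000 (smaller than
    // blocks_free because some blocks are reserved for root).
    //
    //   free = 3_750_000 * 4096 = 15_360_000_000 bytes we may actually use
    //   used = (10_000_000 - 4_000_000) * 4096 = 24_576_000_000 bytes, as `df` reports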

    let captured_at = std::time::SystemTime::now();

    // Calculate aggregate utilization from tenants on this pageserver
    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;

    // Fetch the fraction of disk space which may be used
    let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
        Some(e) => e.max_usage_pct,
        None => Percent::new(100).unwrap(),
    };

    // Express a static value for how many shards we may schedule on one node
    const MAX_SHARDS: u32 = 20000;

    let mut doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
        disk_wanted_bytes,
        disk_usable_pct,
        shard_count,
        max_shard_count: MAX_SHARDS,
        utilization_score: None,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };

    // Initialize `PageserverUtilization::utilization_score`: `cached_score()`
    // computes the score and stores it in the struct (hence `mut doc`), so the
    // document we return carries the same value we export as a metric.
    let score = doc.cached_score();
    NODE_UTILIZATION_SCORE.set(score);

    Ok(doc)
}
```
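For context, here is a hypothetical caller, not taken from the repo, showing how the handler behind `GET /v1/utilization` might use `regenerate`. The `tenants_path()` accessor and the `Serialize` impl on `PageserverUtilization` are assumptions about the surrounding crate:

```rust
// Hypothetical sketch only: `tenants_path()` and the Serialize impl are
// assumed, not confirmed by the file above.
fn utilization_handler(
    conf: &PageServerConf,
    tenant_manager: &TenantManager,
) -> anyhow::Result<String> {
    let doc = regenerate(conf, &conf.tenants_path(), tenant_manager)?;
    // Return the document as JSON; its fields correspond to the OpenAPI
    // schema mentioned in the module docs.
    Ok(serde_json::to_string(&doc)?)
}
```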