Compare commits

..

15 Commits

Author SHA1 Message Date
Konstantin Knizhnik
1c4bed27be Resolve merge conflicts 2025-04-16 08:20:46 +03:00
Konstantin Knizhnik
fdf0f1bdc0 Fix rust formatting 2025-04-16 07:49:47 +03:00
Konstantin Knizhnik
0bdd388dd8 Make it possible to control lazy_sru_download through tenant config 2025-04-16 07:49:47 +03:00
Konstantin Knizhnik
712b4cf83c Update compute_tools/src/compute.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-04-16 07:49:46 +03:00
Konstantin Knizhnik
15b6bb5026 Update libs/compute_api/src/spec.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-04-16 07:49:44 +03:00
Konstantin Knizhnik
61d642e541 Update pageserver/src/page_service.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-04-16 07:48:59 +03:00
Konstantin Knizhnik
1d24b887b8 Refector construction of basebackup command 2025-04-16 07:48:58 +03:00
Konstantin Knizhnik
955175c791 Make clippy happy 2025-04-16 07:48:58 +03:00
Konstantin Knizhnik
5fb0bcdd6a Make clippy happy 2025-04-16 07:48:58 +03:00
Konstantin Knizhnik
f146fa86f8 Use lazy SLRU download for all timelines is feature flag is set 2025-04-16 07:48:56 +03:00
Konstantin Knizhnik
961008116b Use lazy SLRU download for all timelines is feature flag is set 2025-04-16 07:47:55 +03:00
Konstantin Knizhnik
42d2d3addc Fix checking lazy SLRU download condition 2025-04-16 07:45:35 +03:00
Konstantin Knizhnik
06d0bed566 Always update lazy_slru_download flag during basebackup 2025-04-16 07:45:35 +03:00
Konstantin Knizhnik
aa367e5d82 Add lazy_slru_download_threshold parameter to page server config 2025-04-16 07:45:33 +03:00
Konstantin Knizhnik
6b76e1c526 Add lazy_slru_download compute feature flag 2025-04-16 07:42:32 +03:00
77 changed files with 6216 additions and 848 deletions

3
Cargo.lock generated
View File

@@ -1432,6 +1432,7 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"pem",
"pkcs8 0.10.2",
"postgres_backend",
"postgres_connection",
"regex",
@@ -1441,7 +1442,6 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"spki 0.7.3",
"storage_broker",
"thiserror 1.0.69",
"tokio",
@@ -8469,6 +8469,7 @@ dependencies = [
"once_cell",
"p256 0.13.2",
"parquet",
"pkcs8 0.10.2",
"prettyplease",
"proc-macro2",
"prost 0.13.3",

View File

@@ -143,6 +143,7 @@ parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
pin-project-lite = "0.2"
pkcs8 = "0.10.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
@@ -175,7 +176,6 @@ signal-hook = "0.3"
smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"

View File

@@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644
HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index b667478..1298aa1 100644
index b667478..dc95d89 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -36,7 +36,7 @@ index b667478..1298aa1 100644
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
@@ -48,17 +48,32 @@ index b667478..1298aa1 100644
BuildGraph(buildstate, forkNum);
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+ if (set_lwlsn_block_range_hook)
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ if (set_lwlsn_relation_hook)
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}

View File

@@ -1,5 +1,5 @@
diff --git a/src/ruminsert.c b/src/ruminsert.c
index 255e616..1c6edb7 100644
index 255e616..7a2240f 100644
--- a/src/ruminsert.c
+++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -24,12 +24,24 @@ index 255e616..1c6edb7 100644
/*
* Write index to xlog
*/
@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
UnlockReleaseBuffer(buffer);
}
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(index->rd_smgr);
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+ if (set_lwlsn_block_range_hook)
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ if (set_lwlsn_relation_hook)
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+ smgr_end_unlogged_build(index->rd_smgr);
+ }
+#endif
+
/*

View File

@@ -897,32 +897,28 @@ impl ComputeNode {
let mut client = config.connect(NoTls)?;
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
Lsn(0) => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id
)
} else {
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
}
}
_ => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id, lsn
)
} else {
format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
)
}
}
};
let tenant_id = spec.tenant_id.to_string();
let timeline_id = spec.timeline_id.to_string();
let lsn_str = lsn.to_string();
let mut cmd = Vec::new();
cmd.push("basebackup");
cmd.push(&tenant_id);
cmd.push(&timeline_id);
if lsn != Lsn::INVALID {
cmd.push(&lsn_str);
}
cmd.push("--gzip");
if spec.spec.mode != ComputeMode::Primary {
cmd.push("--replica");
}
if spec
.spec
.features
.contains(&ComputeFeature::LazySlruDownload)
{
cmd.push("--lazy-slru-download")
}
let basebackup_cmd = cmd.join(" ");
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
let mut measured_reader = MeasuredReader::new(copyreader);
let mut bufreader = std::io::BufReader::new(&mut measured_reader);

View File

@@ -1,7 +1,7 @@
use std::collections::HashSet;
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
@@ -64,6 +64,19 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await

View File

@@ -16,6 +16,7 @@ jsonwebtoken.workspace = true
nix.workspace = true
once_cell.workspace = true
pem.workspace = true
pkcs8.workspace = true
humantime-serde.workspace = true
hyper0.workspace = true
regex.workspace = true
@@ -24,7 +25,6 @@ scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
spki.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true

View File

@@ -60,12 +60,11 @@ use jsonwebtoken::jwk::{
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use pem::Pem;
use pkcs8::der::Decode;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -148,12 +147,11 @@ impl ComputeControlPlane {
/// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
/// from the production control plane.
fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
let public_key = spki.subject_public_key.raw_bytes();
fn create_jwks_from_pem(pem: Pem) -> Result<JwkSet> {
let document = pkcs8::Document::from_der(&pem.into_contents())?;
let mut hasher = Sha256::new();
hasher.update(public_key);
hasher.update(&document);
let key_hash = hasher.finalize();
Ok(JwkSet {
@@ -171,7 +169,7 @@ impl ComputeControlPlane {
algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
key_type: OctetKeyPairType::OctetKeyPair,
curve: EllipticCurve::Ed25519,
x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
x: base64::encode_config(&document, base64::URL_SAFE_NO_PAD),
}),
}],
})
@@ -195,7 +193,7 @@ impl ComputeControlPlane {
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
let compute_ctl_config = ComputeCtlConfig {
jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
jwks: Self::create_jwks_from_pem(self.env.read_public_key()?)?,
tls: None::<TlsConfig>,
};
let ep = Arc::new(Endpoint {

View File

@@ -413,11 +413,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
compaction_shard_ancestor: settings
.remove("compaction_shard_ancestor")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_shard_ancestor' as a bool")?,
compaction_l0_first: settings
.remove("compaction_l0_first")
.map(|x| x.parse::<bool>())

View File

@@ -183,6 +183,9 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Download all SLRU files on demand
LazySlruDownload,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.

View File

@@ -76,7 +76,14 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
mfs
}
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"libmetrics_disk_io_bytes_total",
"Bytes written and read from disk, grouped by the operation (read|write)",
&["io_operation"]
)
.expect("Failed to register disk i/o bytes int gauge vec")
});
static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
@@ -254,7 +261,12 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
DISK_IO_BYTES
.with_label_values(&["write"])
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
#[cfg(target_os = "macos")]
@@ -345,7 +357,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
}
impl<P: Atomic> GenericCounterPair<P> {

View File

@@ -379,8 +379,6 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, enable shard ancestor compaction (enabled by default).
pub compaction_shard_ancestor: bool,
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
pub compaction_l0_first: bool,
@@ -679,7 +677,6 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
// This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
// 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
@@ -737,7 +734,6 @@ impl Default for TenantConfigToml {
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
l0_flush_delay_threshold: None,

View File

@@ -526,8 +526,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_shard_ancestor: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_first: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_semaphore: FieldPatch<bool>,
@@ -617,9 +615,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_shard_ancestor: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub compaction_l0_first: Option<bool>,
@@ -729,7 +724,6 @@ impl TenantConfig {
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_shard_ancestor,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
@@ -778,9 +772,6 @@ impl TenantConfig {
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch
.compaction_shard_ancestor
.apply(&mut compaction_shard_ancestor);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
@@ -869,7 +860,6 @@ impl TenantConfig {
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
compaction_shard_ancestor,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
@@ -930,9 +920,6 @@ impl TenantConfig {
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
compaction_shard_ancestor: self
.compaction_shard_ancestor
.unwrap_or(global_conf.compaction_shard_ancestor),
compaction_l0_first: self
.compaction_l0_first
.unwrap_or(global_conf.compaction_l0_first),

View File

@@ -23,6 +23,7 @@ use futures::future::Either;
use futures::stream::Stream;
use futures_util::{StreamExt, TryStreamExt};
use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use utils::backoff;
@@ -31,7 +32,7 @@ use utils::backoff::exponential_backoff_duration_seconds;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use crate::config::AzureConfig;
use crate::error::Cancelled;
use crate::metrics::RequestKind;
use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
use crate::{
ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
@@ -164,7 +165,7 @@ impl AzureBlobStorage {
let mut last_modified = None;
let mut metadata = HashMap::new();
let started_at = start_measuring_requests(kind);
let download = async {
let response = builder
@@ -236,8 +237,19 @@ impl AzureBlobStorage {
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
},
};
let started_at = ScopeGuard::into_inner(started_at);
let outcome = match &download {
Ok(_) => AttemptOutcome::Ok,
// At this level in the stack 404 and 304 responses do not indicate an error.
// There's expected cases when a blob may not exist or hasn't been modified since
// the last get (e.g. probing for timeline indices and heatmap downloads).
// Callers should handle errors if they are unexpected.
Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
download
}
@@ -419,7 +431,7 @@ impl RemoteStorage for AzureBlobStorage {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
let properties_future = blob_client.get_properties().into_future();
@@ -431,9 +443,12 @@ impl RemoteStorage for AzureBlobStorage {
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
if let Ok(_inner) = &res {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
let data = match res {
@@ -461,7 +476,7 @@ impl RemoteStorage for AzureBlobStorage {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -494,7 +509,14 @@ impl RemoteStorage for AzureBlobStorage {
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let outcome = match res {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
res
}
@@ -540,7 +562,7 @@ impl RemoteStorage for AzureBlobStorage {
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let op = async {
// TODO batch requests are not supported by the SDK
@@ -606,8 +628,10 @@ impl RemoteStorage for AzureBlobStorage {
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
@@ -623,7 +647,7 @@ impl RemoteStorage for AzureBlobStorage {
) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let timeout = tokio::time::sleep(self.timeout);
@@ -677,8 +701,10 @@ impl RemoteStorage for AzureBlobStorage {
},
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}

View File

@@ -1,7 +1,9 @@
use metrics::{
Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec,
};
use once_cell::sync::Lazy;
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
#[derive(Clone, Copy, Debug)]
pub(crate) enum RequestKind {
@@ -14,9 +16,62 @@ pub(crate) enum RequestKind {
Head = 6,
}
use RequestKind::*;
use scopeguard::ScopeGuard;
impl RequestKind {
const fn as_str(&self) -> &'static str {
match self {
Get => "get_object",
Put => "put_object",
Delete => "delete_object",
List => "list_objects",
Copy => "copy_object",
TimeTravel => "time_travel_recover",
Head => "head_object",
}
}
const fn as_index(&self) -> usize {
*self as usize
}
}
const REQUEST_KIND_COUNT: usize = 7;
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)
});
if let Some(next) = it.next() {
panic!("unexpected {next:?}");
}
RequestTyped(arr)
}
}
impl RequestTyped<Histogram> {
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(crate) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) enum AttemptOutcome {
@@ -34,22 +89,138 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
}
}
impl AttemptOutcome {
pub(crate) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
AttemptOutcome::Cancelled => "cancelled",
}
}
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
AttemptOutcome::Cancelled => &self.cancelled,
};
target.get(kind)
}
fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
PassFailCancelledRequestTyped {
success,
fail,
cancelled,
}
}
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(crate) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
started_at: std::time::Instant,
) {
self.get(kind, outcome.into())
.observe(started_at.elapsed().as_secs_f64())
}
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
_kind: RequestKind,
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
pub(crate) fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_started_at| {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
crate::metrics::BUCKET_METRICS
.cancelled_waits
.get(kind)
.inc()
})
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
pub(crate) struct BucketMetrics {
/// Full request duration until successful completion, error or cancellation.
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
/// Total amount of seconds waited on queue.
pub(crate) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
/// Total amount of deleted objects in batches or single requests.
pub(crate) deleted_objects_total: IntCounter,
}
impl Default for BucketMetrics {
fn default() -> Self {
// first bucket 100 microseconds to count requests that do not need to wait at all
// and get a permit immediately
let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
let req_seconds = register_histogram_vec!(
"remote_storage_s3_request_seconds",
"Seconds to complete a request",
&["request_type", "result"],
buckets.to_vec(),
)
.unwrap();
let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
});
let wait_seconds = register_histogram_vec!(
"remote_storage_s3_wait_seconds",
"Seconds rate limited",
&["request_type"],
buckets.to_vec(),
)
.unwrap();
let wait_seconds =
RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
let cancelled_waits = register_int_counter_vec!(
"remote_storage_s3_cancelled_waits_total",
"Times a semaphore wait has been cancelled per request type",
&["request_type"],
)
.unwrap();
let cancelled_waits =
RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
let deleted_objects_total = register_int_counter!(
"remote_storage_s3_deleted_objects_total",
"Amount of deleted objects in total",
)
.unwrap();
Self {
req_seconds,
wait_seconds,
cancelled_waits,
deleted_objects_total,
}
}
}

View File

@@ -41,7 +41,7 @@ use super::StorageMetadata;
use crate::config::S3Config;
use crate::error::Cancelled;
pub(super) use crate::metrics::RequestKind;
use crate::metrics::{AttemptOutcome, start_measuring_requests};
use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests};
use crate::support::PermitCarrying;
use crate::{
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
@@ -199,7 +199,7 @@ impl S3Bucket {
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
let started_at = start_counting_cancelled_wait(kind);
let acquire = self.concurrency_limiter.acquire(kind);
let permit = tokio::select! {
@@ -207,8 +207,10 @@ impl S3Bucket {
_ = cancel.cancelled() => return Err(Cancelled),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
}
@@ -218,7 +220,7 @@ impl S3Bucket {
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
let started_at = start_counting_cancelled_wait(kind);
let acquire = self.concurrency_limiter.acquire_owned(kind);
let permit = tokio::select! {
@@ -226,8 +228,10 @@ impl S3Bucket {
_ = cancel.cancelled() => return Err(Cancelled),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
}
@@ -269,7 +273,11 @@ impl S3Bucket {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::NotFound);
}
Err(SdkError::ServiceError(e))
@@ -279,11 +287,19 @@ impl S3Bucket {
if e.raw().status().as_u16() == StatusCode::NotModified =>
{
// Count an unmodified file as a success.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::Unmodified);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
return Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
@@ -330,11 +346,11 @@ impl S3Bucket {
delete_objects: &[ObjectIdentifier],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let mut cancel = std::pin::pin!(cancel.cancelled());
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
let started_at = start_measuring_requests(kind);
let req = self
.client
@@ -354,10 +370,15 @@ impl S3Bucket {
_ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
crate::metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
@@ -424,8 +445,8 @@ pin_project_lite::pin_project! {
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut _this: Pin<&mut Self>) {
fn drop(mut this: Pin<&mut Self>) {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
@@ -490,7 +511,7 @@ impl RemoteStorage for S3Bucket {
let mut continuation_token = None;
'outer: loop {
let started_at = start_measuring_requests(kind);
// min of two Options, returning Some if one is value and another is
// None (None is smaller than anything, so plain min doesn't work).
@@ -523,9 +544,11 @@ impl RemoteStorage for S3Bucket {
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = match response {
Ok(response) => response,
@@ -606,7 +629,7 @@ impl RemoteStorage for S3Bucket {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let head_future = self
.client
@@ -625,18 +648,30 @@ impl RemoteStorage for S3Bucket {
let res = res.map_err(|_e| DownloadError::Timeout)?;
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
let data = match res {
Ok(object_output) => object_output,
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
return Err(DownloadError::Other(
anyhow::Error::new(e).context("s3 head object"),
@@ -669,7 +704,7 @@ impl RemoteStorage for S3Bucket {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
@@ -692,10 +727,12 @@ impl RemoteStorage for S3Bucket {
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
if let Ok(_inner) = &res {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
match res {
@@ -716,7 +753,7 @@ impl RemoteStorage for S3Bucket {
let timeout = tokio::time::sleep(self.timeout);
let started_at = start_measuring_requests(kind);
// we need to specify bucket_name as a prefix
let copy_source = format!(
@@ -740,8 +777,10 @@ impl RemoteStorage for S3Bucket {
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;

View File

@@ -1,6 +1,7 @@
use std::fmt::Display;
use std::time::{Duration, Instant};
use metrics::IntCounter;
/// Circuit breakers are for operations that are expensive and fallible.
///
@@ -53,7 +54,7 @@ impl CircuitBreaker {
}
}
pub fn fail<E>(&mut self, error: E)
pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
@@ -63,18 +64,18 @@ impl CircuitBreaker {
self.fail_count += 1;
if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
self.break_circuit( error);
self.break_circuit(metric, error);
}
}
/// Call this after successfully executing an operation
pub fn success(&mut self) {
pub fn success(&mut self, metric: &IntCounter) {
self.fail_count = 0;
if let Some(broken_at) = &self.broken_at {
tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
humantime::format_duration(broken_at.elapsed()));
self.broken_at = None;
metric.inc();
}
}
@@ -97,13 +98,13 @@ impl CircuitBreaker {
}
}
fn break_circuit<E>(&mut self, error: E)
fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
self.broken_at = Some(Instant::now());
tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
metric.inc();
}
fn reset_circuit(&mut self) {

View File

@@ -73,6 +73,7 @@ impl From<GetVectoredError> for BasebackupError {
/// * When working without safekeepers. In this situation it is important to match the lsn
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
/// to start the replication.
#[allow(clippy::too_many_arguments)]
pub async fn send_basebackup_tarball<'a, W>(
write: &'a mut W,
timeline: &'a Timeline,
@@ -80,6 +81,7 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
replica: bool,
lazy_slru_download_enabled: bool,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
where
@@ -131,8 +133,8 @@ where
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
backup_lsn, prev_lsn, full_backup, replica
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={}, lazy_slru_download_enabled={})",
backup_lsn, prev_lsn, full_backup, replica, lazy_slru_download_enabled
);
let basebackup = Basebackup {
@@ -142,6 +144,7 @@ where
prev_record_lsn: prev_lsn,
full_backup,
replica,
lazy_slru_download_enabled,
ctx,
io_concurrency: IoConcurrency::spawn_from_conf(
timeline.conf,
@@ -170,6 +173,7 @@ where
prev_record_lsn: Lsn,
full_backup: bool,
replica: bool,
lazy_slru_download_enabled: bool,
ctx: &'a RequestContext,
io_concurrency: IoConcurrency,
}
@@ -308,7 +312,10 @@ where
self.timeline.pg_version,
)?;
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
let lazy_slru_download = self
.timeline
.get_lazy_slru_download(self.lazy_slru_download_enabled)
&& !self.full_backup;
let pgversion = self.timeline.pg_version;
let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);

View File

@@ -20,6 +20,7 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
use pageserver::deletion_queue::DeletionQueue;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
};
@@ -320,9 +321,10 @@ where
}
}
fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
let elapsed = started_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
@@ -353,7 +355,10 @@ fn start_pageserver(
set_launch_timestamp_metric(launch_ts);
#[cfg(target_os = "linux")]
metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
metrics::register_internal(Box::new(
pageserver::metrics::tokio_epoll_uring::Collector::new(),
))
.unwrap();
pageserver::preinitialize_metrics(conf, ignored);
// If any failpoints were set from FAILPOINTS environment variable,
@@ -497,6 +502,7 @@ fn start_pageserver(
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);
// Startup staging or optimizing:
//
@@ -572,6 +578,7 @@ fn start_pageserver(
"initial_tenant_load",
"Initial load completed",
);
STARTUP_IS_LOADING.set(0);
});
let WaitForPhaseResult {

View File

@@ -261,7 +261,7 @@ where
let mut tenants = std::pin::pin!(tenants);
while let Some((tenant_id, tenant)) = tenants.next().await {
let tenant_resident_size = 0;
let mut tenant_resident_size = 0;
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;
@@ -286,6 +286,7 @@ where
}
}
tenant_resident_size += timeline.resident_physical_size();
}
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);

View File

@@ -91,12 +91,12 @@
use std::{sync::Arc, time::Duration};
use once_cell::sync::Lazy;
use tracing::warn;
use utils::{id::TimelineId, shard::TenantShardId};
use crate::{
metrics::TimelineMetrics,
metrics::{StorageIoSizeMetrics, TimelineMetrics},
task_mgr::TaskKind,
tenant::Timeline,
};
@@ -122,35 +122,38 @@ pub struct RequestContext {
#[derive(Clone)]
pub(crate) enum Scope {
Global {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTenant {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTimeline {
io_size_metrics: crate::metrics::StorageIoSizeMetrics,
},
Timeline {
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
// context creation contending for the ref counters of the Arc<TimelineMetrics>,
// which are shared among all tasks that operate on the timeline, especially
// concurrent page_service connections.
#[allow(clippy::redundant_allocation)]
#[allow(dead_code)]
arc_arc: Arc<Arc<TimelineMetrics>>, },
arc_arc: Arc<Arc<TimelineMetrics>>,
},
#[cfg(test)]
UnitTest {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
DebugTools {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
}
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
impl Scope {
pub(crate) fn new_global() -> Self {
Scope::Global {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
/// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -170,13 +173,18 @@ impl Scope {
}
}
pub(crate) fn new_secondary_timeline(
_tenant_shard_id: &TenantShardId,
_timeline_id: &TimelineId,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Self {
// TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let timeline_id = timeline_id.to_string();
Scope::SecondaryTimeline { }
let io_size_metrics =
crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
Scope::SecondaryTimeline { io_size_metrics }
}
pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
// Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -189,19 +197,19 @@ impl Scope {
// like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
// at this point, so, we were able to completely side-step tenant-scoped stuff there).
Scope::SecondaryTenant {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
#[cfg(test)]
pub(crate) fn new_unit_test() -> Self {
Scope::UnitTest {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
pub(crate) fn new_debug_tools() -> Self {
Scope::DebugTools {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
}
@@ -515,18 +523,58 @@ impl RequestContext {
self.access_stats_behavior
}
pub(crate) fn page_content_kind(&self) -> PageContentKind {
self.page_content_kind
}
pub(crate) fn read_path_debug(&self) -> bool {
self.read_path_debug
}
pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
match &self.scope {
Scope::Global { io_size_metrics } => {
let is_unit_test = cfg!(test);
let is_regress_test_build = cfg!(feature = "testing");
if is_unit_test || is_regress_test_build {
panic!("all VirtualFile instances are timeline-scoped");
} else {
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::Duration;
use utils::rate_limit::RateLimit;
static LIMIT: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
let mut guard = LIMIT.lock().unwrap();
guard.call2(|rate_limit_stats| {
warn!(
%rate_limit_stats,
backtrace=%std::backtrace::Backtrace::force_capture(),
"all VirtualFile instances are timeline-scoped",
);
});
io_size_metrics
}
}
Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
#[cfg(test)]
Scope::UnitTest { io_size_metrics } => io_size_metrics,
Scope::DebugTools { io_size_metrics } => io_size_metrics,
}
}
pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
if duration == Duration::ZERO {
return;
}
match &self.scope {
Scope::Timeline { arc_arc: _ } => {},
Scope::Timeline { arc_arc } => arc_arc
.wait_ondemand_download_time
.observe(self.task_kind, duration),
_ => {
use once_cell::sync::Lazy;
use std::sync::Mutex;

View File

@@ -27,6 +27,7 @@ use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
use self::validator::Validator;
use crate::config::PageServerConf;
use crate::controller_upcall_client::StorageControllerUpcallApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
use crate::tenant::storage_layer::LayerName;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -162,6 +163,11 @@ struct TenantDeletionList {
generation: Generation,
}
impl TenantDeletionList {
pub(crate) fn len(&self) -> usize {
self.timelines.values().map(|v| v.len()).sum()
}
}
/// Files ending with this suffix will be ignored and erased
/// during recovery as startup.
@@ -461,6 +467,9 @@ impl DeletionQueueClient {
// they may be historical.
assert!(!current_generation.is_none());
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(layers.len() as u64);
self.do_push(
&self.tx,
ListWriterQueueMessage::Delete(DeletionOp {
@@ -544,6 +553,9 @@ impl DeletionQueueClient {
&self,
objects: Vec<RemotePath>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(objects.len() as u64);
self.executor_tx
.send(DeleterMessage::Delete(objects))
.await

View File

@@ -14,6 +14,7 @@ use tracing::{info, warn};
use utils::{backoff, pausable_failpoint};
use super::{DeletionQueueError, FlushOp};
use crate::metrics;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
@@ -59,6 +60,10 @@ impl Deleter {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
});
@@ -85,6 +90,9 @@ impl Deleter {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
metrics::DELETION_QUEUE
.keys_executed
.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
@@ -101,6 +109,10 @@ impl Deleter {
return Err(DeletionQueueError::ShuttingDown);
}
warn!("DeleteObjects request failed: {e:#}, will continue trying");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["execute"])
.inc();
}
};
}

View File

@@ -25,6 +25,7 @@ use utils::id::TimelineId;
use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage};
use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path};
use crate::tenant::storage_layer::LayerName;
use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error};
@@ -151,7 +152,7 @@ impl ListWriter {
}
}
Err(e) => {
metrics::DELETION_QUEUE.unexpected_errors.inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list, will retry later ({e:#})"
@@ -179,6 +180,7 @@ impl ListWriter {
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctnes because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
metrics::DELETION_QUEUE.unexpected_errors.inc();
Ok(None)
}
}
@@ -247,6 +249,7 @@ impl ListWriter {
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
};
@@ -254,6 +257,7 @@ impl ListWriter {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
@@ -282,6 +286,7 @@ impl ListWriter {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
@@ -324,6 +329,9 @@ impl ListWriter {
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(deletion_list.len() as u64);
self.tx
.send(ValidatorQueueMessage::Delete(deletion_list))
.await?;
@@ -345,6 +353,7 @@ impl ListWriter {
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix(),
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
}
@@ -413,6 +422,7 @@ impl ListWriter {
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
@@ -440,6 +450,7 @@ impl ListWriter {
tracing::error!(
"Deletion queue recovery called more than once. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
continue;
}
@@ -451,6 +462,7 @@ impl ListWriter {
info!(
"Deletion queue recover aborted, deletion queue will not proceed ({e})"
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
} else {
self.recovered = true;

View File

@@ -26,6 +26,7 @@ use super::deleter::DeleterMessage;
use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates};
use crate::config::PageServerConf;
use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi};
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
// After this length of time, do any validation work that is pending,
@@ -185,6 +186,7 @@ where
"Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}",
tenant_lsn_state.generation
);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
@@ -219,8 +221,11 @@ where
if !this_list_valid {
info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
}
} else {
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
}
this_list_valid
});
list.validated = true;
@@ -232,7 +237,7 @@ where
// Highly unexpected. Could happen if e.g. disk full.
// If we didn't save the trimmed list, it is _not_ valid to execute.
warn!("Failed to save modified deletion list {list}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Rather than have a complex retry process, just drop it and leak the objects,
// scrubber will clean up eventually.
@@ -271,7 +276,7 @@ where
// The save() function logs a warning on error.
if let Err(e) = header.save(self.conf).await {
warn!("Failed to write deletion queue header: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}

View File

@@ -56,6 +56,7 @@ use utils::completion;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::metrics::disk_usage_based_eviction::METRICS;
use crate::task_mgr::{self, BACKGROUND_RUNTIME};
use crate::tenant::mgr::TenantManager;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
@@ -387,7 +388,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
}
};
METRICS.layers_collected.inc_by(candidates.len() as u64);
tracing::info!(
elapsed_ms = collection_time.as_millis(),
@@ -427,7 +428,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
METRICS.layers_selected.inc_by(evicted_amount as u64);
// phase2: evict layers
@@ -456,6 +457,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((
@@ -786,6 +788,7 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant_manager
@@ -819,7 +822,7 @@ async fn collect_eviction_candidates(
continue;
}
let started_at = std::time::Instant::now();
// collect layers from all timelines in this tenant
//
@@ -914,11 +917,25 @@ async fn collect_eviction_candidates(
(partition, candidate)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -945,7 +962,7 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);
let started_at = std::time::Instant::now();
layer_info
.resident_layers
@@ -967,13 +984,28 @@ async fn collect_eviction_candidates(
candidate,
)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
tokio::task::yield_now().await;
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
debug_assert!(

View File

@@ -24,6 +24,7 @@ use wal_decoder::models::InterpretedWalRecord;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::{WalIngest, WalIngestErrorKind};
@@ -323,6 +324,7 @@ async fn import_wal(
walingest
.ingest_record(interpreted, &mut modification, ctx)
.await?;
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
last_lsn = lsn;

File diff suppressed because it is too large Load Diff

View File

@@ -77,6 +77,7 @@ use anyhow::Context;
use once_cell::sync::OnceCell;
use crate::context::RequestContext;
use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics};
use crate::virtual_file::{IoBufferMut, IoPageSlice};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -194,7 +195,7 @@ impl SlotInner {
}
pub struct PageCache {
immutable_page_maps: [std::sync::RwLock<HashMap<(FileId, u32), usize>>; 16],
immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
@@ -204,103 +205,8 @@ pub struct PageCache {
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
}
impl PageCache {
/// Helper function to determine the shard index based on the low 4 bits of the u32 in the key tuple.
fn shard_index(_file_id: &FileId, blkno: u32) -> usize {
(blkno & 0xF) as usize
}
/// Search for a page in the cache using the given search key.
///
/// Returns the slot index, if any.
///
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
/// get recycled for an unrelated page immediately after this function
/// returns. The caller is responsible for re-checking that the slot still
/// contains the page with the same key before using it.
///
fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
match cache_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let shard_idx = Self::shard_index(file_id, *blkno);
let map = self.immutable_page_maps[shard_idx].read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
///
/// Remove mapping for given key.
///
fn remove_mapping(&self, old_key: &CacheKey) {
match old_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let shard_idx = Self::shard_index(file_id, *blkno);
let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
}
}
}
///
/// Insert mapping for given key.
///
/// If a mapping already existed for the given key, returns the slot index
/// of the existing mapping and leaves it untouched.
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
match new_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let shard_idx = Self::shard_index(file_id, *blkno);
let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
None
}
}
}
}
}
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
// We could use Vec::leak here, but that potentially also leaks
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
// this is avoided.
let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
// SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
Slot {
inner: tokio::sync::RwLock::new(SlotInner {
key: None,
buf,
permit: std::sync::Mutex::new(Weak::new()),
}),
usage_count: AtomicU8::new(0),
}
})
.collect();
Self {
immutable_page_maps: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit {
@@ -508,17 +414,32 @@ impl PageCache {
async fn lock_for_read(
&self,
cache_key: &CacheKey,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
let (read_access, hit) = match cache_key {
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
),
};
read_access.inc();
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
debug_assert!(permit.is_some());
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
@@ -563,6 +484,63 @@ impl PageCache {
}
}
//
// Section 3: Mapping functions
//
/// Search for a page in the cache using the given search key.
///
/// Returns the slot index, if any.
///
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
/// get recycled for an unrelated page immediately after this function
/// returns. The caller is responsible for re-checking that the slot still
/// contains the page with the same key before using it.
///
fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
match cache_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
///
/// Remove mapping for given key.
///
fn remove_mapping(&self, old_key: &CacheKey) {
match old_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
}
}
}
///
/// Insert mapping for given key.
///
/// If a mapping already existed for the given key, returns the slot index
/// of the existing mapping and leaves it untouched.
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
match new_key {
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_immutable.add_page_sz(1);
None
}
}
}
}
}
//
// Section 4: Misc internal helpers
//
@@ -617,7 +595,11 @@ impl PageCache {
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::ItersExceeded {
iters: iters.try_into().unwrap(),
},
);
anyhow::bail!("exceeded evict iter limit");
}
continue;
@@ -627,12 +609,84 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
}
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
iters: iters.try_into().unwrap(),
},
);
} else {
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
iters: iters.try_into().unwrap(),
},
);
}
return Ok((slot_idx, inner));
}
}
}
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
// We could use Vec::leak here, but that potentially also leaks
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
// this is avoided.
let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_immutable.set_page_sz(0);
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
// SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
Slot {
inner: tokio::sync::RwLock::new(SlotInner {
key: None,
buf,
permit: std::sync::Mutex::new(Weak::new()),
}),
usage_count: AtomicU8::new(0),
}
})
.collect();
Self {
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
}
trait PageSzBytesMetric {
fn set_page_sz(&self, count: usize);
fn add_page_sz(&self, count: usize);
fn sub_page_sz(&self, count: usize);
}
#[inline(always)]
fn count_times_page_sz(count: usize) -> u64 {
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
}
impl PageSzBytesMetric for metrics::UIntGauge {
fn set_page_sz(&self, count: usize) {
self.set(count_times_page_sz(count));
}
fn add_page_sz(&self, count: usize) {
self.add(count_times_page_sz(count));
}
fn sub_page_sz(&self, count: usize) {
self.sub(count_times_page_sz(count));
}
}

View File

@@ -59,7 +59,8 @@ use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::metrics::{
self, GetPageBatchBreakReason, SmgrOpTimer, TimelineMetrics,
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
};
use crate::pgdatadir_mapping::Version;
use crate::span::{
@@ -274,6 +275,9 @@ async fn page_service_conn_main(
cancel: CancellationToken,
gate_guard: GateGuard,
) -> ConnectionHandlerResult {
let _guard = LIVE_CONNECTIONS
.with_label_values(&["page_service"])
.guard();
socket
.set_nodelay(true)
@@ -637,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -644,6 +649,7 @@ struct BatchedGetPageRequest {
#[cfg(feature = "testing")]
struct BatchedTestRequest {
req: models::PagestreamTestRequest,
timer: SmgrOpTimer,
}
/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
@@ -653,13 +659,13 @@ struct BatchedTestRequest {
enum BatchedFeMessage {
Exists {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
req: models::PagestreamExistsRequest,
},
Nblocks {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
req: models::PagestreamNblocksRequest,
},
@@ -671,13 +677,13 @@ enum BatchedFeMessage {
},
DbSize {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
req: models::PagestreamDbSizeRequest,
},
GetSlruSegment {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
req: models::PagestreamGetSlruSegmentRequest,
},
@@ -698,7 +704,27 @@ impl BatchedFeMessage {
self.into()
}
fn observe_execution_start(&mut self, _at: Instant) {
fn observe_execution_start(&mut self, at: Instant) {
match self {
BatchedFeMessage::Exists { timer, .. }
| BatchedFeMessage::Nblocks { timer, .. }
| BatchedFeMessage::DbSize { timer, .. }
| BatchedFeMessage::GetSlruSegment { timer, .. } => {
timer.observe_execution_start(at);
}
BatchedFeMessage::GetPage { pages, .. } => {
for page in pages {
page.timer.observe_execution_start(at);
}
}
#[cfg(feature = "testing")]
BatchedFeMessage::Test { requests, .. } => {
for req in requests {
req.timer.observe_execution_start(at);
}
}
BatchedFeMessage::RespondError { .. } => {}
}
}
fn should_break_batch(
@@ -938,7 +964,7 @@ impl PageServerHandler {
.await?;
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetRelExists,
received_at,
@@ -946,7 +972,7 @@ impl PageServerHandler {
.await?;
BatchedFeMessage::Exists {
span,
timer,
shard: shard.downgrade(),
req,
}
@@ -956,7 +982,7 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetRelSize,
received_at,
@@ -964,7 +990,7 @@ impl PageServerHandler {
.await?;
BatchedFeMessage::Nblocks {
span,
timer,
shard: shard.downgrade(),
req,
}
@@ -974,7 +1000,7 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetDbSize,
received_at,
@@ -982,7 +1008,7 @@ impl PageServerHandler {
.await?;
BatchedFeMessage::DbSize {
span,
timer,
shard: shard.downgrade(),
req,
}
@@ -992,7 +1018,7 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetSlruSegment,
received_at,
@@ -1000,7 +1026,7 @@ impl PageServerHandler {
.await?;
BatchedFeMessage::GetSlruSegment {
span,
timer,
shard: shard.downgrade(),
req,
}
@@ -1099,7 +1125,7 @@ impl PageServerHandler {
// request handler log messages contain the request-specific fields.
let span = mkspan!(shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetPageAtLsn,
received_at,
@@ -1132,6 +1158,7 @@ impl PageServerHandler {
shard: shard.downgrade(),
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
effective_request_lsn,
ctx,
}],
@@ -1147,12 +1174,13 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
let timer =
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
.await?;
BatchedFeMessage::Test {
span,
shard: shard.downgrade(),
requests: vec![BatchedTestRequest { req, }],
requests: vec![BatchedTestRequest { req, timer }],
}
}
};
@@ -1253,7 +1281,7 @@ impl PageServerHandler {
// Dispatch the batch to the appropriate request handler.
let log_slow_name = batch.as_static_str();
let (handler_results, span) = {
let (mut handler_results, span) = {
// TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
// won't fit on the stack.
let mut boxpinned =
@@ -1283,31 +1311,31 @@ impl PageServerHandler {
// call, which (all unmeasured) adds syscall overhead but reduces time to first byte
// and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
// TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
// let flush_timers = {
// let flushing_start_time = Instant::now();
// let mut flush_timers = Vec::with_capacity(handler_results.len());
// for handler_result in &mut handler_results {
// let flush_timer = match handler_result {
// Ok((_, timer)) => Some(
// timer
// .observe_execution_end(flushing_start_time)
// .expect("we are the first caller"),
// ),
// Err(_) => {
// // TODO: measure errors
// None
// }
// };
// flush_timers.push(flush_timer);
// }
// assert_eq!(flush_timers.len(), handler_results.len());
// flush_timers
// };
let flush_timers = {
let flushing_start_time = Instant::now();
let mut flush_timers = Vec::with_capacity(handler_results.len());
for handler_result in &mut handler_results {
let flush_timer = match handler_result {
Ok((_, timer)) => Some(
timer
.observe_execution_end(flushing_start_time)
.expect("we are the first caller"),
),
Err(_) => {
// TODO: measure errors
None
}
};
flush_timers.push(flush_timer);
}
assert_eq!(flush_timers.len(), handler_results.len());
flush_timers
};
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
for handler_result in handler_results.into_iter() {
for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
let response_msg = match handler_result {
Err(e) => match &e.err {
PageStreamError::Shutdown => {
@@ -1339,7 +1367,7 @@ impl PageServerHandler {
})
}
},
Ok((response_msg, )) => response_msg,
Ok((response_msg, _op_timer_already_observed)) => response_msg,
};
//
@@ -1353,17 +1381,17 @@ impl PageServerHandler {
failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);
// what we want to do
let socket_fd = pgb_writer.socket_fd;
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
// let flush_fut = match flushing_timer {
// Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
// Instant::now(),
// flush_fut,
// socket_fd,
// )),
// None => futures::future::Either::Right(flush_fut),
// };
let flush_fut = match flushing_timer {
Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
Instant::now(),
flush_fut,
socket_fd,
)),
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
let _: () = async move {
tokio::select! {
@@ -1393,7 +1421,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<
(
Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>>,
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
Span,
),
QueryError,
@@ -1409,7 +1437,7 @@ impl PageServerHandler {
Ok(match batch {
BatchedFeMessage::Exists {
span,
timer,
shard,
req,
} => {
@@ -1420,7 +1448,7 @@ impl PageServerHandler {
self.handle_get_rel_exists_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, ))
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
],
span,
@@ -1428,7 +1456,7 @@ impl PageServerHandler {
}
BatchedFeMessage::Nblocks {
span,
timer,
shard,
req,
} => {
@@ -1439,7 +1467,7 @@ impl PageServerHandler {
self.handle_get_nblocks_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, ))
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
],
span,
@@ -1475,6 +1503,7 @@ impl PageServerHandler {
}
BatchedFeMessage::DbSize {
span,
timer,
shard,
req,
} => {
@@ -1485,7 +1514,7 @@ impl PageServerHandler {
self.handle_db_size_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, ))
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
],
span,
@@ -1493,6 +1522,7 @@ impl PageServerHandler {
}
BatchedFeMessage::GetSlruSegment {
span,
timer,
shard,
req,
} => {
@@ -1503,7 +1533,7 @@ impl PageServerHandler {
self.handle_get_slru_segment_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, ))
.map(|msg| (msg, timer))
.map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
],
span,
@@ -2149,11 +2179,15 @@ impl PageServerHandler {
timeline: &Timeline,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
io_concurrency: IoConcurrency,
_batch_break_reason: GetPageBatchBreakReason,
batch_break_reason: GetPageBatchBreakReason,
ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>> {
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
debug_assert_current_span_has_tenant_and_timeline_id();
timeline
.query_metrics
.observe_getpage_batch_start(requests.len(), batch_break_reason);
// If a page trace is running, submit an event for this request.
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
let time = SystemTime::now();
@@ -2253,7 +2287,7 @@ impl PageServerHandler {
req: req.req,
page,
}),
req.timer,
)
})
.map_err(|e| BatchedPageStreamError {
@@ -2298,7 +2332,7 @@ impl PageServerHandler {
timeline: &Timeline,
requests: Vec<BatchedTestRequest>,
_ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage,), BatchedPageStreamError>> {
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
// real requests would do something with the timeline
let mut results = Vec::with_capacity(requests.len());
for _req in requests.iter() {
@@ -2324,6 +2358,7 @@ impl PageServerHandler {
PagestreamBeMessage::Test(models::PagestreamTestResponse {
req: req.req.clone(),
}),
req.timer,
)
})
.map_err(|e| BatchedPageStreamError {
@@ -2359,6 +2394,7 @@ impl PageServerHandler {
full_backup: bool,
gzip: bool,
replica: bool,
lazy_slru_download: bool,
ctx: &RequestContext,
) -> Result<(), QueryError>
where
@@ -2426,6 +2462,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
lazy_slru_download,
&ctx,
)
.await
@@ -2449,6 +2486,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
lazy_slru_download,
&ctx,
)
.await
@@ -2466,6 +2504,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
lazy_slru_download,
&ctx,
)
.await
@@ -2515,7 +2554,7 @@ impl PageServerHandler {
}
}
/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
/// `basebackup tenant timeline [lsn] [--gzip] [--replica] [--lazy-slru-download]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct BaseBackupCmd {
tenant_id: TenantId,
@@ -2523,6 +2562,7 @@ struct BaseBackupCmd {
lsn: Option<Lsn>,
gzip: bool,
replica: bool,
lazy_slru_download: bool,
}
/// `fullbackup tenant timeline [lsn] [prev_lsn]`
@@ -2655,6 +2695,7 @@ impl BaseBackupCmd {
let mut gzip = false;
let mut replica = false;
let mut lazy_slru_download = false;
for &param in &parameters[flags_parse_from..] {
match param {
@@ -2670,6 +2711,12 @@ impl BaseBackupCmd {
}
replica = true
}
"--lazy-slru-download" => {
if lazy_slru_download {
bail!("duplicate parameter for basebackup command: {param}")
}
lazy_slru_download = true
}
_ => bail!("invalid parameter for basebackup command: {param}"),
}
}
@@ -2679,6 +2726,7 @@ impl BaseBackupCmd {
lsn,
gzip,
replica,
lazy_slru_download,
})
}
}
@@ -2878,7 +2926,12 @@ where
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
let command_kind = match protocol_version {
PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
};
COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
.await?;
}
@@ -2888,6 +2941,7 @@ where
lsn,
gzip,
replica,
lazy_slru_download,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
@@ -2895,7 +2949,10 @@ where
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
let res = async {
self.handle_basebackup_request(
pgb,
@@ -2906,6 +2963,7 @@ where
false,
gzip,
replica,
lazy_slru_download,
&ctx,
)
.await?;
@@ -2913,7 +2971,7 @@ where
Result::<(), QueryError>::Ok(())
}
.await;
metric_recording.observe(&res);
res?;
}
// same as basebackup, but result includes relational data as well
@@ -2929,7 +2987,9 @@ where
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();
// Check that the timeline exists
self.handle_basebackup_request(
@@ -2941,6 +3001,7 @@ where
true,
false,
false,
false,
&ctx,
)
.await?;
@@ -2963,7 +3024,9 @@ where
self.check_permission(Some(tenant_shard_id.tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
@@ -3073,7 +3136,8 @@ mod tests {
timeline_id,
lsn: None,
gzip: false,
replica: false
replica: false,
lazy_slru_download: false
})
);
let cmd =
@@ -3085,7 +3149,8 @@ mod tests {
timeline_id,
lsn: None,
gzip: true,
replica: false
replica: false,
lazy_slru_download: false
})
);
let cmd =
@@ -3097,7 +3162,8 @@ mod tests {
timeline_id,
lsn: None,
gzip: false,
replica: false
replica: false,
lazy_slru_download: false
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
@@ -3109,7 +3175,8 @@ mod tests {
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: false,
replica: false
replica: false,
lazy_slru_download: false
})
);
let cmd = PageServiceCmd::parse(&format!(
@@ -3123,7 +3190,23 @@ mod tests {
timeline_id,
lsn: None,
gzip: true,
replica: true
replica: true,
lazy_slru_download: false
})
);
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --replica --gzip --lazy-slru-download"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: true,
replica: true,
lazy_slru_download: true
})
);
let cmd = PageServiceCmd::parse(&format!(
@@ -3137,7 +3220,8 @@ mod tests {
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: true,
replica: true
replica: true,
lazy_slru_download: false
})
);
let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();

View File

@@ -36,13 +36,15 @@ use tracing::{debug, info, info_span, trace, warn};
use utils::bin_ser::{BeSer, DeserializeError};
use utils::lsn::Lsn;
use utils::pausable_failpoint;
use wal_decoder::serialized_batch::SerializedValueBatch ;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
};
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -1030,16 +1032,19 @@ impl Timeline {
)
.await?;
let mut result = HashMap::new();
let mut sz = 0;
for (_, v) in kv {
let v = v?;
let v = aux_file::decode_file_value_bytes(&v)
.context("value decode")
.map_err(PageReconstructError::Other)?;
for (fname, content) in v {
sz += fname.len();
sz += content.len();
result.insert(fname, content);
}
}
self.aux_file_size_estimator.on_initial(sz);
Ok(result)
}
@@ -1310,12 +1315,12 @@ impl Timeline {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
}
RELSIZE_CACHE_MISSES.inc();
None
}
@@ -1340,21 +1345,25 @@ impl Timeline {
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_CACHE_ENTRIES.inc();
}
}
}
/// Store cached relation size
pub fn set_cached_rel_size(&self, _tag: RelTag, _lsn: Lsn, _nblocks: BlockNumber) {
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_CACHE_ENTRIES.inc();
}
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, _tag: &RelTag) {
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.remove(tag).is_some() {
RELSIZE_CACHE_ENTRIES.dec();
}
}
}
@@ -1429,7 +1438,25 @@ impl DatadirModification<'_> {
.is_some_and(|b| b.has_data())
}
/// Returns statistics about the currently pending modifications.
pub(crate) fn stats(&self) -> DatadirModificationStats {
let mut stats = DatadirModificationStats::default();
for (_, _, value) in self.pending_metadata_pages.values().flatten() {
match value {
Value::Image(_) => stats.metadata_images += 1,
Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
Value::WalRecord(_) => stats.metadata_deltas += 1,
}
}
for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
match valuemeta {
ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
ValueMeta::Serialized(_) => stats.data_deltas += 1,
ValueMeta::Observed(_) => {}
}
}
stats
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
@@ -2304,15 +2331,20 @@ impl DatadirModification<'_> {
}
let mut new_files = other_files;
match (modifying_file, content.is_empty()) {
(Some(_old_content), false) => {
(Some(old_content), false) => {
self.tline
.aux_file_size_estimator
.on_update(old_content.len(), content.len());
new_files.push((path, content));
}
(Some(_old_content), true) => {
(Some(old_content), true) => {
self.tline
.aux_file_size_estimator
.on_remove(old_content.len());
// not adding the file key to the final `new_files` vec.
}
(None, false) => {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
// Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.

View File

@@ -83,6 +83,11 @@ use crate::context::RequestContextBuilder;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::{
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
};
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
use crate::tenant::gc_result::GcResult;
@@ -1353,7 +1358,7 @@ impl Tenant {
let starting_up = init_order.is_some();
scopeguard::defer! {
if starting_up {
TENANT.startup_complete.inc();
}
}
@@ -1456,7 +1461,7 @@ impl Tenant {
let preload = match &mode {
SpawnMode::Eager | SpawnMode::Lazy => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(&remote_storage, task_mgr::shutdown_token())
.await;
@@ -1478,7 +1483,7 @@ impl Tenant {
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
let attach_start = std::time::Instant::now();
let attached = {
let _attach_timer = Some(TENANT.attach.start_timer());
tenant_clone.attach(preload, &ctx).await
};
let attach_duration = attach_start.elapsed();
@@ -3180,7 +3185,7 @@ impl Tenant {
self.compaction_circuit_breaker
.lock()
.unwrap()
.success();
.success(&CIRCUIT_BREAKERS_UNBROKEN);
match has_pending {
true => Ok(CompactionOutcome::Pending),
@@ -3201,13 +3206,13 @@ impl Tenant {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail( err);
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail( err);
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::AlreadyRunning(_) => {}
}
@@ -3387,7 +3392,7 @@ impl Tenant {
"activation attempt finished"
);
TENANT.activation.observe(elapsed.as_secs_f64());
});
}
}
@@ -3512,6 +3517,7 @@ impl Tenant {
// Wait for any in-flight operations to complete
self.gate.close().await;
remove_tenant_metrics(&self.tenant_shard_id);
Ok(())
}
@@ -3844,13 +3850,33 @@ impl Tenant {
}
pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
TopTenantShardItem {
let mut result = TopTenantShardItem {
id: self.tenant_shard_id,
resident_size: 0,
physical_size: 0,
max_logical_size: 0,
max_logical_size_per_shard: 0,
};
for timeline in self.timelines.lock().unwrap().values() {
result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
result.physical_size += timeline
.remote_client
.metrics
.remote_physical_size_gauge
.get();
result.max_logical_size = std::cmp::max(
result.max_logical_size,
timeline.metrics.current_logical_size_gauge.get(),
);
}
result.max_logical_size_per_shard = result
.max_logical_size
.div_ceil(self.tenant_shard_id.shard_count.count() as u64);
result
}
}
@@ -4217,15 +4243,55 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
loop {
// reflect tenant state in metrics:
// - global per tenant state: TENANT_STATE_METRIC
// - "set" of broken tenants: BROKEN_TENANTS_SET
//
// set of broken tenants should not have zero counts so that it remains accessible for
// alerting.
let tid = tenant_shard_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
}
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
let mut counted_broken = if is_broken {
// add the id to the set right away, there should not be any updates on the channel
// after before tenant is removed, if ever
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
true
} else {
false
};
loop {
let labels = &tuple.0;
let current = TENANT_STATE_METRIC.with_label_values(labels);
current.inc();
if rx.changed().await.is_err() {
// tenant has been dropped
current.dec();
drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
break;
}
current.dec();
tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set while avoiding needless counter
// access
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
}
}
});
@@ -4600,6 +4666,10 @@ impl Tenant {
let now = SystemTime::now();
target.leases.retain(|_, lease| !lease.is_expired(&now));
timeline
.metrics
.valid_lsn_lease_count_gauge
.set(target.leases.len() as u64);
// Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
@@ -4609,6 +4679,22 @@ impl Tenant {
}
}
// Update metrics that depend on GC state
timeline
.metrics
.archival_size
.set(if target.within_ancestor_pitr {
timeline.metrics.current_logical_size_gauge.get()
} else {
0
});
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
// - this timeline was created while we were finding cutoffs
@@ -5358,6 +5444,10 @@ impl Tenant {
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_shard_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
.unwrap()
.set(size);
}
pub fn cached_synthetic_size(&self) -> u64 {
@@ -5434,7 +5524,16 @@ impl Tenant {
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
pub(crate) fn local_storage_wanted(&self) -> u64 {
1000
let timelines = self.timelines.lock().unwrap();
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
}
/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
@@ -5518,11 +5617,16 @@ async fn run_initdb(
);
let _permit = {
let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
INIT_DB_SEMAPHORE.acquire().await
};
CONCURRENT_INITDBS.inc();
scopeguard::defer! {
CONCURRENT_INITDBS.dec();
}
let _timer = INITDB_RUN_TIME.start_timer();
let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser: &conf.superuser,
locale: &conf.locale,

View File

@@ -37,63 +37,6 @@ pub struct CompressionInfo {
pub compressed_size: Option<usize>,
}
/// A blob header, with header+data length and compression info.
///
/// TODO: use this more widely, and add an encode() method too.
/// TODO: document the header format.
#[derive(Clone, Copy, Default)]
pub struct Header {
pub header_len: usize,
pub data_len: usize,
pub compression_bits: u8,
}
impl Header {
/// Decodes a header from a byte slice.
pub fn decode(bytes: &[u8]) -> Result<Self, std::io::Error> {
let Some(&first_header_byte) = bytes.first() else {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"zero-length blob header",
));
};
// If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes.
if first_header_byte < 0x80 {
return Ok(Self {
header_len: 1, // by definition
data_len: first_header_byte as usize,
compression_bits: BYTE_UNCOMPRESSED,
});
}
// Otherwise, this is a 4-byte header containing compression information and length.
const HEADER_LEN: usize = 4;
let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN].try_into().map_err(|_| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("blob header too short: {bytes:?}"),
)
})?;
// TODO: verify the compression bits and convert to an enum.
let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK;
header_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
let data_len = u32::from_be_bytes(header_buf) as usize;
Ok(Self {
header_len: HEADER_LEN,
data_len,
compression_bits,
})
}
/// Returns the total header+data length.
pub fn total_len(&self) -> usize {
self.header_len + self.data_len
}
}
impl BlockCursor<'_> {
/// Read a blob into a new buffer.
pub async fn read_blob(

View File

@@ -44,7 +44,7 @@ use crate::controller_upcall_client::{
};
use crate::deletion_queue::DeletionQueueClient;
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -519,7 +519,7 @@ pub async fn init_tenant_mgr(
tenant_configs.len(),
conf.concurrent_tenant_warmup.initial_permits()
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Accumulate futures for writing tenant configs, so that we can execute in parallel
let mut config_write_futs = Vec::new();
@@ -2177,7 +2177,9 @@ impl TenantManager {
// we would use if not doing any eviction.
progress.bytes_total
} else {
42
// In the absence of heatmap info, assume that the secondary location simply
// needs as much space as it is currently using.
secondary.resident_size_metric.get()
}
}
}
@@ -2528,7 +2530,7 @@ impl SlotGuard {
Ok(())
}
None => {
METRICS.unexpected_errors.inc();
error!(
tenant_shard_id = %self.tenant_shard_id,
"Missing InProgress marker during tenant upsert, this is a bug."
@@ -2538,7 +2540,7 @@ impl SlotGuard {
))
}
Some(slot) => {
METRICS.unexpected_errors.inc();
error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug. Contents: {:?}", slot);
Err(TenantSlotUpsertError::InternalError(
"Unexpected contents of TenantSlot".into(),
@@ -2619,7 +2621,7 @@ impl Drop for SlotGuard {
match m.entry(self.tenant_shard_id) {
Entry::Occupied(mut entry) => {
if !matches!(entry.get(), TenantSlot::InProgress(_)) {
METRICS.unexpected_errors.inc();
error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug. Contents: {:?}", entry.get());
}
@@ -2634,7 +2636,7 @@ impl Drop for SlotGuard {
}
}
Entry::Vacant(_) => {
METRICS.unexpected_errors.inc();
error!(
tenant_shard_id = %self.tenant_shard_id,
"Missing InProgress marker during SlotGuard drop, this is a bug."
@@ -2694,7 +2696,7 @@ fn tenant_map_acquire_slot_impl(
mode: TenantSlotAcquireMode,
) -> Result<SlotGuard, TenantSlotError> {
use TenantSlotAcquireMode::*;
METRICS.tenant_slot_writes.inc();
let mut locked = tenants.write().unwrap();
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());

View File

@@ -223,8 +223,9 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::metrics::{
MeasureRemoteOp,
RemoteOpFileKind, RemoteOpKind,
MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize,
};
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
use crate::tenant::metadata::TimelineMetadata;
@@ -356,6 +357,8 @@ pub(crate) struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
@@ -402,6 +405,10 @@ impl RemoteTimelineClient {
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&tenant_shard_id,
&timeline_id,
)),
config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
cancel: CancellationToken::new(),
}
@@ -590,13 +597,21 @@ impl RemoteTimelineClient {
.map_err(|_| UploadQueueNotReadyError)
}
fn update_remote_physical_size_gauge(&self, _current_remote_index_part: Option<&IndexPart>) {
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
.layer_metadata
.values()
.map(|ilmd| ilmd.file_size)
.sum()
} else {
0
};
self.metrics.remote_physical_size_gauge.set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
0
self.metrics.remote_physical_size_gauge.get()
}
//
@@ -611,6 +626,13 @@ impl RemoteTimelineClient {
&self,
cancel: &CancellationToken,
) -> Result<MaybeDeletedIndexPart, DownloadError> {
let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Index,
&RemoteOpKind::Download,
crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
reason: "no need for a downloads gauge",
},
);
let (index_part, index_generation, index_last_modified) = download::download_index_part(
&self.storage_impl,
@@ -623,7 +645,7 @@ impl RemoteTimelineClient {
Option::<TaskKind>::None,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?;
@@ -698,7 +720,13 @@ impl RemoteTimelineClient {
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Layer,
&RemoteOpKind::Download,
crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
reason: "no need for a downloads gauge",
},
);
download::download_layer_file(
self.conf,
&self.storage_impl,
@@ -715,11 +743,13 @@ impl RemoteTimelineClient {
Some(ctx.task_kind()),
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?
};
REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
Ok(downloaded_size)
}
@@ -997,6 +1027,7 @@ impl RemoteTimelineClient {
let op = UploadOp::UploadMetadata {
uploaded: Box::new(index_part.clone()),
};
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -1234,6 +1265,7 @@ impl RemoteTimelineClient {
);
let op = UploadOp::UploadLayer(layer, metadata, None);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -1410,6 +1442,7 @@ impl RemoteTimelineClient {
let op = UploadOp::Delete(Delete {
layers: with_metadata,
});
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -2147,7 +2180,7 @@ impl RemoteTimelineClient {
Some(TaskKind::RemoteUploadTask),
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
}
@@ -2164,7 +2197,7 @@ impl RemoteTimelineClient {
Some(TaskKind::RemoteUploadTask),
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await;
if res.is_ok() {
@@ -2310,7 +2343,10 @@ impl RemoteTimelineClient {
upload_queue.clean.1 = Some(task.task_id);
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
self.metrics
.projected_remote_consistent_lsn_gauge
.set(lsn.0);
if self.generation.is_none() {
// Legacy mode: skip validating generation
upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -2351,6 +2387,64 @@ impl RemoteTimelineClient {
.await;
}
self.metric_end(&task.op);
for coalesced_op in &task.coalesced_ops {
self.metric_end(coalesced_op);
}
}
fn metric_impl(
&self,
op: &UploadOp,
) -> Option<(
RemoteOpFileKind,
RemoteOpKind,
RemoteTimelineClientMetricsCallTrackSize,
)> {
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
let res = match op {
UploadOp::UploadLayer(_, m, _) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
),
UploadOp::UploadMetadata { .. } => (
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
DontTrackSize {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(_delete) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
},
),
UploadOp::Barrier(..) | UploadOp::Shutdown => {
// we do not account these
return None;
}
};
Some(res)
}
fn metric_begin(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
Some(x) => x,
None => return,
};
let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
guard.will_decrement_manually(); // in metric_end(), see right below
}
fn metric_end(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
Some(x) => x,
None => return,
};
self.metrics.call_end(&file_kind, &op_kind, track_bytes);
}
/// Close the upload queue for new operations and cancel queued operations.
@@ -2430,6 +2524,7 @@ impl RemoteTimelineClient {
// Tear down queued ops
for op in qi.queued_operations.into_iter() {
self.metric_end(&op);
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
// which is exactly what we want to happen.
drop(op);
@@ -2739,6 +2834,10 @@ mod tests {
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_shard_id,
&TIMELINE_ID,
)),
config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
cancel: CancellationToken::new(),
})
@@ -2965,7 +3064,99 @@ mod tests {
);
}
#[tokio::test]
async fn bytes_unfinished_gauge_for_layer_file_uploads() {
// Setup
let TestSetup {
harness,
tenant: _tenant,
timeline,
..
} = TestSetup::new("metrics").await.unwrap();
let client = &timeline.remote_client;
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&layer_file_name_1,
&harness.generation,
);
let content_1 = dummy_contents("foo");
std::fs::write(&local_path, &content_1).unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
local_path,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
);
#[derive(Debug, PartialEq, Clone, Copy)]
struct BytesStartedFinished {
started: Option<usize>,
finished: Option<usize>,
}
impl std::ops::Add for BytesStartedFinished {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self {
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
}
}
}
let get_bytes_started_stopped = || {
let started = client
.metrics
.get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
.map(|v| v.try_into().unwrap());
let stopped = client
.metrics
.get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
.map(|v| v.try_into().unwrap());
BytesStartedFinished {
started,
finished: stopped,
}
};
// Test
tracing::info!("now doing actual test");
let actual_a = get_bytes_started_stopped();
client
.schedule_layer_file_upload(layer_file_1.clone())
.unwrap();
let actual_b = get_bytes_started_stopped();
client.wait_completion().await.unwrap();
let actual_c = get_bytes_started_stopped();
// Validate
let expected_b = actual_a
+ BytesStartedFinished {
started: Some(content_1.len()),
// assert that the _finished metric is created eagerly so that subtractions work on first sample
finished: Some(0),
};
assert_eq!(actual_b, expected_b);
let expected_c = actual_a
+ BytesStartedFinished {
started: Some(content_1.len()),
finished: Some(content_1.len()),
};
assert_eq!(actual_c, expected_c);
}
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_index_part = IndexPart::example();

View File

@@ -6,6 +6,7 @@ mod scheduler;
use std::sync::Arc;
use std::time::SystemTime;
use metrics::UIntGauge;
use pageserver_api::models;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use remote_storage::GenericRemoteStorage;
@@ -25,6 +26,7 @@ use super::span::debug_assert_current_span_has_tenant_id;
use super::storage_layer::LayerName;
use crate::context::RequestContext;
use crate::disk_usage_eviction_task::DiskUsageEvictionInfo;
use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE};
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
enum DownloadCommand {
@@ -107,7 +109,12 @@ pub(crate) struct SecondaryTenant {
// Public state indicating overall progress of downloads relative to the last heatmap seen
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
// Sum of layer sizes on local disk
pub(super) resident_size_metric: UIntGauge,
// Sum of layer sizes in the most recently downloaded heatmap
pub(super) heatmap_total_size_metric: UIntGauge,
}
impl SecondaryTenant {
@@ -117,8 +124,16 @@ impl SecondaryTenant {
tenant_conf: pageserver_api::models::TenantConfig,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id])
.unwrap();
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id])
.unwrap();
Arc::new(Self {
tenant_shard_id,
// todo: shall we make this a descendent of the
@@ -135,10 +150,14 @@ impl SecondaryTenant {
progress: std::sync::Mutex::default(),
resident_size_metric,
heatmap_total_size_metric,
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
@@ -150,10 +169,15 @@ impl SecondaryTenant {
// Metrics are subtracted from and/or removed eagerly.
// Deletions are done in the background via [`BackgroundPurges::spawn`].
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
self.detail
.lock()
.unwrap()
.drain_timelines(&self.tenant_shard_id);
.drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
}
pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -231,7 +255,7 @@ impl SecondaryTenant {
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(removed) =
detail.evict_layer(name, &timeline_id, now)
detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
{
// We might race with removal of the same layer during downloads, so finding the layer we
// were trying to remove is optional. Only issue the disk I/O to remove it if we found it.
@@ -245,9 +269,10 @@ impl SecondaryTenant {
/// Exhaustive check that incrementally updated metrics match the actual state.
#[cfg(feature = "testing")]
fn validate_metrics(&self) {
let detail = self.detail.lock().unwrap();
let resident_size = detail.total_resident_size();
assert_eq!(resident_size, self.resident_size_metric.get());
}
#[cfg(not(feature = "testing"))]

View File

@@ -4,9 +4,11 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use metrics::UIntGauge;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
@@ -31,6 +33,7 @@ use crate::context::RequestContext;
use crate::disk_usage_eviction_task::{
DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32,
};
use crate::metrics::SECONDARY_MODE;
use crate::tenant::config::SecondaryLocationConfig;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::ephemeral_file::is_ephemeral_file;
@@ -117,6 +120,9 @@ impl OnDiskState {
.fatal_err("Deleting secondary layer")
}
pub(crate) fn file_size(&self) -> u64 {
self.metadata.file_size
}
}
pub(super) struct SecondaryDetailTimeline {
@@ -169,9 +175,13 @@ impl SecondaryDetailTimeline {
pub(super) fn remove_layer(
&mut self,
name: &LayerName,
resident_metric: &UIntGauge,
) -> Option<OnDiskState> {
self.on_disk_layers.remove(name)
let removed = self.on_disk_layers.remove(name);
if let Some(removed) = &removed {
resident_metric.sub(removed.file_size());
}
removed
}
/// `local_path`
@@ -181,6 +191,7 @@ impl SecondaryDetailTimeline {
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
touched: &HeatMapLayer,
resident_metric: &UIntGauge,
local_path: F,
) where
F: FnOnce() -> Utf8PathBuf,
@@ -200,6 +211,7 @@ impl SecondaryDetailTimeline {
touched.access_time,
local_path(),
));
resident_metric.add(touched.metadata.file_size);
}
}
}
@@ -255,16 +267,28 @@ impl SecondaryDetail {
}
}
#[cfg(feature = "testing")]
pub(crate) fn total_resident_size(&self) -> u64 {
self.timelines
.values()
.map(|tl| {
tl.on_disk_layers
.values()
.map(|v| v.metadata.file_size)
.sum::<u64>()
})
.sum::<u64>()
}
pub(super) fn evict_layer(
&mut self,
name: LayerName,
timeline_id: &TimelineId,
now: SystemTime,
resident_metric: &UIntGauge,
) -> Option<OnDiskState> {
let timeline = self.timelines.get_mut(timeline_id)?;
let removed = timeline.remove_layer(&name);
let removed = timeline.remove_layer(&name, resident_metric);
if removed.is_some() {
timeline.evicted_at.insert(name, now);
}
@@ -273,21 +297,52 @@ impl SecondaryDetail {
pub(super) fn remove_timeline(
&mut self,
_tenant_shard_id: &TenantShardId,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
resident_metric: &UIntGauge,
) {
self.timelines.remove(timeline_id);
let removed = self.timelines.remove(timeline_id);
if let Some(removed) = removed {
Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
}
}
pub(super) fn drain_timelines(
&mut self,
_tenant_shard_id: &TenantShardId,
tenant_shard_id: &TenantShardId,
resident_metric: &UIntGauge,
) {
for (timeline_id, removed) in self.timelines.drain() {
Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
}
}
fn clear_timeline_metrics(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
detail: SecondaryDetailTimeline,
resident_metric: &UIntGauge,
) {
resident_metric.sub(
detail
.on_disk_layers
.values()
.map(|l| l.metadata.file_size)
.sum(),
);
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let tenant_id = tenant_shard_id.tenant_id.to_string();
let timeline_id = timeline_id.to_string();
for op in StorageIoSizeOperation::VARIANTS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[
op,
tenant_id.as_str(),
shard_id.as_str(),
timeline_id.as_str(),
]);
}
}
/// Additionally returns the total number of layers, used for more stable relative access time
/// based eviction.
@@ -742,6 +797,7 @@ impl<'a> TenantDownloader<'a> {
tenant_shard_id,
last_heatmap,
timeline,
&self.secondary_state.resident_size_metric,
ctx,
)
.await;
@@ -864,7 +920,11 @@ impl<'a> TenantDownloader<'a> {
bytes_downloaded: 0,
};
// Also expose heatmap bytes_total as a metric
self.secondary_state
.heatmap_total_size_metric
.set(heatmap_stats.bytes);
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
let mut delete_layers = Vec::new();
let mut delete_timelines = Vec::new();
@@ -931,6 +991,7 @@ impl<'a> TenantDownloader<'a> {
detail.remove_timeline(
self.secondary_state.get_tenant_shard_id(),
delete_timeline,
&self.secondary_state.resident_size_metric,
);
}
}
@@ -949,7 +1010,7 @@ impl<'a> TenantDownloader<'a> {
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
continue;
};
timeline_state.remove_layer(&layer_name);
timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
}
for timeline_id in delete_timelines {
@@ -1016,7 +1077,7 @@ impl<'a> TenantDownloader<'a> {
.await
.ok_or_else(|| UpdateError::Cancelled)
.and_then(|x| x)
.inspect(|_|{} )
.inspect(|_| SECONDARY_MODE.download_heatmap.inc())
}
/// Download heatmap layers that are not present on local disk, or update their
@@ -1191,6 +1252,7 @@ impl<'a> TenantDownloader<'a> {
tenant_shard_id,
&timeline_id,
&t,
&self.secondary_state.resident_size_metric,
|| {
local_layer_path(
self.conf,
@@ -1302,6 +1364,7 @@ impl<'a> TenantDownloader<'a> {
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
Ok(Some(layer))
}
@@ -1313,6 +1376,7 @@ async fn init_timeline_state(
tenant_shard_id: &TenantShardId,
last_heatmap: Option<&HeatMapTimeline>,
heatmap: &HeatMapTimeline,
resident_metric: &UIntGauge,
ctx: &RequestContext,
) -> SecondaryDetailTimeline {
let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
@@ -1416,6 +1480,7 @@ async fn init_timeline_state(
tenant_shard_id,
&heatmap.timeline_id,
remote_meta,
resident_metric,
|| file_path,
);
}

View File

@@ -20,6 +20,7 @@ use super::scheduler::{
};
use super::{CommandRequest, SecondaryTenantError, UploadCommand};
use crate::TEMP_FILE_SUFFIX;
use crate::metrics::SECONDARY_MODE;
use crate::tenant::Tenant;
use crate::tenant::config::AttachmentMode;
use crate::tenant::mgr::{GetTenantError, TenantManager};
@@ -220,10 +221,14 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
let started_at = Instant::now();
let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap.inc();
Some(uploaded)
}
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
@@ -232,8 +237,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
"Failed to upload heatmap for tenant {}: {e:#}",
tenant.get_tenant_shard_id(),
);
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap_errors.inc();
last_upload
}
Err(UploadHeatmapError::Cancelled) => {

View File

@@ -882,6 +882,14 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
.inc_by(self.uncompressed_bytes_eligible);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();
// Write out the index

View File

@@ -32,6 +32,7 @@ use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
use crate::tenant::timeline::GetVectoredError;
@@ -306,7 +307,11 @@ impl GlobalResourceUnits {
}
};
// This is a sloppy update: concurrent updates to the counter will race, and the exact
// value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
// That's okay: as long as the metric contains some recent value, it doesn't have to always
// be literally the last update.
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
self.dirty_bytes = size;

View File

@@ -231,7 +231,9 @@ impl Layer {
debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
timeline
.metrics
.resident_physical_size_add(metadata.file_size);
ResidentLayer { downloaded, owner }
}
@@ -524,6 +526,12 @@ impl Layer {
}
}
// Update the timeline's visible bytes count
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
}
@@ -532,10 +540,23 @@ impl Layer {
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
debug_assert!(
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
);
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
}
}
(Covered, Visible) => {
// Add this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
(Covered, Covered) | (Visible, Visible) => {
// no change
@@ -588,6 +609,7 @@ impl ResidentOrWantedEvicted {
ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
Some(strong) => {
LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
*self = ResidentOrWantedEvicted::Resident(strong.clone());
@@ -719,8 +741,17 @@ enum Status {
impl Drop for LayerInner {
fn drop(&mut self) {
let timeline: Option<Arc<Timeline>> = self.timeline.upgrade();
// if there was a pending eviction, mark it cancelled here to balance metrics
if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
{
// eviction has already been started
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
// eviction request is intentionally not honored as no one is present to wait for it
// and we could be delaying shutdown for nothing.
}
let timeline = self.timeline.upgrade();
if let Some(timeline) = timeline.as_ref() {
// Only need to decrement metrics if the timeline still exists: otherwise
@@ -728,6 +759,13 @@ impl Drop for LayerInner {
timeline.metrics.dec_layer(&self.desc);
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
debug_assert!(
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
);
timeline
.metrics
.visible_physical_size_gauge
.sub(self.desc.file_size);
}
}
@@ -739,6 +777,7 @@ impl Drop for LayerInner {
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().layer_name();
let file_size = self.layer_desc().file_size;
let meta = self.metadata();
let status = self.status.take();
@@ -747,13 +786,20 @@ impl Drop for LayerInner {
// carry this until we are finished for [`Layer::wait_drop`] support
let _status = status;
let Some(timeline) = timeline else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
return;
};
match std::fs::remove_file(path) {
let Ok(_guard) = timeline.gate.enter() else {
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
return;
};
let removed = match std::fs::remove_file(path) {
Ok(()) => true,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// until we no longer do detaches by removing all local files before removing the
@@ -764,16 +810,34 @@ impl Drop for LayerInner {
// layers.
false
}
Err(_e) => {
Err(e) => {
tracing::error!("failed to remove wanted deleted layer: {e}");
LAYER_IMPL_METRICS.inc_delete_removes_failed();
false
}
};
let _a=timeline
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
let res = timeline
.remote_client
.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
}
});
}
}
@@ -804,6 +868,12 @@ impl LayerInner {
// This object acts as a RAII guard on these metrics: increment on construction
timeline.metrics.inc_layer(&desc);
// New layers are visible by default. This metric is later updated on drop or in set_visibility
timeline
.metrics
.visible_physical_size_gauge
.add(desc.file_size);
LayerInner {
conf,
path: local_path,
@@ -824,9 +894,13 @@ impl LayerInner {
}
fn delete_on_drop(&self) {
let _a=self.wanted_deleted
let res =
self.wanted_deleted
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
if res.is_ok() {
LAYER_IMPL_METRICS.inc_started_deletes();
}
}
/// Cancellation safe, however dropping the future and calling this method again might result
@@ -864,6 +938,12 @@ impl LayerInner {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
// idea here is that only one evicter should ever get to witness a strong reference,
// which means whenever get_or_maybe_download upgrades a weak, it must mark up a
// cancelled eviction and signal us, like it currently does.
//
// a second concurrent evict_and_wait will not see a strong reference.
LAYER_IMPL_METRICS.inc_started_evictions();
}
let changed = rx.changed();
@@ -903,13 +983,15 @@ impl LayerInner {
// get_or_init_detached can:
// - be fast (mutex lock) OR uncontested semaphore permit acquire
// - be slow (wait for semaphore permit or closing)
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
let locked = self
.inner
.get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
.await
.map(|mut guard| guard.get_and_upgrade().ok_or(guard));
scopeguard::ScopeGuard::into_inner(init_cancelled);
match locked {
// this path could had been a RwLock::read
@@ -922,7 +1004,8 @@ impl LayerInner {
// note that we also have dropped the Guard; this is fine, because we just made
// a state change and are holding a strong reference to be returned.
self.status.as_ref().unwrap().send_replace(Status::Resident);
LAYER_IMPL_METRICS
.inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
return Ok(strong);
}
@@ -949,7 +1032,8 @@ impl LayerInner {
.upgrade()
.ok_or(DownloadError::TimelineShutdown)?;
// count cancellations, which currently remain largely unexpected
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// check if we really need to be downloaded: this can happen if a read access won the
// semaphore before eviction.
@@ -961,6 +1045,7 @@ impl LayerInner {
.await
.map_err(DownloadError::PreStatFailed);
scopeguard::ScopeGuard::into_inner(init_cancelled);
let needs_download = needs_download?;
@@ -971,7 +1056,7 @@ impl LayerInner {
self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
.await?;
LAYER_IMPL_METRICS.inc_init_needed_no_download();
return Ok(self.initialize_after_layer_is_on_disk(permit));
};
@@ -1012,13 +1097,13 @@ impl LayerInner {
async move {
tracing::info!(%reason, "downloading on-demand");
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
let res = self
.download_init_and_wait(timeline, permit, ctx.attached_child())
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
.await?;
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok(res)
}
.instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
@@ -1036,6 +1121,7 @@ impl LayerInner {
"unexpectedly on-demand downloading for task kind {:?}",
ctx.task_kind()
);
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
let really_error =
matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
@@ -1087,7 +1173,20 @@ impl LayerInner {
let res = this.download_and_init(timeline, permit, &ctx).await;
let _a =tx.send(res);
if let Err(res) = tx.send(res) {
match res {
Ok(_res) => {
tracing::debug!("layer initialized, but caller has been cancelled");
LAYER_IMPL_METRICS.inc_init_completed_without_requester();
}
Err(e) => {
tracing::info!(
"layer file download failed, and caller has been cancelled: {e:?}"
);
LAYER_IMPL_METRICS.inc_download_failed_without_requester();
}
}
}
}
.in_current_span(),
);
@@ -1139,9 +1238,21 @@ impl LayerInner {
}
};
tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
timeline
.metrics
.resident_physical_size_add(self.desc.file_size);
self.consecutive_failures.store(0, Ordering::Relaxed);
let since_last_eviction = self
.last_evicted_at
.lock()
.unwrap()
.take()
.map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
self.access_stats.record_residence_event();
Ok(self.initialize_after_layer_is_on_disk(permit))
@@ -1296,8 +1407,12 @@ impl LayerInner {
tracing::debug!("eviction started");
let _a = self.wait_for_turn_and_evict(only_version).await;
let res = self.wait_for_turn_and_evict(only_version).await;
// metrics: ignore the Ok branch, it is not done yet
if let Err(e) = res {
tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
}
};
Self::spawn(start_evicting.instrument(span));
@@ -1417,13 +1532,21 @@ impl LayerInner {
Self::spawn_blocking(move || {
let _span = span.entered();
let res = self.evict_blocking( &gate, &permit);
let res = self.evict_blocking(&timeline, &gate, &permit);
let waiters = self.inner.initializer_count();
if waiters > 0 {
LAYER_IMPL_METRICS.inc_evicted_with_waiters();
}
let completed_in = spawned_at.elapsed();
LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
match res {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
}
tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
});
@@ -1434,6 +1557,7 @@ impl LayerInner {
/// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
fn evict_blocking(
&self,
timeline: &Timeline,
_gate: &gate::GateGuard,
_permit: &heavier_once_cell::InitPermit,
) -> Result<(), EvictionCancelled> {
@@ -1446,7 +1570,17 @@ impl LayerInner {
Ok(elapsed) => {
let accessed_and_visible = self.access_stats.accessed()
&& self.access_stats.visibility() == LayerVisibilityHint::Visible;
if accessed_and_visible {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
tracing::info!(
residence_millis = elapsed.as_millis(),
@@ -1458,6 +1592,10 @@ impl LayerInner {
tracing::info!("evicted layer after unknown residence period");
}
}
timeline.metrics.evictions.inc();
timeline
.metrics
.resident_physical_size_sub(self.desc.file_size);
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
tracing::error!(
@@ -1674,7 +1812,13 @@ impl DownloadedLayer {
match res {
Ok(layer) => Ok(layer),
Err(err) => {
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
// We log this message once over the lifetime of `Self`
// => Ok and good to log backtrace and path here.
tracing::error!(
"layer load failed, assuming permanent failure: {}: {err:?}",
owner.path
);
Err(err)
}
}
@@ -1882,6 +2026,218 @@ impl From<ResidentLayer> for Layer {
}
}
use metrics::IntCounter;
pub(crate) struct LayerImplMetrics {
started_evictions: IntCounter,
completed_evictions: IntCounter,
cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
started_deletes: IntCounter,
completed_deletes: IntCounter,
failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
redownload_after: metrics::Histogram,
time_to_evict: metrics::Histogram,
}
impl Default for LayerImplMetrics {
fn default() -> Self {
use enum_map::Enum;
// reminder: these will be pageserver_layer_* with "_total" suffix
let started_evictions = metrics::register_int_counter!(
"pageserver_layer_started_evictions",
"Evictions started in the Layer implementation"
)
.unwrap();
let completed_evictions = metrics::register_int_counter!(
"pageserver_layer_completed_evictions",
"Evictions completed in the Layer implementation"
)
.unwrap();
let cancelled_evictions = metrics::register_int_counter_vec!(
"pageserver_layer_cancelled_evictions_count",
"Different reasons for evictions to have been cancelled or failed",
&["reason"]
)
.unwrap();
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = EvictionCancelled::from_usize(i);
let s = reason.as_str();
cancelled_evictions.with_label_values(&[s])
}));
let started_deletes = metrics::register_int_counter!(
"pageserver_layer_started_deletes",
"Deletions on drop pending in the Layer implementation"
)
.unwrap();
let completed_deletes = metrics::register_int_counter!(
"pageserver_layer_completed_deletes",
"Deletions on drop completed in the Layer implementation"
)
.unwrap();
let failed_deletes = metrics::register_int_counter_vec!(
"pageserver_layer_failed_deletes_count",
"Different reasons for deletions on drop to have failed",
&["reason"]
)
.unwrap();
let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = DeleteFailed::from_usize(i);
let s = reason.as_str();
failed_deletes.with_label_values(&[s])
}));
let rare_counters = metrics::register_int_counter_vec!(
"pageserver_layer_assumed_rare_count",
"Times unexpected or assumed rare event happened",
&["event"]
)
.unwrap();
let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let event = RareEvent::from_usize(i);
let s = event.as_str();
rare_counters.with_label_values(&[s])
}));
let inits_cancelled = metrics::register_int_counter!(
"pageserver_layer_inits_cancelled_count",
"Times Layer initialization was cancelled",
)
.unwrap();
let redownload_after = {
let minute = 60.0;
let hour = 60.0 * minute;
metrics::register_histogram!(
"pageserver_layer_redownloaded_after",
"Time between evicting and re-downloading.",
vec![
10.0,
30.0,
minute,
5.0 * minute,
15.0 * minute,
30.0 * minute,
hour,
12.0 * hour,
]
)
.unwrap()
};
let time_to_evict = metrics::register_histogram!(
"pageserver_layer_eviction_held_permit_seconds",
"Time eviction held the permit.",
vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
)
.unwrap();
Self {
started_evictions,
completed_evictions,
cancelled_evictions,
started_deletes,
completed_deletes,
failed_deletes,
rare_counters,
inits_cancelled,
redownload_after,
time_to_evict,
}
}
}
impl LayerImplMetrics {
fn inc_started_evictions(&self) {
self.started_evictions.inc();
}
fn inc_completed_evictions(&self) {
self.completed_evictions.inc();
}
fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
self.cancelled_evictions[reason].inc()
}
fn inc_started_deletes(&self) {
self.started_deletes.inc();
}
fn inc_completed_deletes(&self) {
self.completed_deletes.inc();
}
fn inc_deletes_failed(&self, reason: DeleteFailed) {
self.failed_deletes[reason].inc();
}
/// Counted separatedly from failed layer deletes because we will complete the layer deletion
/// attempt regardless of failure to delete local file.
fn inc_delete_removes_failed(&self) {
self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
}
/// Expected rare just as cancellations are rare, but we could have cancellations separate from
/// the single caller which can start the download, so use this counter to separte them.
fn inc_init_completed_without_requester(&self) {
self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
}
/// Expected rare because cancellations are unexpected, and failures are unexpected
fn inc_download_failed_without_requester(&self) {
self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
}
/// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
///
/// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
/// Option.
fn inc_raced_wanted_evicted_accesses(&self) {
self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
}
/// These are only expected for [`Self::inc_init_cancelled`] amount when
/// running with remote storage.
fn inc_init_needed_no_download(&self) {
self.rare_counters[RareEvent::InitWithoutDownload].inc();
}
/// Expected rare because all layer files should be readable and good
fn inc_permanent_loading_failures(&self) {
self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
}
fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
}
fn record_redownloaded_after(&self, duration: std::time::Duration) {
self.redownload_after.observe(duration.as_secs_f64())
}
/// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
/// instead cancel eviction if we would have read waiters. We cannot however separate reads
/// from other evictions, so this could have noise as well.
fn inc_evicted_with_waiters(&self) {
self.rare_counters[RareEvent::EvictedWithWaiters].inc();
}
/// Recorded at least initially as the permit is now acquired in async context before
/// spawn_blocking action.
fn record_time_to_evict(&self, duration: std::time::Duration) {
self.time_to_evict.observe(duration.as_secs_f64())
}
}
#[derive(Debug, Clone, Copy, enum_map::Enum)]
enum EvictionCancelled {
@@ -1898,6 +2254,21 @@ enum EvictionCancelled {
UnexpectedEvictedState,
}
impl EvictionCancelled {
fn as_str(&self) -> &'static str {
match self {
EvictionCancelled::LayerGone => "layer_gone",
EvictionCancelled::TimelineGone => "timeline_gone",
EvictionCancelled::VersionCheckFailed => "version_check_fail",
EvictionCancelled::FileNotFound => "file_not_found",
EvictionCancelled::RemoveFailed => "remove_failed",
EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
EvictionCancelled::LostToDownload => "lost_to_download",
EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
}
}
}
#[derive(enum_map::Enum)]
enum DeleteFailed {
@@ -1905,6 +2276,15 @@ enum DeleteFailed {
DeleteSchedulingFailed,
}
impl DeleteFailed {
fn as_str(&self) -> &'static str {
match self {
DeleteFailed::TimelineGone => "timeline_gone",
DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
}
}
}
#[derive(enum_map::Enum)]
enum RareEvent {
RemoveOnDropFailed,
@@ -1916,3 +2296,21 @@ enum RareEvent {
EvictedWithWaiters,
}
impl RareEvent {
fn as_str(&self) -> &'static str {
use RareEvent::*;
match self {
RemoveOnDropFailed => "remove_on_drop_failed",
InitCompletedWithoutRequester => "init_completed_without",
DownloadFailedWithoutRequester => "download_failed_without",
UpgradedWantedEvicted => "raced_wanted_evicted",
InitWithoutDownload => "init_needed_no_download",
PermanentLoadingFailure => "permanent_loading_failure",
EvictedWithWaiters => "evicted_with_waiters",
}
}
}
pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
once_cell::sync::Lazy::new(LayerImplMetrics::default);

View File

@@ -238,7 +238,7 @@ async fn smoke_test() {
rtc.get_remote_physical_size(),
dummy_layer.metadata().file_size
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -311,6 +311,11 @@ async fn evict_and_wait_on_wanted_deleted() {
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
/// This test ensures we are able to read the layer while the layer eviction has been
@@ -361,7 +366,7 @@ fn read_wins_pending_eviction() {
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
@@ -393,7 +398,18 @@ fn read_wins_pending_eviction() {
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
drop(completion);
@@ -401,9 +417,26 @@ fn read_wins_pending_eviction() {
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
.await;
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// now we finally can observe the original eviction failing
// it would had been possible to observe it earlier, but here it is guaranteed to have
// happened.
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
@@ -466,7 +499,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion1, barrier) = utils::completion::channel();
let mut completion1 = Some(completion1);
@@ -501,9 +534,20 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// configure another failpoint for the second eviction -- evictions are per initialization,
// so now that we've reinitialized the inner, we get to run two of them at the same time.
@@ -523,10 +567,13 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
arrived_at_barrier.wait().await;
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
let mut release_earlier_eviction = |_expected_reason| {
let mut release_earlier_eviction = |expected_reason| {
assert_eq!(
0,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
drop(completion1.take().unwrap());
@@ -539,7 +586,10 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
)
.await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
}
};
@@ -562,7 +612,19 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
// ensure the cancelled are unchanged
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
@@ -652,7 +714,8 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
.unwrap_err();
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
// failpoint is not counted as cancellation either
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
#[tokio::test(start_paused = true)]
@@ -829,7 +892,8 @@ async fn eviction_cancellation_on_drop() {
.expect_err("should had been a timeout since we are holding the layer resident");
}
// 1 == we only evict one of the layers
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
drop(resident);
@@ -838,7 +902,10 @@ async fn eviction_cancellation_on_drop() {
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
);
}
}

View File

@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
use once_cell::sync::Lazy;
use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
use rand::Rng;
use scopeguard::defer;
use tokio::sync::{Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -19,6 +19,7 @@ use utils::completion::Barrier;
use utils::pausable_failpoint;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
@@ -84,15 +85,17 @@ pub(crate) enum BackgroundLoopKind {
SecondaryDownload,
}
pub struct BackgroundLoopSemaphorePermit {
pub struct BackgroundLoopSemaphorePermit<'a> {
_permit: SemaphorePermit<'static>,
_recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
}
/// Acquires a semaphore permit, to limit concurrent background jobs.
pub(crate) async fn acquire_concurrency_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> BackgroundLoopSemaphorePermit {
) -> BackgroundLoopSemaphorePermit<'static> {
let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
pausable_failpoint!("initial-size-calculation-permit-pause");
@@ -105,9 +108,11 @@ pub(crate) async fn acquire_concurrency_permit(
};
let permit = semaphore.acquire().await.expect("should never close");
recorder.acquired();
BackgroundLoopSemaphorePermit {
_permit: permit,
_recorder: recorder,
}
}
@@ -130,7 +135,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
compaction_loop(tenant, cancel)
// If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -155,6 +161,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
@@ -178,7 +186,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
tenant_housekeeping_loop(tenant, cancel)
.instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
@@ -583,5 +592,8 @@ pub(crate) fn warn_when_period_overrun(
?task,
"task iteration took longer than the configured period"
);
metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task.into(), &format!("{}", period.as_secs())])
.inc();
}
}

View File

@@ -45,8 +45,8 @@ pub struct Stats {
}
pub enum ThrottleResult {
NotThrottled { },
Throttled { },
NotThrottled { end: Instant },
Throttled { end: Instant },
}
impl Throttle {
@@ -114,7 +114,7 @@ impl Throttle {
let inner = self.inner.load_full(); // clones the `Inner` Arc
if !inner.enabled {
return ThrottleResult::NotThrottled { };
return ThrottleResult::NotThrottled { end: start };
}
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
@@ -127,9 +127,9 @@ impl Throttle {
let wait_time = end - start;
self.sum_throttled_usecs
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
ThrottleResult::Throttled { }
ThrottleResult::Throttled { end }
} else {
ThrottleResult::NotThrottled { }
ThrottleResult::NotThrottled { end: start }
}
}
}

View File

@@ -96,6 +96,7 @@ use super::{
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
debug_assert_current_span_has_tenant_and_timeline_id,
};
use crate::aux_file::AuxFileSizeEstimator;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -103,7 +104,10 @@ use crate::context::{
use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::l0_flush::{self, L0FlushGlobalState};
use crate::metrics::TimelineMetrics;
use crate::metrics::{
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
};
use crate::page_service::TenantManagerTypes;
use crate::pgdatadir_mapping::{
CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp,
@@ -411,6 +415,8 @@ pub struct Timeline {
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
/// Size estimator for aux file v2
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
/// Some test cases directly place keys into the timeline without actually modifying the directory
/// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
@@ -1204,20 +1210,20 @@ impl Timeline {
ctx.task_kind(),
);
// let start = crate::metrics::GET_VECTORED_LATENCY
// .for_task_kind(ctx.task_kind())
// .map(|metric| (metric, Instant::now()));
let start = crate::metrics::GET_VECTORED_LATENCY
.for_task_kind(ctx.task_kind())
.map(|metric| (metric, Instant::now()));
self
let res = self
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
.await
.await;
// if let Some((metric, start)) = start {
// let elapsed = start.elapsed();
// metric.observe(elapsed.as_secs_f64());
// }
if let Some((metric, start)) = start {
let elapsed = start.elapsed();
metric.observe(elapsed.as_secs_f64());
}
res
}
/// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored
@@ -1256,21 +1262,21 @@ impl Timeline {
}
}
// let start = crate::metrics::SCAN_LATENCY
// .for_task_kind(ctx.task_kind())
// .map(ScanLatencyOngoingRecording::start_recording);
let start = crate::metrics::SCAN_LATENCY
.for_task_kind(ctx.task_kind())
.map(ScanLatencyOngoingRecording::start_recording);
let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
self
let vectored_res = self
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
.await
.await;
// if let Some(recording) = start {
// recording.observe();
// }
if let Some(recording) = start {
recording.observe();
}
vectored_res
}
pub(super) async fn get_vectored_impl(
@@ -1379,7 +1385,7 @@ impl Timeline {
return (key, Err(err));
}
};
DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
// The walredo module expects the records to be descending in terms of Lsn.
// And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1417,21 +1423,42 @@ impl Timeline {
// when they're missing. Instead they are omitted from the resulting btree
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() && layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
let total_keyspace = query.total_keyspace();
let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
if !results.is_empty() {
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
let total_keyspace = query.total_keyspace();
let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
static LOG_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
LOG_PACER.lock().unwrap().call(|| {
let num_keys = total_keyspace.total_raw_size();
let num_pages = results.len();
tracing::info!(
shard_id = %self.tenant_shard_id.shard_slug(),
lsn = %max_request_lsn,
"Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
);
});
static LOG_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
LOG_PACER.lock().unwrap().call(|| {
let num_keys = total_keyspace.total_raw_size();
let num_pages = results.len();
tracing::info!(
shard_id = %self.tenant_shard_id.shard_slug(),
lsn = %max_request_lsn,
"Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
);
});
}
// Records the number of layers visited in a few different ways:
//
// * LAYERS_PER_READ: all layers count towards every read in the batch, because each
// layer directly affects its observed latency.
//
// * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
// layer visits and access cost.
//
// * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
// read amplification after batching.
let layers_visited = layers_visited as f64;
let avg_layers_visited = layers_visited / results.len() as f64;
LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
for _ in &results {
self.metrics.layers_per_read.observe(layers_visited);
LAYERS_PER_READ_GLOBAL.observe(layers_visited);
LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
}
}
Ok(results)
@@ -1483,6 +1510,9 @@ impl Timeline {
guard.layer_size_sum()
}
pub(crate) fn resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_get()
}
pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
@@ -1545,6 +1575,8 @@ impl Timeline {
WaitLsnTimeout::Default => self.conf.wait_lsn_timeout,
};
let timer = crate::metrics::WAIT_LSN_TIME.start_timer();
let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard();
let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout);
let wait_for_timeout = std::pin::pin!(wait_for_timeout);
@@ -1561,8 +1593,11 @@ impl Timeline {
ready,
is_slow,
elapsed_total,
elapsed_since_last_callback: _,
elapsed_since_last_callback,
}| {
self.metrics
.wait_lsn_in_progress_micros
.inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap());
if !is_slow {
return;
}
@@ -1592,6 +1627,8 @@ impl Timeline {
let res = wait_for_timeout.await;
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(logging_permit);
drop(start_finish_counterpair_guard);
drop(timer);
match res {
Ok(()) => Ok(()),
Err(e) => {
@@ -2453,12 +2490,11 @@ impl Timeline {
tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
pub(crate) fn get_lazy_slru_download(&self, lazy_slru_download_enabled_by_cp: bool) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
tenant_conf.tenant_conf.lazy_slru_download.unwrap_or(
lazy_slru_download_enabled_by_cp || self.conf.default_tenant_conf.lazy_slru_download,
)
}
/// Checks if a get page request should get perf tracing
@@ -2665,14 +2701,6 @@ impl Timeline {
.clone()
}
pub fn get_compaction_shard_ancestor(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_shard_ancestor
.unwrap_or(self.conf.default_tenant_conf.compaction_shard_ancestor)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2681,6 +2709,15 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
}
fn get_evictions_low_residence_duration_metric_threshold(
tenant_conf: &pageserver_api::models::TenantConfig,
default_tenant_conf: &pageserver_api::config::TenantConfigToml,
) -> Duration {
tenant_conf
.evictions_low_residence_duration_metric_threshold
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2756,8 +2793,28 @@ impl Timeline {
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
&new_conf.tenant_conf,
&self.conf.default_tenant_conf,
);
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
let timeline_id_str = self.timeline_id.to_string();
self.remote_client.update_config(&new_conf.location);
self.metrics
.evictions_with_low_residence_duration
.write()
.unwrap()
.change_threshold(
&tenant_id_str,
&shard_id_str,
&timeline_id_str,
new_threshold,
);
}
}
@@ -2791,6 +2848,13 @@ impl Timeline {
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let evictions_low_residence_duration_metric_threshold = {
let loaded_tenant_conf = tenant_conf.load();
Self::get_evictions_low_residence_duration_metric_threshold(
&loaded_tenant_conf.tenant_conf,
&conf.default_tenant_conf,
)
};
if let Some(ancestor) = &ancestor {
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
@@ -2803,7 +2867,12 @@ impl Timeline {
let metrics = Arc::new(TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
));
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
let mut result = Timeline {
conf,
@@ -2911,6 +2980,8 @@ impl Timeline {
pagestream_throttle: resources.pagestream_throttle,
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
#[cfg(test)]
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
@@ -2936,6 +3007,10 @@ impl Timeline {
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_lsn_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
}
@@ -3091,6 +3166,8 @@ impl Timeline {
let mut guard = self.layers.write().await;
let timer = self.metrics.load_layer_map_histo.start_timer();
// Scan timeline directory and create ImageLayerName and DeltaFilename
// structs representing all files on disk
let timeline_path = self
@@ -3251,6 +3328,7 @@ impl Timeline {
num_layers, disk_consistent_lsn, total_physical_size
);
timer.stop_and_record();
Ok(())
}
@@ -3318,7 +3396,7 @@ impl Timeline {
if let CurrentLogicalSize::Approximate(_) = &current_size {
if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
let _= self
let first = self
.current_logical_size
.did_return_approximate_to_walreceiver
.compare_exchange(
@@ -3326,8 +3404,11 @@ impl Timeline {
true,
AtomicOrdering::Relaxed,
AtomicOrdering::Relaxed,
).is_ok();
)
.is_ok();
if first {
crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
}
}
}
@@ -3399,7 +3480,7 @@ impl Timeline {
self.current_logical_size.initialized.add_permits(1);
}
let try_once = |_attempt: usize| {
let try_once = |attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
let skip_concurrency_limiter = &skip_concurrency_limiter;
@@ -3410,7 +3491,7 @@ impl Timeline {
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, _circumstances) = tokio::select! {
let (_maybe_permit, circumstances) = tokio::select! {
permit = wait_for_permit => {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
@@ -3427,6 +3508,12 @@ impl Timeline {
}
};
let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};
let io_concurrency = IoConcurrency::spawn_from_conf(
self_ref.conf,
self_ref
@@ -3453,7 +3540,7 @@ impl Timeline {
// TODO: add aux file size to logical size
Ok(calculated_size)
Ok((calculated_size, metrics_guard))
}
};
@@ -3490,14 +3577,27 @@ impl Timeline {
}
};
let calculated_size = match retrying.await {
let (calculated_size, metrics_guard) = match retrying.await {
ControlFlow::Continue(calculated_size) => calculated_size,
ControlFlow::Break(()) => return,
};
// we cannot query current_logical_size.current_size() to know the current
// *negative* value, only truncated to u64.
let added = self
.current_logical_size
.size_added_after_initial
.load(AtomicOrdering::Relaxed);
let sum = calculated_size.saturating_add_signed(added);
// set the gauge value before it can be set in `update_current_logical_size`.
self.metrics.current_logical_size_gauge.set(sum);
self.current_logical_size
.initial_logical_size
.set((calculated_size,))
.set((calculated_size, metrics_guard.calculation_result_saved()))
.ok()
.expect("only this task sets it");
}
@@ -3562,7 +3662,7 @@ impl Timeline {
async fn calculate_logical_size(
&self,
up_to_lsn: Lsn,
_cause: LogicalSizeCalculationCause,
cause: LogicalSizeCalculationCause,
_guard: &GateGuard,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
@@ -3581,13 +3681,20 @@ impl Timeline {
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
return Ok(size);
}
let storage_time_metrics = match cause {
LogicalSizeCalculationCause::Initial
| LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
| LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
LogicalSizeCalculationCause::EvictionTaskImitation => {
&self.metrics.imitate_logical_size_histo
}
};
let timer = storage_time_metrics.start_timer();
let logical_size = self
.get_current_logical_size_non_incremental(up_to_lsn, ctx)
.await?;
debug!("calculated logical size: {logical_size}");
timer.stop_and_record();
Ok(logical_size)
}
@@ -3596,6 +3703,21 @@ impl Timeline {
let logical_size = &self.current_logical_size;
logical_size.increment_size(delta);
// Also set the value in the prometheus gauge. Note that
// there is a race condition here: if this is is called by two
// threads concurrently, the prometheus gauge might be set to
// one value while current_logical_size is set to the
// other.
match logical_size.current_size() {
CurrentLogicalSize::Exact(ref new_current_size) => self
.metrics
.current_logical_size_gauge
.set(new_current_size.into()),
CurrentLogicalSize::Approximate(_) => {
// don't update the gauge yet, this allows us not to update the gauge back and
// forth between the initial size calculation task.
}
}
}
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
@@ -3633,8 +3755,26 @@ impl Timeline {
}
};
// TODO: remove this, there's no place in the code that updates this aux metrics.
let aux_metric =
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
let sum_of_entries = self
.directory_metrics
.iter()
.map(|v| v.load(AtomicOrdering::Relaxed))
.sum();
// Set a high general threshold and a lower threshold for the auxiliary files,
// as we can have large numbers of relations in the db directory.
const SUM_THRESHOLD: u64 = 5000;
const AUX_THRESHOLD: u64 = 1000;
if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
self.metrics
.directory_entries_count_gauge
.set(sum_of_entries);
} else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
metric.set(sum_of_entries);
}
}
async fn find_layer(
@@ -4354,6 +4494,8 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}
@@ -4473,10 +4615,17 @@ impl Timeline {
"stalling layer flushes for compaction backpressure at {l0_count} \
L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
);
let stall_timer = self
.metrics
.flush_delay_histo
.start_timer()
.record_on_drop();
tokio::select! {
_result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
if let Ok(l0) = result.as_deref() {
let delay = stall_timer.elapsed().as_secs_f64();
info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s");
}
},
_ = self.cancel.cancelled() => {},
}
@@ -4485,7 +4634,7 @@ impl Timeline {
}
// Flush the layer.
let flush_timer = Instant::now();
let flush_timer = self.metrics.flush_time_histo.start_timer();
match self.flush_frozen_layer(layer, ctx).await {
Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn),
Err(FlushLayerError::Cancelled) => {
@@ -4501,7 +4650,7 @@ impl Timeline {
break err.map(|_| ());
}
}
let flush_duration = flush_timer.elapsed();
let flush_duration = flush_timer.stop_and_record();
// Notify the tenant compaction loop if L0 compaction is needed.
let l0_count = *watch_l0.borrow();
@@ -4519,7 +4668,11 @@ impl Timeline {
"delaying layer flush by {delay:.3}s for compaction backpressure at \
{l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
);
let _delay_timer = self
.metrics
.flush_delay_histo
.start_timer()
.record_on_drop();
tokio::select! {
_ = tokio::time::sleep(flush_duration) => {},
_ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {},
@@ -4761,6 +4914,9 @@ impl Timeline {
"disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"
);
self.metrics
.disk_consistent_lsn_gauge
.set(new_value.0 as i64);
new_value != old_value
}
@@ -5246,7 +5402,7 @@ impl Timeline {
last_status: LastImageLayerCreationStatus,
yield_for_l0: bool,
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
if partitioning.parts.is_empty() {
warn!("no partitions to create image layers for");
@@ -5279,8 +5435,8 @@ impl Timeline {
let mut all_generated = true;
let total_partitions = partitioning.parts.len();
let mut partition_processed = 0;
let mut total_partitions = partitioning.parts.len();
let mut last_partition_processed = None;
let mut partition_parts = partitioning.parts.clone();
@@ -5305,7 +5461,7 @@ impl Timeline {
break; // with found=false
}
partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
total_partitions = partition_parts.len();
// Update the start key to the partition start.
start = partition_parts[0].start().unwrap();
found = true;
@@ -5322,6 +5478,7 @@ impl Timeline {
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
partition_processed += 1;
let img_range = start..partition.ranges.last().unwrap().end;
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
if compact_metadata {
@@ -5484,16 +5641,28 @@ impl Timeline {
.open_mut()?
.track_new_image_layers(&image_layers, &self.metrics);
drop_wlock(guard);
let duration = timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
if !image_layers.is_empty() {
self.update_layer_visibility().await?;
}
let total_layer_size = image_layers
.iter()
.map(|l| l.metadata().file_size)
.sum::<u64>();
if !image_layers.is_empty() {
info!(
"created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
image_layers.len(),
total_layer_size,
duration.as_secs_f64(),
partition_processed,
total_partitions
);
}
Ok((
image_layers,
@@ -6028,7 +6197,11 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcCutoffs, PageReconstructError> {
let _timer = self
.metrics
.find_gc_cutoffs_histo
.start_timer()
.record_on_drop();
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
@@ -6095,7 +6268,7 @@ impl Timeline {
guard = self.gc_lock.lock() => guard,
_ = self.cancel.cancelled() => return Ok(GcResult::default()),
};
let timer = self.metrics.garbage_collect_histo.start_timer();
fail_point!("before-timeline-gc");
@@ -6152,7 +6325,9 @@ impl Timeline {
// It is an easy way to unset it when standby disappears without adding
// more conf options.
self.standby_horizon.store(Lsn::INVALID);
self.metrics
.standby_horizon_gauge
.set(Lsn::INVALID.0 as i64);
let res = self
.gc_timeline(
@@ -6167,7 +6342,8 @@ impl Timeline {
)
.await?;
// only record successes
timer.stop_and_record();
Ok(res)
}

View File

@@ -1133,15 +1133,16 @@ impl Timeline {
// 1. L0 Compact
let l0_outcome = {
self
let timer = self.metrics.compact_time_histo.start_timer();
let l0_outcome = self
.compact_level0(
target_file_size,
options.flags.contains(CompactFlags::ForceL0Compaction),
ctx,
)
.await?
.await?;
timer.stop_and_record();
l0_outcome
};
if options.flags.contains(CompactFlags::OnlyL0Compaction) {
@@ -1238,7 +1239,8 @@ impl Timeline {
let partition_count = self.partitioning.read().0.0.parts.len();
// 4. Shard ancestor compaction
if self.get_compaction_shard_ancestor() && self.shard_identity.count >= ShardCount::new(2) {
if self.shard_identity.count >= ShardCount::new(2) {
// Limit the number of layer rewrites to the number of partitions: this means its
// runtime should be comparable to a full round of image layer creations, rather than
// being potentially much longer.
@@ -4040,7 +4042,7 @@ impl TimelineAdaptor {
key_range: &Range<Key>,
ctx: &RequestContext,
) -> Result<(), CreateImageLayersError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
let image_layer_writer = ImageLayerWriter::new(
self.timeline.conf,
@@ -4086,7 +4088,7 @@ impl TimelineAdaptor {
self.new_images.push(image_layer);
}
timer.stop_and_record();
Ok(())
}

View File

@@ -126,7 +126,7 @@ impl Timeline {
) -> ControlFlow<(), Instant> {
debug!("eviction iteration: {policy:?}");
let start = Instant::now();
let (period, _) = match policy {
let (period, threshold) = match policy {
EvictionPolicy::NoEviction => {
// check again in 10 seconds; XXX config watch mechanism
return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
@@ -159,6 +159,16 @@ impl Timeline {
period,
BackgroundLoopKind::Eviction,
);
// FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
// don't think that is a relevant fear however, and regardless the imitation should be the
// most costly part.
crate::metrics::EVICTION_ITERATION_DURATION
.get_metric_with_label_values(&[
&format!("{}", period.as_secs()),
&format!("{}", threshold.as_secs()),
])
.unwrap()
.observe(elapsed.as_secs_f64());
ControlFlow::Continue(start + period)
}
@@ -315,7 +325,7 @@ impl Timeline {
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), BackgroundLoopSemaphorePermit> {
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
let acquire_permit =
crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);
@@ -357,7 +367,7 @@ impl Timeline {
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
permit: BackgroundLoopSemaphorePermit,
permit: BackgroundLoopSemaphorePermit<'static>,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() {

View File

@@ -59,7 +59,11 @@ impl HeatmapLayersDownloader {
return;
};
tracing::info!(
resident_size=%timeline.resident_physical_size(),
heatmap_layers=%heatmap.all_layers().count(),
"Starting heatmap layers download"
);
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|layer| {
@@ -89,7 +93,7 @@ impl HeatmapLayersDownloader {
tokio::select! {
_ = stream.collect::<()>() => {
tracing::info!(
resident_size=%timeline.resident_physical_size(),
"Heatmap layers download completed"
);
},

View File

@@ -290,7 +290,7 @@ impl OpenLayerManager {
lsn: Lsn,
last_freeze_at: &AtomicLsn,
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
_metrics: &TimelineMetrics,
metrics: &TimelineMetrics,
) -> bool {
let Lsn(last_record_lsn) = lsn;
let end_lsn = Lsn(last_record_lsn + 1);
@@ -299,6 +299,10 @@ impl OpenLayerManager {
let open_layer_rc = Arc::clone(open_layer);
open_layer.freeze(end_lsn).await;
// Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
// TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
// reference to the timeline metrics. Other methods use a metrics borrow as well.
metrics.inc_frozen_layer(open_layer);
// The layer is no longer open, update the layer map to reflect this.
// We will replace it with on-disk historics below.
@@ -330,12 +334,16 @@ impl OpenLayerManager {
pub(crate) fn track_new_image_layers(
&mut self,
image_layers: &[ResidentLayer],
_metrics: &TimelineMetrics,
metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
for layer in image_layers {
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
// record these here instead of Layer::finish_creating because otherwise partial
// failure with create_image_layers would balloon up the physical size gauge. downside
// is that all layers need to be created before metrics are updated.
metrics.record_new_file_metrics(layer.layer_desc().file_size);
}
updates.flush();
}
@@ -345,13 +353,14 @@ impl OpenLayerManager {
&mut self,
delta_layer: Option<&ResidentLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
_metrics: &TimelineMetrics,
metrics: &TimelineMetrics,
) {
let inmem = self
.layer_map
.frozen_layers
.pop_front()
.expect("there must be a inmem layer to flush");
metrics.dec_frozen_layer(&inmem);
// Only one task may call this function at a time (for this
// timeline). If two tasks tried to flush the same frozen
@@ -361,6 +370,7 @@ impl OpenLayerManager {
if let Some(l) = delta_layer {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
updates.flush();
}
}
@@ -370,11 +380,12 @@ impl OpenLayerManager {
&mut self,
compact_from: &[Layer],
compact_to: &[ResidentLayer],
_metrics: &TimelineMetrics,
metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
for l in compact_to {
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
}
for l in compact_from {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
@@ -427,7 +438,7 @@ impl OpenLayerManager {
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
add_layers: &[ResidentLayer],
_metrics: &TimelineMetrics,
metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
for (old_layer, new_layer) in rewrite_layers {
@@ -458,12 +469,14 @@ impl OpenLayerManager {
&mut self.layer_fmgr,
);
metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
}
for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
for l in add_layers {
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
}
updates.flush();
}

View File

@@ -23,6 +23,7 @@ pub(super) struct LogicalSize {
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,
/// Cancellation for the best-effort logical size calculation.
@@ -129,7 +130,11 @@ impl CurrentLogicalSize {
impl LogicalSize {
pub(super) fn empty_initial() -> Self {
Self {
initial_logical_size: OnceCell::with_value((0,)),
initial_logical_size: OnceCell::with_value((0, {
crate::metrics::initial_logical_size::START_CALCULATION
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
.calculation_result_saved()
})),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
@@ -154,7 +159,7 @@ impl LogicalSize {
// ^^^ keep this type explicit so that the casts in this function break if
// we change the type.
match self.initial_logical_size.get() {
Some((initial_size, )) => {
Some((initial_size, _)) => {
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.unwrap()))
@@ -176,7 +181,7 @@ impl LogicalSize {
/// available for re-use. This doesn't contain the incremental part.
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
_ => None,
}
}

View File

@@ -39,6 +39,10 @@ use utils::postgres_client::{
use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::TaskKind;
use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};
@@ -72,6 +76,11 @@ pub(super) async fn connection_manager_loop_step(
}
}
WALRECEIVER_ACTIVE_MANAGERS.inc();
scopeguard::defer! {
WALRECEIVER_ACTIVE_MANAGERS.dec();
}
let id = TenantTimelineId {
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
timeline_id: connection_manager_state.timeline.timeline_id,
@@ -517,6 +526,9 @@ impl ConnectionManagerState {
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
WALRECEIVER_SWITCHES
.with_label_values(&[new_sk.reason.name()])
.inc();
self.drop_old_connection(true).await;
@@ -719,6 +731,8 @@ impl ConnectionManagerState {
}
};
WALRECEIVER_BROKER_UPDATES.inc();
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
@@ -728,6 +742,10 @@ impl ConnectionManagerState {
self.timeline
.standby_horizon
.store(Lsn(timeline_update.standby_horizon));
self.timeline
.metrics
.standby_horizon_gauge
.set(timeline_update.standby_horizon as i64);
}
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -745,6 +763,7 @@ impl ConnectionManagerState {
%new_safekeeper_id,
"New SK node was added",
);
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -1032,6 +1051,7 @@ impl ConnectionManagerState {
"Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
);
self.wal_connection_retries.remove(&node_id);
WALRECEIVER_CANDIDATES_REMOVED.inc();
}
}
}
@@ -1058,7 +1078,6 @@ struct NewWalConnectionCandidate {
safekeeper_id: NodeId,
wal_source_connconf: PgConnectionConfig,
availability_zone: Option<String>,
#[allow(dead_code)]
reason: ReconnectReason,
}
@@ -1087,6 +1106,18 @@ enum ReconnectReason {
},
}
impl ReconnectReason {
fn name(&self) -> &str {
match self {
ReconnectReason::NoExistingConnection => "NoExistingConnection",
ReconnectReason::LaggingWal { .. } => "LaggingWal",
ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
}
}
}
#[cfg(test)]
mod tests {
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;

View File

@@ -36,6 +36,7 @@ use wal_decoder::wire_format::FromWireFormat;
use super::TaskStateUpdate;
use crate::context::RequestContext;
use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS};
use crate::pgdatadir_mapping::DatadirModification;
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
use crate::tenant::{
@@ -136,7 +137,7 @@ pub(super) async fn handle_walreceiver_connection(
GateError::GateClosed => WalReceiverError::ClosedGate,
})?;
WALRECEIVER_STARTED_CONNECTIONS.inc();
// Connect to the database in replication mode.
info!("connecting to {wal_source_connconf:?}");
@@ -222,6 +223,10 @@ pub(super) async fn handle_walreceiver_connection(
.instrument(tracing::info_span!("poller")),
);
let _guard = LIVE_CONNECTIONS
.with_label_values(&["wal_receiver"])
.guard();
let identify = identify_system(&replication_client).await?;
info!("{identify:?}");
@@ -339,7 +344,7 @@ pub(super) async fn handle_walreceiver_connection(
let status_update = match replication_message {
ReplicationMessage::RawInterpretedWalRecords(raw) => {
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
let mut uncommitted_records = 0;
@@ -412,13 +417,21 @@ pub(super) async fn handle_walreceiver_connection(
ctx: &RequestContext,
uncommitted: &mut u64,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST.records_committed.inc_by(*uncommitted);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
Ok(())
}
if !records.is_empty() {
timeline
.metrics
.wal_records_received
.inc_by(records.len() as u64);
}
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
@@ -428,6 +441,9 @@ pub(super) async fn handle_walreceiver_connection(
let local_next_record_lsn = interpreted.next_record_lsn;
if interpreted.is_observed() {
WAL_INGEST.records_observed.inc();
}
walingest
.ingest_record(interpreted, &mut modification, &ctx)
@@ -499,9 +515,12 @@ pub(super) async fn handle_walreceiver_connection(
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
*filtered = 0;
Ok(())
@@ -515,7 +534,7 @@ pub(super) async fn handle_walreceiver_connection(
trace!("received XLogData between {startlsn} and {endlsn}");
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
waldecoder.feed_bytes(data);
{
@@ -557,6 +576,7 @@ pub(super) async fn handle_walreceiver_connection(
}
// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
@@ -572,7 +592,7 @@ pub(super) async fn handle_walreceiver_connection(
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}

View File

@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use crate::context::RequestContext;
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, Header};
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
use crate::virtual_file::{self, IoBufferMut, VirtualFile};
/// Metadata bundled with the start and end offset of a blob.
@@ -111,20 +111,18 @@ impl From<Bytes> for BufView<'_> {
pub struct VectoredBlob {
/// Blob metadata.
pub meta: BlobMeta,
/// Header start offset.
header_start: usize,
/// Data start offset.
data_start: usize,
/// Start offset.
start: usize,
/// End offset.
end: usize,
/// Compression used on the data, extracted from the header.
/// Compression used on the the blob.
compression_bits: u8,
}
impl VectoredBlob {
/// Reads a decompressed view of the blob.
pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
let view = buf.view(self.data_start..self.end);
let view = buf.view(self.start..self.end);
match self.compression_bits {
BYTE_UNCOMPRESSED => Ok(view),
@@ -142,19 +140,13 @@ impl VectoredBlob {
std::io::ErrorKind::InvalidData,
format!(
"Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}",
self.meta.key, self.meta.lsn, self.data_start, self.end
self.meta.key, self.meta.lsn, self.start, self.end
),
);
Err(error)
}
}
}
/// Returns the raw blob including header.
#[allow(unused)]
pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> {
buf.view(self.header_start..self.end)
}
}
impl std::fmt::Display for VectoredBlob {
@@ -162,7 +154,7 @@ impl std::fmt::Display for VectoredBlob {
write!(
f,
"{}@{}, {}..{}",
self.meta.key, self.meta.lsn, self.data_start, self.end
self.meta.key, self.meta.lsn, self.start, self.end
)
}
}
@@ -501,28 +493,50 @@ impl<'a> VectoredBlobReader<'a> {
let blobs_at = read.blobs_at.as_slice();
let mut blobs = Vec::with_capacity(blobs_at.len());
let start_offset = read.start;
let mut metas = Vec::with_capacity(blobs_at.len());
// Blobs in `read` only provide their starting offset. The end offset
// of a blob is implicit: the start of the next blob if one exists
// or the end of the read.
for (blob_start, meta) in blobs_at.iter().copied() {
let header_start = (blob_start - read.start) as usize;
let header = Header::decode(&buf[header_start..])?;
let data_start = header_start + header.header_len;
let end = data_start + header.data_len;
let compression_bits = header.compression_bits;
for (blob_start, meta) in blobs_at {
let blob_start_in_buf = blob_start - start_offset;
let first_len_byte = buf[blob_start_in_buf as usize];
blobs.push(VectoredBlob {
header_start,
data_start,
// Each blob is prefixed by a header containing its size and compression information.
// Extract the size and skip that header to find the start of the data.
// The size can be 1 or 4 bytes. The most significant bit is 0 in the
// 1 byte case and 1 in the 4 byte case.
let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
(1, first_len_byte as u64, BYTE_UNCOMPRESSED)
} else {
let mut blob_size_buf = [0u8; 4];
let offset_in_buf = blob_start_in_buf as usize;
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
(
4,
u32::from_be_bytes(blob_size_buf) as u64,
compression_bits,
)
};
let start = (blob_start_in_buf + size_length) as usize;
let end = start + blob_size as usize;
metas.push(VectoredBlob {
start,
end,
meta,
meta: *meta,
compression_bits,
});
}
Ok(VectoredBlobsBuf { buf, blobs })
Ok(VectoredBlobsBuf { buf, blobs: metas })
}
}
@@ -983,15 +997,6 @@ mod tests {
&read_buf[..],
"mismatch for idx={idx} at offset={offset}"
);
// Check that raw_with_header returns a valid header.
let raw = read_blob.raw_with_header(&view);
let header = Header::decode(&raw)?;
if !compression || header.header_len == 1 {
assert_eq!(header.compression_bits, BYTE_UNCOMPRESSED);
}
assert_eq!(raw.len(), header.total_len());
buf = result.buf;
}
Ok(())

View File

@@ -10,7 +10,7 @@ use pageserver_api::models::PageserverUtilization;
use utils::serde_percent::Percent;
use crate::config::PageServerConf;
use crate::metrics::NODE_UTILIZATION_SCORE;
use crate::tenant::mgr::TenantManager;
pub(crate) fn regenerate(
@@ -53,7 +53,7 @@ pub(crate) fn regenerate(
// Express a static value for how many shards we may schedule on one node
const MAX_SHARDS: u32 = 5000;
let doc = PageserverUtilization {
let mut doc = PageserverUtilization {
disk_usage_bytes: used,
free_space_bytes: free,
disk_wanted_bytes,
@@ -63,7 +63,10 @@ pub(crate) fn regenerate(
utilization_score: None,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
// Initialize `PageserverUtilization::utilization_score`
let score = doc.cached_score();
NODE_UTILIZATION_SCORE.set(score);
Ok(doc)
}

View File

@@ -27,9 +27,12 @@ use owned_buffers_io::io_buf_ext::FullSlice;
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
pub use pageserver_api::models::virtual_file as api;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
use crate::assert_u64_eq_usize::UsizeIsU64;
use crate::context::RequestContext;
use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
pub(crate) mod io_engine;
pub use io_engine::{
@@ -428,7 +431,9 @@ impl OpenFiles {
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
drop(old_file);
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::CloseByReplace)
.observe_closure_duration(|| drop(old_file));
}
// Prepare the slot for reuse and return it
@@ -527,9 +532,13 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
$($body)*
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
}}
}
@@ -904,7 +913,7 @@ impl VirtualFileInner {
&self,
buf: tokio_epoll_uring::Slice<Buf>,
offset: u64,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
where
Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -921,7 +930,9 @@ impl VirtualFileInner {
observe_duration!(StorageIoOperation::Read, {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
if let Ok(size) = res {
ctx.io_size_metrics().read.add(size.into_u64());
}
(buf, res)
})
}
@@ -942,7 +953,7 @@ impl VirtualFileInner {
&self,
buf: FullSlice<B>,
offset: u64,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> (FullSlice<B>, Result<usize, Error>) {
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
@@ -951,7 +962,9 @@ impl VirtualFileInner {
observe_duration!(StorageIoOperation::Write, {
let ((_file_guard, buf), result) =
io_engine::get().write_at(file_guard, offset, buf).await;
if let Ok(size) = result {
ctx.io_size_metrics().write.add(size.into_u64());
}
(buf, result)
})
}
@@ -1250,7 +1263,9 @@ impl Drop for VirtualFileInner {
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
if let Some(fd) = slot_guard.file.take() {
drop(fd);
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
}
}
}
@@ -1319,6 +1334,7 @@ pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: Syn
set_io_mode(mode);
io_engine::init(engine);
SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

View File

@@ -54,7 +54,14 @@ static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
pub(crate) fn set(engine_kind: IoEngineKind) {
let engine: IoEngine = engine_kind.into();
IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
#[cfg(not(test))]
{
let metric = &crate::metrics::virtual_file_io_engine::KIND;
metric.reset();
metric
.with_label_values(&[&format!("{engine_kind}")])
.set(1);
}
}
#[cfg(not(test))]

View File

@@ -82,7 +82,7 @@ pub async fn thread_local_system() -> Handle {
match res {
Ok(system) => {
info!("successfully launched system");
metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
Ok(system)
}
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
@@ -90,7 +90,7 @@ pub async fn thread_local_system() -> Handle {
info_span!("stats").in_scope(|| {
emit_launch_failure_process_stats();
});
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
Err(())
}

View File

@@ -46,6 +46,7 @@ use wal_decoder::models::*;
use crate::ZERO_PAGE;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{PageReconstructError, Timeline};
@@ -234,7 +235,7 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> Result<bool, WalIngestError> {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();
modification.set_lsn(interpreted.next_record_lsn)?;
@@ -1442,6 +1443,9 @@ impl WalIngest {
gap_blocks_filled += 1;
}
WAL_INGEST
.gap_blocks_zeroed_on_rel_extend
.inc_by(gap_blocks_filled);
// Log something when relation extends cause use to fill gaps
// with zero pages. Logging is rate limited per pg version to

View File

@@ -40,7 +40,10 @@ use utils::sync::gate::GateError;
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
};
/// The real implementation that uses a Postgres process to
/// perform WAL replay.
@@ -350,7 +353,7 @@ impl PostgresRedoManager {
}
},
Err(permit) => {
let start = Instant::now();
// acquire guard before spawning process, so that we don't spawn new processes
// if the gate is already closed.
let _launched_processes_guard = match self.launched_processes.enter() {
@@ -368,9 +371,13 @@ impl PostgresRedoManager {
.context("launch walredo process")?,
_launched_processes_guard,
});
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
elapsed_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
proc
@@ -464,7 +471,9 @@ impl PostgresRedoManager {
}
});
WAL_REDO_TIME.observe(duration.as_secs_f64());
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
@@ -529,7 +538,9 @@ impl PostgresRedoManager {
}
// Success!
let duration = start_time.elapsed();
// FIXME: using the same metric here creates a bimodal distribution by default, and because
// there could be multiple batch sizes this would be N+1 modal.
WAL_REDO_TIME.observe(duration.as_secs_f64());
debug!(
"neon applied {} WAL records in {} us to reconstruct page image at LSN {}",

View File

@@ -21,7 +21,7 @@ use utils::poison::Poison;
use self::no_leak_child::NoLeakChild;
use crate::config::PageServerConf;
use crate::metrics:: WalRedoKillCause;
use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause};
use crate::page_cache::PAGE_SZ;
use crate::span::debug_assert_current_span_has_tenant_id;
@@ -97,6 +97,7 @@ impl WalRedoProcess {
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
@@ -117,7 +118,12 @@ impl WalRedoProcess {
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
@@ -225,7 +231,7 @@ impl WalRedoProcess {
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let Ok(res) =
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await

View File

@@ -5,7 +5,7 @@ use std::process::{Child, Command};
use pageserver_api::shard::TenantShardId;
use tracing::{error, info, instrument};
use crate::metrics::WalRedoKillCause;
use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause};
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
@@ -47,6 +47,9 @@ impl NoLeakChild {
#[instrument(skip_all, fields(pid=child.id(), ?cause))]
pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
scopeguard::defer! {
WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
}
let res = child.kill();
if let Err(e) = res {
// This branch is very unlikely because:

View File

@@ -95,7 +95,7 @@ static uint32 local_request_counter;
* Various settings related to prompt (fast) handling of PageStream responses
* at any CHECK_FOR_INTERRUPTS point.
*/
int readahead_getpage_pull_timeout_ms = 50;
int readahead_getpage_pull_timeout_ms = 0;
static int PS_TIMEOUT_ID = 0;
static bool timeout_set = false;
static bool timeout_signaled = false;

View File

@@ -75,7 +75,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 3;
int neon_protocol_version = 2;
static int neon_compute_mode = 0;
static int max_reconnect_attempts = 60;
@@ -1362,7 +1362,7 @@ pg_init_libpagestore(void)
"",
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
check_neon_id, NULL, NULL);
DefineCustomStringVariable("neon.branch_id",
"Neon branch_id the server is running on",
NULL,
@@ -1370,7 +1370,7 @@ pg_init_libpagestore(void)
"",
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
check_neon_id, NULL, NULL);
DefineCustomStringVariable("neon.endpoint_id",
"Neon endpoint_id the server is running on",
NULL,
@@ -1378,7 +1378,7 @@ pg_init_libpagestore(void)
"",
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
check_neon_id, NULL, NULL);
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
@@ -1432,7 +1432,7 @@ pg_init_libpagestore(void)
"PageStream connection when we have pages which "
"were read ahead but not yet received.",
&readahead_getpage_pull_timeout_ms,
50, 0, 5 * 60 * 1000,
0, 0, 5 * 60 * 1000,
PGC_USERSET,
GUC_UNIT_MS,
NULL, NULL, NULL);
@@ -1440,7 +1440,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
3, /* use protocol version 3 */
2, /* use protocol version 2 */
2, /* min */
3, /* max */
PGC_SU_BACKEND,

View File

@@ -2040,7 +2040,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
/*
* neon_end_unlogged_build() -- Finish an unlogged rel build.
*
* Call this after you have finished WAL-logging a relation that was
* Call this after you have finished WAL-logging an relation that was
* first populated without WAL-logging.
*
* This removes the local copy of the rel, since it's now been fully
@@ -2059,35 +2059,14 @@ neon_end_unlogged_build(SMgrRelation reln)
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
{
XLogRecPtr recptr;
BlockNumber nblocks;
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* Update the last-written LSN cache.
*
* The relation is still on local disk so we can get the size by
* calling mdnblocks() directly. For the LSN, GetXLogInsertRecPtr() is
* very conservative. If we could assume that this function is called
* from the same backend that WAL-logged the contents, we could use
* XactLastRecEnd here. But better safe than sorry.
*/
nblocks = mdnblocks(reln, MAIN_FORKNUM);
recptr = GetXLogInsertRecPtr();
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
/* Make the relation look permanent again */
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
/* Remove local copy */
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",

View File

@@ -890,7 +890,7 @@ libpqwp_connect_start(char *conninfo)
* palloc will exit on failure though, so there's not much we could do if
* it *did* fail.
*/
conn = (WalProposerConn*)MemoryContextAllocZero(TopMemoryContext, sizeof(WalProposerConn));
conn = palloc(sizeof(WalProposerConn));
conn->pg_conn = pg_conn;
conn->is_nonblocking = false; /* connections always start in blocking
* mode */

View File

@@ -947,6 +947,8 @@ class NeonEnvBuilder:
continue
if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
continue
if FINAL_METRICS_FILE_NAME == test_file.name:
continue
log.debug(f"Removing large database {test_file} file")
test_file.unlink()
elif test_entry.is_dir():
@@ -2987,7 +2989,7 @@ class NeonPageserver(PgProtocol, LogUtils):
return
metrics = self.http_client().get_metrics_str()
metrics_snapshot_path = self.workdir / "final_metrics.txt"
metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME
with open(metrics_snapshot_path, "w") as f:
f.write(metrics)
@@ -5154,6 +5156,9 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
)
FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
SKIP_DIRS = frozenset(
(
"pg_wal",

View File

@@ -258,7 +258,7 @@ def get_scale_for_db(size_mb: int) -> int:
ATTACHMENT_NAME_REGEX: re.Pattern[str] = re.compile(
r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)|final_metrics.txt"
r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
)

View File

@@ -29,7 +29,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
log.info("wait for all tenants to become active")
wait_until_all_tenants_state(
ps_http, "Active", iterations=10 + n_tenants, period=10, http_error_ok=False
ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
)
# ensure all layers are resident for predictiable performance
@@ -40,7 +40,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
for layer in info.historic_layers:
assert not layer.remote
env.storage_controller.reconcile_until_idle(timeout_secs=160)
env.storage_controller.reconcile_until_idle(timeout_secs=60)
log.info("ready")

View File

@@ -155,7 +155,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"compaction_algorithm": {
"kind": "tiered",
},
"compaction_shard_ancestor": False,
"eviction_policy": {
"kind": "LayerAccessThreshold",
"period": "20s",

View File

@@ -199,8 +199,6 @@ def test_pageserver_gc_compaction_preempt(
conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

View File

@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
NeonPageserver,
StorageControllerMigrationConfig,
flush_ep_to_pageserver,
)
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import (
@@ -998,6 +997,10 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
heatmap_after_migration = timeline_heatmap(timeline_id)
local_layers = ps_secondary.list_layers(tenant_id, timeline_id)
# We download 1 layer per second and give up within 5 seconds.
assert len(local_layers) < 10
after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"])
log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}")
@@ -1035,13 +1038,8 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
.value
)
assert before == after
# Stop the endpoint and wait until any finally written WAL propagates to
# the pageserver and is uploaded to remote storage.
flush_ep_to_pageserver(env, workload.endpoint(), tenant_id, timeline_id)
ps_secondary.http_client().timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
workload.stop()
assert before == after
# Now simulate the case where a child timeline is archived, parent layers
# are evicted and the child is unarchived. When the child is unarchived,

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.4",
"eab3a37834cac6ec0719bf817ac918a201712d66"
"66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8"
],
"v16": [
"16.8",
"37496f87b5324af53c56127e278ee5b1e8435253"
"d56e79cd5d6136c159b1d8d98acb7981d4b69364"
],
"v15": [
"15.12",
"8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3"
"aeb292eeace9072e07071254b6ffc7a74007d4d2"
],
"v14": [
"14.17",
"d3c9d61fb7a362a165dac7060819dd9d6ad68c28"
"a0391901a2af13aa029b905272a5b2024133c926"
]
}

View File

@@ -70,6 +70,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
p256 = { version = "0.13", features = ["jwk"] }
parquet = { version = "53", default-features = false, features = ["zstd"] }
pkcs8 = { version = "0.10", default-features = false, features = ["pem", "std"] }
prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
rand = { version = "0.8", features = ["small_rng"] }
regex = { version = "1" }