pageserver: suppress some of the most common spurious warnings (#5658)

Two of the most common spurious log messages:
- broker connections terminate & we log at error severity. Unfortunately
tonic gives us an "Unknown" error so to suppress these we're doing
string matching. It's hacky but worthwhile for operations.
- the first iteration of tenant background tasks tends to over-run its
schedule and emit a warning. Ultimately we should fix these to run on
time, but for now we are not benefiting from polluting our logs with the
warnings.
This commit is contained in:
John Spray
2023-10-25 14:55:37 +01:00
committed by GitHub
parent 4778b6a12e
commit 5683ae9eab
3 changed files with 38 additions and 17 deletions

View File

@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction sempahore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::Compaction,
);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let period = tenant.get_gc_period();
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction sempahore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;

View File

@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::BrokerClientChannel;
use storage_broker::Streaming;
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio::select;
use tracing::*;
@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
broker_update = broker_subscription.message() => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Err(e) => {
error!("broker subscription failed: {e}");
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
// tonic's error handling doesn't provide a clear code for disconnections: we get
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
info!("broker disconnected: {status}");
},
_ => {
warn!("broker subscription failed: {status}");
}
}
return ControlFlow::Continue(());
}
Ok(None) => {

View File

@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
use std::time::Duration;
use tonic::codegen::StdError;
use tonic::transport::{ClientTlsConfig, Endpoint};
use tonic::{transport::Channel, Code, Status};
use tonic::{transport::Channel, Status};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use proto::{
@@ -23,6 +23,7 @@ pub mod proto {
pub mod metrics;
// Re-exports to avoid direct tonic dependency in user crates.
pub use tonic::Code;
pub use tonic::Request;
pub use tonic::Streaming;