Mirror of https://github.com/neondatabase/neon.git
pageserver: suppress some of the most common spurious warnings (#5658)
Two of the most common spurious log messages:

- Broker connections terminate and we log them at error severity. Unfortunately tonic gives us an "Unknown" error, so to suppress these we're doing string matching. It's hacky, but worthwhile for operations.
- The first iteration of tenant background tasks tends to over-run its schedule and emit a warning. Ultimately we should fix these to run on time, but for now we are not benefiting from polluting our logs with the warnings.
@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 
             // TODO: we shouldn't need to await to find tenant and this could be moved outside of
             // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
             }
 
             let started_at = Instant::now();
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             };
 
-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction sempahore to run, and because of concurrent startup work like initializing
+                // logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(
+                    started_at.elapsed(),
+                    period,
+                    BackgroundLoopKind::Compaction,
+                );
+            }
 
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             {
                 break;
             }
+
+            first = false;
         }
     }
     .await;
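Taken together, the three compaction_loop hunks above leave the loop body with roughly the shape sketched below. This is an illustrative paraphrase, not the file itself: `random_init_delay`, `warn_when_period_overrun`, and `BackgroundLoopKind` are the pageserver's own helpers, stubbed here with assumed behaviour (a jittered initial sleep that fails on cancellation, and a warning when an iteration exceeds its period); the actual compaction work and the tenant handle are elided, and the period is shortened so the example terminates quickly. It assumes the `tokio` and `tokio-util` crates.

```rust
use std::time::{Duration, Instant};

use tokio_util::sync::CancellationToken;

enum BackgroundLoopKind {
    Compaction,
}

// Stand-in for the pageserver helper: sleep for part of `period`, or return Err
// if the cancellation token fires first (assumed behaviour; the real helper
// randomizes the delay).
async fn random_init_delay(period: Duration, cancel: &CancellationToken) -> Result<(), ()> {
    let delay = period.mul_f64(0.5);
    tokio::select! {
        _ = tokio::time::sleep(delay) => Ok(()),
        _ = cancel.cancelled() => Err(()),
    }
}

// Stand-in for the pageserver helper that warns when an iteration overran its period.
fn warn_when_period_overrun(elapsed: Duration, period: Duration, _kind: BackgroundLoopKind) {
    if elapsed > period {
        eprintln!("background loop overran its period: {elapsed:?} > {period:?}");
    }
}

async fn compaction_loop(cancel: CancellationToken) {
    // Short period so the example terminates quickly; the real compaction period is much longer.
    let period = Duration::from_millis(200);
    let mut first = true;
    loop {
        // Jitter only the very first iteration; stop immediately on shutdown.
        if first && random_init_delay(period, &cancel).await.is_err() {
            break;
        }

        let started_at = Instant::now();
        // ... run one compaction iteration here ...
        let sleep_duration = period;

        if !first {
            // The first iteration is expected to be slow because of startup
            // contention, so its over-run warning is suppressed.
            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
        }

        // Sleep out the rest of the period, or stop if cancelled meanwhile.
        if tokio::time::timeout(sleep_duration, cancel.cancelled())
            .await
            .is_ok()
        {
            break;
        }

        first = false;
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let canceller = cancel.clone();
    // Cancel after a second so the example exits on its own.
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_secs(1)).await;
        canceller.cancel();
    });
    compaction_loop(cancel).await;
}
```

The ordering matters: `first` is cleared only after the post-iteration sleep, so both the initial jitter and the warning suppression key off the same flag for the whole first pass.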
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 
             let period = tenant.get_gc_period();
 
-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
             }
 
             let started_at = Instant::now();
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             };
 
-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction sempahore to run, and because of concurrent startup work like initializing
+                // logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            }
 
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             {
                 break;
             }
+
+            first = false;
         }
     }
     .await;
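The gc_loop hunks apply the same treatment as compaction_loop above: the jittered start collapses into the loop condition, the over-run warning is gated on `!first` (here with `BackgroundLoopKind::Gc`), and `first` is cleared only after the post-iteration sleep, so the slow first garbage-collection pass no longer emits a warning.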
@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::BrokerClientChannel;
-use storage_broker::Streaming;
+use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio::select;
 use tracing::*;
 
@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
             broker_update = broker_subscription.message() => {
                 match broker_update {
                     Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
-                    Err(e) => {
-                        error!("broker subscription failed: {e}");
+                    Err(status) => {
+                        match status.code() {
+                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
+                                // tonic's error handling doesn't provide a clear code for disconnections: we get
+                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
+                                info!("broker disconnected: {status}");
+                            },
+                            _ => {
+                                warn!("broker subscription failed: {status}");
+                            }
+                        }
                         return ControlFlow::Continue(());
                     }
                     Ok(None) => {
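The new match arm keys on both the gRPC status code and the message text. A minimal, self-contained sketch of that classification follows; `Status::code()` and `Status::message()` are real tonic accessors, while the function name and the sample statuses are illustrative only.

```rust
use tonic::{Code, Status};

/// True when a Status looks like the benign broker disconnect described in the
/// diff: tonic surfaces it as Code::Unknown with an h2 "broken pipe" message,
/// so the walreceiver logs it at info instead of warn.
fn is_expected_disconnect(status: &Status) -> bool {
    status.code() == Code::Unknown
        && status.message().contains("stream closed because of a broken pipe")
}

fn main() {
    let disconnect = Status::new(
        Code::Unknown,
        "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe",
    );
    let other = Status::new(Code::Unavailable, "broker not reachable");

    assert!(is_expected_disconnect(&disconnect));
    assert!(!is_expected_disconnect(&other));
    println!("disconnects go to info, everything else to warn");
}
```

String matching is acknowledged as hacky in the commit message; keeping the check in a single predicate like this makes it easy to tighten or remove later.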
@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
 use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
-use tonic::{transport::Channel, Code, Status};
+use tonic::{transport::Channel, Status};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 
 use proto::{
@@ -23,6 +23,7 @@ pub mod proto {
 pub mod metrics;
 
 // Re-exports to avoid direct tonic dependency in user crates.
+pub use tonic::Code;
 pub use tonic::Request;
 pub use tonic::Streaming;
 
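The added re-export lets downstream crates name gRPC status codes without depending on tonic directly, which is what the walreceiver import change above relies on. A hypothetical consumer (crate layout and function name invented for illustration) would simply write:

```rust
// Hypothetical consumer crate: depends on storage_broker, not on tonic itself.
use storage_broker::Code;

fn describe(code: Code) -> &'static str {
    match code {
        Code::Unknown => "likely a dropped broker connection",
        Code::Unavailable => "broker temporarily unreachable",
        _ => "some other gRPC failure",
    }
}

fn main() {
    println!("{}", describe(Code::Unknown));
}
```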