fix(page_service): getpage requests don't hold applied_gc_cutoff_lsn guard (#12743)

Before this PR, getpage requests wouldn't hold the
`applied_gc_cutoff_lsn` guard until they were done.

Theoretical impact: if we’re not holding the `RcuReadGuard`, gc can
theoretically concurrently delete reconstruct data that we need to
reconstruct the page.

I don't think this practically occurs in production because the odds of
it happening are quite low, especially for primary read_write computes.
But RO replicas / standby_horizon relies on correct
`applied_gc_cutofff_lsn`, so, I'm fixing this as part of the work ok
replacing standby_horizon propagation mechanism with leases (LKB-88).

The change is feature-gated with a feature flag, and evaluated once when
entering `handle_pagestream` to avoid performance impact.

For observability, we add a field to the `handle_pagestream` span, and a
slow-log to the place in `gc_loop` where it waits for the in-flight
RcuReadGuard's to drain.

refs
- fixes https://databricks.atlassian.net/browse/LKB-2572
- standby_horizon leases epic:
https://databricks.atlassian.net/browse/LKB-2572

---------

Co-authored-by: Christian Schwarz <Christian Schwarz>
This commit is contained in:
Christian Schwarz
2025-07-25 22:25:04 +02:00
committed by GitHub
parent 25718e324a
commit 19b74b8837
3 changed files with 93 additions and 7 deletions

View File

@@ -715,7 +715,7 @@ fn start_pageserver(
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
feature_resolver,
feature_resolver.clone(),
)
.context("Failed to initialize router state")?,
);
@@ -841,6 +841,7 @@ fn start_pageserver(
} else {
None
},
feature_resolver.clone(),
);
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for each request/stream.

View File

@@ -68,6 +68,7 @@ use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::feature_resolver::FeatureResolver;
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
@@ -139,6 +140,7 @@ pub fn spawn(
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
feature_resolver: FeatureResolver,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -160,6 +162,7 @@ pub fn spawn(
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
feature_resolver,
libpq_ctx,
cancel.clone(),
)
@@ -218,6 +221,7 @@ pub async fn libpq_listener_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
feature_resolver: FeatureResolver,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
@@ -261,6 +265,7 @@ pub async fn libpq_listener_main(
auth_type,
tls_config.clone(),
pipelining_config.clone(),
feature_resolver.clone(),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
@@ -303,6 +308,7 @@ async fn page_service_conn_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
feature_resolver: FeatureResolver,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -370,6 +376,7 @@ async fn page_service_conn_main(
perf_span_fields,
connection_ctx,
cancel.clone(),
feature_resolver.clone(),
gate_guard,
);
let pgbackend =
@@ -421,6 +428,8 @@ struct PageServerHandler {
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
feature_resolver: FeatureResolver,
gate_guard: GateGuard,
}
@@ -587,6 +596,15 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
}
}
/// Whether to hold the applied GC cutoff guard when processing GetPage requests.
/// This is determined once at the start of pagestream subprotocol handling based on
/// feature flags, configuration, and test conditions.
#[derive(Debug, Clone, Copy)]
enum HoldAppliedGcCutoffGuard {
Yes,
No,
}
#[derive(thiserror::Error, Debug)]
enum PageStreamError {
/// We encountered an error that should prompt the client to reconnect:
@@ -730,6 +748,7 @@ enum BatchedFeMessage {
GetPage {
span: Span,
shard: WeakHandle<TenantManagerTypes>,
applied_gc_cutoff_guard: Option<RcuReadGuard<Lsn>>,
pages: SmallVec<[BatchedGetPageRequest; 1]>,
batch_break_reason: GetPageBatchBreakReason,
},
@@ -909,6 +928,7 @@ impl PageServerHandler {
perf_span_fields: ConnectionPerfSpanFields,
connection_ctx: RequestContext,
cancel: CancellationToken,
feature_resolver: FeatureResolver,
gate_guard: GateGuard,
) -> Self {
PageServerHandler {
@@ -920,6 +940,7 @@ impl PageServerHandler {
cancel,
pipelining_config,
get_vectored_concurrent_io,
feature_resolver,
gate_guard,
}
}
@@ -959,6 +980,7 @@ impl PageServerHandler {
ctx: &RequestContext,
protocol_version: PagestreamProtocolVersion,
parent_span: Span,
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
) -> Result<Option<BatchedFeMessage>, QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
@@ -1196,19 +1218,27 @@ impl PageServerHandler {
})
.await?;
let applied_gc_cutoff_guard = shard.get_applied_gc_cutoff_lsn(); // hold guard
// We're holding the Handle
let effective_lsn = match Self::effective_request_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
&applied_gc_cutoff_guard,
) {
Ok(lsn) => lsn,
Err(e) => {
return respond_error!(span, e);
}
};
let applied_gc_cutoff_guard = match hold_gc_cutoff_guard {
HoldAppliedGcCutoffGuard::Yes => Some(applied_gc_cutoff_guard),
HoldAppliedGcCutoffGuard::No => {
drop(applied_gc_cutoff_guard);
None
}
};
let batch_wait_ctx = if ctx.has_perf_span() {
Some(
@@ -1229,6 +1259,7 @@ impl PageServerHandler {
BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
applied_gc_cutoff_guard,
pages: smallvec![BatchedGetPageRequest {
req,
timer,
@@ -1329,13 +1360,28 @@ impl PageServerHandler {
match (eligible_batch, this_msg) {
(
BatchedFeMessage::GetPage {
pages: accum_pages, ..
pages: accum_pages,
applied_gc_cutoff_guard: accum_applied_gc_cutoff_guard,
..
},
BatchedFeMessage::GetPage {
pages: this_pages, ..
pages: this_pages,
applied_gc_cutoff_guard: this_applied_gc_cutoff_guard,
..
},
) => {
accum_pages.extend(this_pages);
// the minimum of the two guards will keep data for both alive
match (&accum_applied_gc_cutoff_guard, this_applied_gc_cutoff_guard) {
(None, None) => (),
(None, Some(this)) => *accum_applied_gc_cutoff_guard = Some(this),
(Some(_), None) => (),
(Some(accum), Some(this)) => {
if **accum > *this {
*accum_applied_gc_cutoff_guard = Some(this);
}
}
};
Ok(())
}
#[cfg(feature = "testing")]
@@ -1650,6 +1696,7 @@ impl PageServerHandler {
BatchedFeMessage::GetPage {
span,
shard,
applied_gc_cutoff_guard,
pages,
batch_break_reason,
} => {
@@ -1669,6 +1716,7 @@ impl PageServerHandler {
.instrument(span.clone())
.await;
assert_eq!(res.len(), npages);
drop(applied_gc_cutoff_guard);
res
},
span,
@@ -1750,7 +1798,7 @@ impl PageServerHandler {
/// Coding discipline within this function: all interaction with the `pgb` connection
/// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`].
/// This is so that we can shutdown page_service quickly.
#[instrument(skip_all)]
#[instrument(skip_all, fields(hold_gc_cutoff_guard))]
async fn handle_pagerequests<IO>(
&mut self,
pgb: &mut PostgresBackend<IO>,
@@ -1796,6 +1844,30 @@ impl PageServerHandler {
.take()
.expect("implementation error: timeline_handles should not be locked");
// Evaluate the expensive feature resolver check once per pagestream subprotocol handling
// instead of once per GetPage request. This is shared between pipelined and serial paths.
let hold_gc_cutoff_guard = if cfg!(test) || cfg!(feature = "testing") {
HoldAppliedGcCutoffGuard::Yes
} else {
// Use the global feature resolver with the tenant ID directly, avoiding the need
// to get a timeline/shard which might not be available on this pageserver node.
let empty_properties = std::collections::HashMap::new();
match self.feature_resolver.evaluate_boolean(
"page-service-getpage-hold-applied-gc-cutoff-guard",
tenant_id,
&empty_properties,
) {
Ok(()) => HoldAppliedGcCutoffGuard::Yes,
Err(_) => HoldAppliedGcCutoffGuard::No,
}
};
// record it in the span of handle_pagerequests so that both the request_span
// and the pipeline implementation spans contains the field.
Span::current().record(
"hold_gc_cutoff_guard",
tracing::field::debug(&hold_gc_cutoff_guard),
);
let request_span = info_span!("request");
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
@@ -1809,6 +1881,7 @@ impl PageServerHandler {
pipelining_config,
protocol_version,
io_concurrency,
hold_gc_cutoff_guard,
&ctx,
)
.await
@@ -1823,6 +1896,7 @@ impl PageServerHandler {
request_span,
protocol_version,
io_concurrency,
hold_gc_cutoff_guard,
&ctx,
)
.await
@@ -1851,6 +1925,7 @@ impl PageServerHandler {
request_span: Span,
protocol_version: PagestreamProtocolVersion,
io_concurrency: IoConcurrency,
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -1872,6 +1947,7 @@ impl PageServerHandler {
ctx,
protocol_version,
request_span.clone(),
hold_gc_cutoff_guard,
)
.await;
let msg = match msg {
@@ -1919,6 +1995,7 @@ impl PageServerHandler {
pipelining_config: PageServicePipeliningConfigPipelined,
protocol_version: PagestreamProtocolVersion,
io_concurrency: IoConcurrency,
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
ctx: &RequestContext,
) -> (
(PostgresBackendReader<IO>, TimelineHandles),
@@ -2022,6 +2099,7 @@ impl PageServerHandler {
&ctx,
protocol_version,
request_span.clone(),
hold_gc_cutoff_guard,
)
.await;
let Some(read_res) = read_res.transpose() else {
@@ -2068,6 +2146,7 @@ impl PageServerHandler {
pages,
span: _,
shard: _,
applied_gc_cutoff_guard: _,
batch_break_reason: _,
} = &mut batch
{

View File

@@ -70,7 +70,7 @@ use tracing::*;
use utils::generation::Generation;
use utils::guard_arc_swap::GuardArcSwap;
use utils::id::TimelineId;
use utils::logging::{MonitorSlowFutureCallback, monitor_slow_future};
use utils::logging::{MonitorSlowFutureCallback, log_slow, monitor_slow_future};
use utils::lsn::{AtomicLsn, Lsn, RecordLsn};
use utils::postgres_client::PostgresClientProtocol;
use utils::rate_limit::RateLimit;
@@ -6898,7 +6898,13 @@ impl Timeline {
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
let waitlist_wait_fut = std::pin::pin!(waitlist.wait());
log_slow(
"applied_gc_cutoff waitlist wait",
Duration::from_secs(30),
waitlist_wait_fut,
)
.await;
info!("GC starting");