diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 87e8df2ab6..c38af9cb80 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -33,7 +33,6 @@ use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; -use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. @@ -1400,6 +1399,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1411,6 +1412,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1422,7 +1439,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1433,6 +1471,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1550,6 +1590,20 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. #[derive(Serialize, Deserialize, Debug)] @@ -1569,7 +1623,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { - bytes.put_u8(0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1580,7 +1634,7 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1591,7 +1645,7 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1603,7 +1657,7 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1611,13 +1665,24 @@ impl PagestreamFeMessage { } Self::GetSlruSegment(req) => { - bytes.put_u8(4); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() @@ -1645,56 +1710,66 @@ impl PagestreamFeMessage { ), }; - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? + { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, @@ -1705,7 +1780,21 @@ impl PagestreamFeMessage { segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? + }, + })), } } } @@ -1748,6 +1837,15 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } PagestreamProtocolVersion::V3 => { @@ -1816,6 +1914,18 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } } @@ -1958,6 +2068,28 @@ impl PagestreamBeMessage { segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1977,6 +2109,8 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9195951191..9c835c956b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -114,3 +114,7 @@ harness = false [[bench]] name = "upload_queue" harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..f582d307a7 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 207ec4166c..27280912b4 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -62,15 +64,28 @@ impl Client { .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +112,28 @@ impl Client { /// Create using [`Client::pagestream`]. pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +146,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +167,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. // => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? + pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? + pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +246,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..c1ce332b6c --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,65 @@ +use std::{ + io::{stdin, stdout, Read, Write}, + time::Duration, +}; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); + break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b1cbbad63..3c4830e3cd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1463,6 +1463,8 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } pub(crate) struct SmgrQueryTimePerTimeline { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da4180a927..b14a44f9e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -555,37 +555,52 @@ struct BatchedGetPageRequest { timer: SmgrOpTimer, } +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, error: BatchedPageStreamError, @@ -606,6 +621,12 @@ impl BatchedFeMessage { page.timer.observe_execution_start(at); } } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } BatchedFeMessage::RespondError { .. } => {} } } @@ -735,7 +756,7 @@ impl PageServerHandler { BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -754,7 +775,7 @@ impl PageServerHandler { BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -773,7 +794,7 @@ impl PageServerHandler { BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -792,7 +813,7 @@ impl PageServerHandler { BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -844,6 +865,7 @@ impl PageServerHandler { ) .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, @@ -861,11 +883,27 @@ impl PageServerHandler { }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_test_request"); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = + record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], + } + } }; Ok(Some(batched_msg)) } @@ -907,9 +945,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -928,6 +964,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }), + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. + }, + ) if (|| { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return false; + } + if !accum_shard.is_same_handle_as(&this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return false; + } + true + })() => + { + // ok to batch + accum_requests.extend(this_requests); + Ok(()) + } // something batched already but this message is unbatchable (_, this_msg) => { // by default, don't continue batching @@ -969,7 +1043,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::exists"); ( vec![self - .handle_get_rel_exists_request(&shard, &req, ctx) + .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -986,7 +1060,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); ( vec![self - .handle_get_nblocks_request(&shard, &req, ctx) + .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1007,7 +1081,7 @@ impl PageServerHandler { trace!(npages, "handling getpage request"); let res = self .handle_get_page_at_lsn_request_batched( - &shard, + &*shard.upgrade()?, effective_request_lsn, pages, ctx, @@ -1029,7 +1103,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); ( vec![self - .handle_db_size_request(&shard, &req, ctx) + .handle_db_size_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1046,7 +1120,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); ( vec![self - .handle_get_slru_segment_request(&shard, &req, ctx) + .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1054,6 +1128,27 @@ impl PageServerHandler { span, ) } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } BatchedFeMessage::RespondError { span, error } => { // We've already decided to respond with an error, so we don't need to // call the handler. @@ -1791,6 +1886,51 @@ impl PageServerHandler { )) } + // NB: this impl mimics what we do for batched getpage requests. + #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -2406,6 +2546,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 47c4a8637d..a006647785 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f24611e1d8..2ba71416b8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,6 +76,7 @@ use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, + page_service::TenantManagerTypes, tenant::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, @@ -431,7 +432,7 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, @@ -4625,6 +4626,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..35d8c75ce1 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows +//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. //! -//! The `Handle` is just a wrapper around an `Arc`. +//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. +//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! -//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. -//! The `Cache` stores a `Weak` for each cached Timeline. +//! The `HandleInner` is allocated as a `Arc>` and +//! referenced weakly and strongly from various places which we are now illustrating. +//! For brevity, we will omit the `Arc>` part in the following and instead +//! use `strong ref` and `weak ref` when referring to the `Arc>` +//! or `Weak>`, respectively. +//! +//! - The `Handle` is a strong ref. +//! - The `WeakHandle` is a weak ref. +//! - The `PerTimelineState` contains a `HashMap`. +//! - The `Cache` is a `HashMap`. +//! +//! Lifetimes: +//! - `WeakHandle` and `Handle`: single pagestream request. +//! - `Cache`: single page service connection. +//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`). +//! +//! ## Request Handling Flow (= filling and using the `Cache``) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and construct an -//! `Arc`. We store a `Weak` in the cache -//! and the `Arc` in the `PerTimelineState`. +//! resulting in an `Arc`. We enter its gate _once_ and store it in the the +//! `Arc>>`. A weak ref is stored in the `Cache` +//! and a strong ref in the `PerTimelineState`. +//! A strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing -//! and find the `Weak` in the cache. -//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! and find the weak ref in the cache. +//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! -//! The request handler dispatches the request to the right `>::$request_method`. +//! The pagestream processing is pipelined and involves a batching step. +//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. +//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` +//! and the request handler dispatches the request to the right `>::$request_method`. //! It then drops the `Handle`, which drops the `Arc`. //! -//! # Memory Management / How The Reference Cycle Is Broken +//! # Performance //! -//! The attentive reader may have noticed the strong reference cycle -//! from `Arc` to `PerTimelineState` to `Arc`. +//! Remember from the introductory section: //! -//! This cycle is intentional: while it exists, the `Cache` can upgrade its -//! `Weak` to an `Arc` in a single atomic operation. +//! > However, we want to avoid the overhead of entering the gate for every +//! > method invocation. +//! +//! Why do we want to avoid that? +//! Because the gate is a shared location in memory and entering it involves +//! bumping refcounts, which leads to cache contention if done frequently +//! from multiple cores in parallel. +//! +//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. +//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! (Review the "Data Structures" section if that is unclear to you.) +//! +//! A `WeakHandle` is a weak ref to the `HandleInner`. +//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and +//! further acquire an additional strong ref to the `Arc` inside it. +//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. +//! +//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. +//! Again, this is cheap because the `Arc` is private to the connection. +//! +//! In addition to the GateGuard, we need to provide `Deref` impl. +//! For this, both `Handle` need infallible access to an `Arc`. +//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention +//! on the shared memory location that trakcs the refcount of the `Arc`. +//! Instead, we wrap the `Arc` into another `Arc`. +//! so that we can clone it cheaply when upgrading a `WeakHandle`. +//! +//! # Shutdown +//! +//! The attentive reader may have noticed the following reference cycle around the `Arc`: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! ``` +//! +//! Further, there is this cycle: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! ``` +//! +//! The former cycle is a memory leak if not broken. +//! The latter cycle further prevents the Timeline from shutting down +//! because we certainly won't drop the Timeline while the GateGuard is alive. +//! Preventing shutdown is the whole point of this handle/cache system, +//! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either -//! - `PerTimelineState::shutdown` or -//! - dropping the `Cache`. +//! - Timeline shutdown (=> `PerTimelineState::shutdown`) +//! - Connection shutdown (=> dropping the `Cache`). //! -//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the +//! `Arc`. +//! +//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, +//! thereby breaking the cycle. +//! It also initiates draining of already existing `Handle`s by +//! poisoning things so that no new `HandleInner`'s can be added +//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. +//! +//! Concurrently existing / already upgraded `Handle`s will extend the +//! lifetime of the `Arc>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not -//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, -//! that extension of the cycle is bounded. +//! handed out from `Cache::get` or `WeakHandle::upgrade` after +//! `PerTimelineState::shutdown`, that extension of the cycle is bounded. +//! +//! Concurrently existing `WeakHandle`s will fail to `upgrade()`: +//! while they will succeed in upgrading `Weak>`, +//! they will find the inner in state `HandleInner::ShutDown` state where the +//! `Arc` and Timeline has already been dropped. +//! +//! Dropping the `Cache` undoes the registration of this `Cache`'s +//! `HandleInner`s from all the `PerTimelineState`s, i.e., it +//! removes the strong ref to each of its `HandleInner`s +//! from all the `PerTimelineState`. +//! +//! # Locking Rules +//! +//! To prevent deadlocks we: +//! +//! 1. Only ever hold one of the locks at a time. +//! 2. Don't add more than one Drop impl that locks on the +//! cycles above. +//! +//! As per (2), that impl is in `Drop for Cache`. //! //! # Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! -//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s. //! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct @@ -87,18 +184,18 @@ //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! -//! If the lookup is successful and the `Weak` can be upgraded, +//! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! -//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The insight is that cache invalidation is sufficient and most efficiently if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. We're doing a shard split and new traffic should be routed to the child shards. //! -//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is toally fine to keep dispatching requests @@ -107,8 +204,6 @@ use std::collections::hash_map; use std::collections::HashMap; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::sync::Weak; @@ -152,7 +247,7 @@ pub(crate) struct Cache { map: Map, } -type Map = HashMap>>; +type Map = HashMap>; impl Default for Cache { fn default() -> Self { @@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId { } /// See module-level comment. -pub(crate) struct Handle(Arc>); -struct HandleInner { - shut_down: AtomicBool, - timeline: T::Timeline, - // The timeline's gate held open. - _gate_guard: utils::sync::gate::GateGuard, +pub(crate) struct Handle { + timeline: Arc, + #[allow(dead_code)] // the field exists to keep the gate open + gate_guard: Arc, + inner: Arc>>, +} +pub(crate) struct WeakHandle { + inner: Weak>>, +} +enum HandleInner { + KeepingTimelineGateOpen { + #[allow(dead_code)] + gate_guard: Arc, + timeline: Arc, + }, + ShutDown, } /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. @@ -183,7 +288,8 @@ struct HandleInner { /// See module-level comment for details. pub struct PerTimelineState { // None = shutting down - handles: Mutex>>>>, + #[allow(clippy::type_complexity)] + handles: Mutex>>>>>, } impl Default for PerTimelineState { @@ -243,49 +349,24 @@ impl Cache { shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetError> { - // terminates because each iteration removes an element from the map - loop { - let handle = self - .get_impl(timeline_id, shard_selector, tenant_manager) - .await?; - if handle.0.shut_down.load(Ordering::Relaxed) { - let removed = self - .map - .remove(&handle.0.timeline.shard_timeline_id()) - .expect("invariant of get_impl is that the returned handle is in the map"); - assert!( - Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), - "shard_timeline_id() incorrect?" - ); - } else { - return Ok(handle); - } - } - } - - #[instrument(level = "trace", skip_all)] - async fn get_impl( - &mut self, - timeline_id: TimelineId, - shard_selector: ShardSelector, - tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - let miss: ShardSelector = { + // terminates because when every iteration we remove an element from the map + let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { - Some(upgraded) => return Ok(Handle(upgraded)), - None => { + Ok(upgraded) => return Ok(upgraded), + Err(HandleUpgradeError::ShutDown) => { + // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); - ShardSelector::Known(key.shard_index) + continue; } }, - None => ShardSelector::Known(key.shard_index), + None => break ShardSelector::Known(key.shard_index), }, - RoutingResult::NeedConsultTenantManager => shard_selector, + RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, tenant_manager).await @@ -302,7 +383,7 @@ impl Cache { let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; - let Some(first_handle) = first_handle.upgrade() else { + let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; @@ -310,7 +391,7 @@ impl Cache { continue; }; - let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, @@ -329,11 +410,11 @@ impl Cache { }; let first_handle_shard_timeline_id = ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), - timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { - return RoutingResult::FastPath(Handle(first_handle)); + return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } @@ -357,23 +438,30 @@ impl Cache { ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } - let gate_guard = match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }; trace!("creating new HandleInner"); - let handle = Arc::new( - // TODO: global metric that keeps track of the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. - HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +469,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +481,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +489,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +566,54 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. - handle.shut_down.store(true, Ordering::Relaxed); + let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); + lock_guard.shutdown(); } drop(handles); } } -impl std::ops::Deref for Handle { - type Target = T::Timeline; - fn deref(&self) -> &Self::Target { - &self.0.timeline - } -} - -#[cfg(test)] -impl Drop for HandleInner { - fn drop(&mut self) { - trace!("HandleInner dropped"); - } -} - // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. impl Drop for Cache { fn drop(&mut self) { - for (_, weak) in self.map.drain() { - if let Some(strong) = weak.upgrade() { - // handle is still being kept alive in PerTimelineState - let timeline = strong.timeline.per_timeline_state(); - let mut handles = timeline.handles.lock().expect("mutex poisoned"); - if let Some(handles) = &mut *handles { - let Some(removed) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. - continue; - }; - assert!(Arc::ptr_eq(&removed, &strong)); - } + for ( + _, + WeakHandle { + inner: handle_inner_weak, + }, + ) in self.map.drain() + { + let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { + continue; + }; + let handle_timeline = handle_inner_arc + // locking rules: drop lock before acquiring other lock below + .lock() + .expect("poisoned") + .shutdown(); + let per_timeline_state = handle_timeline.per_timeline_state(); + let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); + let Some(handles) = &mut *handles_lock_guard else { + continue; + }; + let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + continue; + }; + drop(handles_lock_guard); // locking rules: remember them when! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,)); + } + } +} + +impl HandleInner { + fn shutdown(&mut self) -> Arc { + match std::mem::replace(self, HandleInner::ShutDown) { + HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline, + HandleInner::ShutDown => { + unreachable!("handles are only shut down once in their lifetime"); } } } @@ -474,6 +621,8 @@ impl Drop for Cache { #[cfg(test)] mod tests { + use std::sync::Weak; + use pageserver_api::{ key::{rel_block_to_key, Key, DBDIR_KEY}, models::ShardParameters, @@ -583,39 +732,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +763,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +791,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +809,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1049,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! { _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1061,75 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". + assert_eq!(Arc::strong_count(&shard0), refcount_start); + } } diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index b751235595..7b748d7252 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,6 +44,8 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; typedef uint64 NeonRequestId; diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. + +import subprocess +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct +@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"]) +def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str): + def patch_pageserver_toml(config): + config["page_service_pipelining"] = { + "mode": "pipelined", + "max_batch_size": 32, + "execution": "concurrent-futures", + } + + neon_env_builder.pageserver_config_override = patch_pageserver_toml + env = neon_env_builder.init_start() + + log.info("make flush appear slow") + + log.info("sending requests until pageserver accepts no more") + # TODO: extract this into a helper, like subprocess_capture, + # so that we capture the stderr from the helper somewhere. + child = subprocess.Popen( + [ + neon_binpath / "test_helper_slow_client_reads", + env.pageserver.connstr(), + str(env.initial_tenant), + str(env.initial_timeline), + ], + bufsize=0, # unbuffered + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert child.stdout is not None + buf = child.stdout.read(1) + if len(buf) != 1: + raise Exception("unexpected EOF") + if buf != b"R": + raise Exception(f"unexpected data: {buf!r}") + log.info("helper reports pageserver accepts no more requests") + log.info( + "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace" + ) + + if kind == "pageserver-stop": + log.info("try to shut down the pageserver cleanly") + env.pageserver.stop() + elif kind == "tenant-detach": + log.info("try to shut down the tenant") + env.pageserver.tenant_detach(env.initial_tenant) + else: + raise ValueError(f"unexpected kind: {kind}") + + log.info("shutdown did not time out, test passed")