diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8c179a9add..decc82112d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -412,31 +412,53 @@ impl PageServerHandler { // TODO: We could create a new per-request context here, with unique ID. // Currently we use the same per-timeline context for all requests - let response = match neon_fe_msg { + let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists); - self.handle_get_rel_exists_request(&timeline, &req, &ctx) - .await + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + ( + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize); - self.handle_get_nblocks_request(&timeline, &req, &ctx).await + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + ( + self.handle_get_nblocks_request(&timeline, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn); - self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) - .await + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + ( + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize); - self.handle_db_size_request(&timeline, &req, &ctx).await + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + ( + self.handle_db_size_request(&timeline, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) } }; let response = response.unwrap_or_else(|e| { // print the all details to the log with {:#}, but for the client the // error message is enough - error!("error reading relation or page version: {:?}", e); + span.in_scope(|| error!("error reading relation or page version: {:#}", e)); PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) @@ -627,7 +649,6 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, @@ -648,7 +669,6 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, @@ -667,7 +687,6 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, @@ -689,7 +708,6 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c266b616e2..507b9cbac2 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -307,15 +307,17 @@ impl PostgresRedoManager { // If something went wrong, don't try to reuse the process. Kill it, and // next request will launch a new one. - if result.is_err() { + if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}", + n_attempts, + "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}: {}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), nbytes, base_img_lsn, - lsn + lsn, + utils::error::report_compact_sources(e), ); // self.stdin only holds stdin & stderr as_raw_fd(). // Dropping it as part of take() doesn't close them. @@ -333,6 +335,8 @@ impl PostgresRedoManager { if let Some(proc) = self.stdin.lock().unwrap().take() { proc.child.kill_and_wait(); } + } else if n_attempts != 0 { + info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {