From f1eb7032569c35ec47806c5e736486508d559439 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:35:21 -0400 Subject: [PATCH] fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there are too many wasted round-trip time for writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes too long time retrieving all aux files that it did not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup. * Log how many aux files did we read + the time spent on it. Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 21 +++++++++++++++++---- pageserver/src/page_service.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? - { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone)