Merge pull request #3758 from neondatabase/releases/2023-03-07

Release 2023-03-07
2026-06-04 05:50:38 +00:00 · 2023-03-07 12:38:47 +01:00
parent 51d9dfeaa3 ca85646df4
commit bc0ab741af
4 changed files with 90 additions and 68 deletions
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,11 +6,13 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::path::Path;
+use std::time::Duration;

 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{error, info, warn};
+
+use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
@@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
    fs::File::open(path).await?.sync_all().await
 }

+static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
+
 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
 /// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>(
            // TODO: this doesn't use the cached fd for some reason?
            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
                format!(
-                    "Failed to create a destination file for layer '{}'",
+                    "create a destination file for layer '{}'",
                    temp_file_path.display()
                )
            })
            .map_err(DownloadError::Other)?;
            let mut download = storage.download(&remote_path).await.with_context(|| {
                format!(
-                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                    "open a download stream for layer with remote storage path '{remote_path:?}'"
                )
            })
            .map_err(DownloadError::Other)?;
-            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-            })
-            .map_err(DownloadError::Other)?;
+
+            let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
+                .await
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+                .with_context(|| {
+                    format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+                })
+                .map_err(DownloadError::Other)?;
+
            Ok((destination_file, bytes_amount))
+
        },
        &format!("download {remote_path:?}"),
    ).await?;
@@ -300,7 +310,7 @@ where
            }
            Err(DownloadError::Other(ref err)) => {
                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
                return result;
            }
        }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3819,7 +3819,7 @@ impl Timeline {
                    remote_layer.ongoing_download.close();
                } else {
                    // Keep semaphore open. We'll drop the permit at the end of the function.
-                    info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
+                    error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
                }

                // Don't treat it as an error if the task that triggered the download
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -256,52 +256,53 @@ impl PostgresRedoManager {
        pg_version: u32,
    ) -> Result<Bytes, WalRedoError> {
        let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
-
+        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let start_time = Instant::now();
+        let mut n_attempts = 0u32;
+        loop {
+            let mut proc = self.stdin.lock().unwrap();
+            let lock_time = Instant::now();

-        let mut proc = self.stdin.lock().unwrap();
-        let lock_time = Instant::now();
+            // launch the WAL redo process on first use
+            if proc.is_none() {
+                self.launch(&mut proc, pg_version)?;
+            }
+            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

-        // launch the WAL redo process on first use
-        if proc.is_none() {
-            self.launch(&mut proc, pg_version)?;
-        }
-        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
+            let result = self
+                .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
+                .map_err(WalRedoError::IoError);

-        // Relational WAL records are applied using wal-redo-postgres
-        let buf_tag = BufferTag { rel, blknum };
-        let result = self
-            .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout)
-            .map_err(WalRedoError::IoError);
+            let end_time = Instant::now();
+            let duration = end_time.duration_since(lock_time);

-        let end_time = Instant::now();
-        let duration = end_time.duration_since(lock_time);
+            let len = records.len();
+            let nbytes = records.iter().fold(0, |acumulator, record| {
+                acumulator
+                    + match &record.1 {
+                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                    }
+            });

-        let len = records.len();
-        let nbytes = records.iter().fold(0, |acumulator, record| {
-            acumulator
-                + match &record.1 {
-                    NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                    _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                }
-        });
+            WAL_REDO_TIME.observe(duration.as_secs_f64());
+            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

-        WAL_REDO_TIME.observe(duration.as_secs_f64());
-        WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-        WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+            debug!(
+				"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+				len,
+				nbytes,
+				duration.as_micros(),
+				lsn
+			);

-        debug!(
-            "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-            len,
-            nbytes,
-            duration.as_micros(),
-            lsn
-        );
-
-        // If something went wrong, don't try to reuse the process. Kill it, and
-        // next request will launch a new one.
-        if result.is_err() {
-            error!(
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if result.is_err() {
+                error!(
                "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
                records.len(),
 				records.first().map(|p| p.0).unwrap_or(Lsn(0)),
@@ -310,24 +311,28 @@ impl PostgresRedoManager {
 				base_img_lsn,
                lsn
            );
-            // self.stdin only holds stdin & stderr as_raw_fd().
-            // Dropping it as part of take() doesn't close them.
-            // The owning objects (ChildStdout and ChildStderr) are stored in
-            // self.stdout and self.stderr, respsectively.
-            // We intentionally keep them open here to avoid a race between
-            // currently running `apply_wal_records()` and a `launch()` call
-            // after we return here.
-            // The currently running `apply_wal_records()` must not read from
-            // the newly launched process.
-            // By keeping self.stdout and self.stderr open here, `launch()` will
-            // get other file descriptors for the new child's stdout and stderr,
-            // and hence the current `apply_wal_records()` calls will observe
-            //  `output.stdout.as_raw_fd() != stdout_fd` .
-            if let Some(proc) = self.stdin.lock().unwrap().take() {
-                proc.child.kill_and_wait();
+                // self.stdin only holds stdin & stderr as_raw_fd().
+                // Dropping it as part of take() doesn't close them.
+                // The owning objects (ChildStdout and ChildStderr) are stored in
+                // self.stdout and self.stderr, respsectively.
+                // We intentionally keep them open here to avoid a race between
+                // currently running `apply_wal_records()` and a `launch()` call
+                // after we return here.
+                // The currently running `apply_wal_records()` must not read from
+                // the newly launched process.
+                // By keeping self.stdout and self.stderr open here, `launch()` will
+                // get other file descriptors for the new child's stdout and stderr,
+                // and hence the current `apply_wal_records()` calls will observe
+                //  `output.stdout.as_raw_fd() != stdout_fd` .
+                if let Some(proc) = self.stdin.lock().unwrap().take() {
+                    proc.child.kill_and_wait();
+                }
+            }
+            n_attempts += 1;
+            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+                return result;
            }
        }
-        result
    }

    ///
@@ -771,7 +776,7 @@ impl PostgresRedoManager {
        &self,
        mut input: MutexGuard<Option<ProcessInput>>,
        tag: BufferTag,
-        base_img: Option<Bytes>,
+        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> Result<Bytes, std::io::Error> {
@@ -787,7 +792,7 @@ impl PostgresRedoManager {
        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
-            build_push_page_msg(tag, &img, &mut writebuf);
+            build_push_page_msg(tag, img, &mut writebuf);
        }
        for (lsn, rec) in records.iter() {
            if let NeonWalRecord::Postgres {
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -191,7 +191,8 @@ pub struct SafeKeeperState {
    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
    /// of last record streamed to everyone). Persisting it helps skipping
    /// recovery in walproposer, generally we compute it from peers. In
-    /// walproposer proto called 'truncate_lsn'.
+    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
+    /// only by walproposer.
    pub peer_horizon_lsn: Lsn,
    /// LSN of the oldest known checkpoint made by pageserver and successfully
    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
@@ -682,7 +683,7 @@ where
            term: self.state.acceptor_state.term,
            vote_given: false as u64,
            flush_lsn: self.flush_lsn(),
-            truncate_lsn: self.state.peer_horizon_lsn,
+            truncate_lsn: self.inmem.peer_horizon_lsn,
            term_history: self.get_term_history(),
            timeline_start_lsn: self.state.timeline_start_lsn,
        };
@@ -878,7 +879,13 @@ where
        if msg.h.commit_lsn != Lsn(0) {
            self.update_commit_lsn(msg.h.commit_lsn)?;
        }
-        self.inmem.peer_horizon_lsn = msg.h.truncate_lsn;
+        // Value calculated by walproposer can always lag:
+        // - safekeepers can forget inmem value and send to proposer lower
+        //   persisted one on restart;
+        // - if we make safekeepers always send persistent value,
+        //   any compute restart would pull it down.
+        // Thus, take max before adopting.
+        self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn);

        // Update truncate and commit LSN in control file.
        // To avoid negative impact on performance of extra fsync, do it only