Merge with master

Author: Konstantin Knizhnik
Date: 2021-04-09 15:00:33 +03:00
5 changed files with 53 additions and 11 deletions


@@ -178,6 +178,7 @@ impl PageServerNode {
.arg("--skip-recovery")
.env_clear()
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
.status()
.expect("failed to start pageserver");

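For context, a minimal standalone sketch of the pattern above (the
spawn_pageserver function and its path parameters are illustrative, not part of
this patch): after env_clear() the child process starts with an empty
environment, so PATH and LD_LIBRARY_PATH must be set explicitly or the wal-redo
postgres binary and its shared libraries will not be found.

    use std::path::Path;
    use std::process::{Command, ExitStatus};

    // Sketch only: "pageserver" and the directory parameters are placeholders.
    fn spawn_pageserver(pg_bin_dir: &Path, pg_lib_dir: &Path) -> std::io::Result<ExitStatus> {
        Command::new("pageserver")
            .env_clear()                        // start from an empty environment
            .env("PATH", pg_bin_dir)            // so the postgres binaries can be found
            .env("LD_LIBRARY_PATH", pg_lib_dir) // so their shared libraries can be found
            .status()
    }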

@@ -373,8 +373,8 @@ impl PageCache {
                 lsn
             );
             return Err(format!(
-                "Timed out while waiting for WAL record at LSN {} to arrive",
-                lsn
+                "Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
+                lsn >> 32, lsn & 0xffff_ffff
             ))?;
         }
     }
@@ -383,11 +383,8 @@ impl PageCache {
         }
         if lsn < shared.first_valid_lsn {
-            error!(
-                "LSN {} has already been removed",
-                lsn
-            );
-            return Err(format!("LSN {} has already been removed", lsn))?;
+            return Err(format!("LSN {:X}/{:X} has already been removed",
+                               lsn >> 32, lsn & 0xffff_ffff))?;
         }
     }
     let mut buf = BytesMut::new();

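Both hunks above switch the error messages from a plain decimal LSN to
PostgreSQL's usual high/low hex notation. A standalone sketch of that
formatting (format_lsn is an illustrative helper, not part of the patch):

    // Print a 64-bit LSN the way PostgreSQL does: the two 32-bit halves
    // in hex, separated by a slash.
    fn format_lsn(lsn: u64) -> String {
        format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff)
    }

    // format_lsn(0x0000_0001_6B37_4800) == "1/6B374800"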

@@ -154,12 +154,14 @@ impl WalRedoProcess {
             Command::new("initdb")
                 .args(&["-D", datadir.to_str().unwrap()])
                 .arg("-N")
-                .status(),
+                .output(),
         )
         .expect("failed to execute initdb");
-        if !initdb.success() {
-            panic!("initdb failed");
+        if !initdb.status.success() {
+            panic!("initdb failed: {}\nstderr:\n{}",
+                   std::str::from_utf8(&initdb.stdout).unwrap(),
+                   std::str::from_utf8(&initdb.stderr).unwrap());
         }
         // Start postgres itself

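The change above swaps status() for output() so that initdb's output is
captured and included in the panic message when it fails. A standalone sketch
of the same pattern (run_initdb and its datadir parameter are illustrative):

    use std::process::Command;

    // Sketch only: run initdb, capturing stdout/stderr so a failure can be
    // reported with the full output rather than just an exit status.
    fn run_initdb(datadir: &str) {
        let initdb = Command::new("initdb")
            .args(&["-D", datadir, "-N"])
            .output()
            .expect("failed to execute initdb");
        if !initdb.status.success() {
            panic!(
                "initdb failed:\nstdout:\n{}\nstderr:\n{}",
                String::from_utf8_lossy(&initdb.stdout),
                String::from_utf8_lossy(&initdb.stderr)
            );
        }
    }
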
walkeeper/README (new file, 38 lines)

@@ -0,0 +1,38 @@
# WAL safekeeper

Also known as the WAL service, WAL keeper or WAL acceptor.

The WAL safekeeper acts as a holding area and redistribution center
for recently generated WAL. The primary Postgres server streams the
WAL to the WAL safekeeper, and treats it like a (synchronous)
replica. A replication slot is used in the primary to prevent the
primary from discarding WAL that hasn't been streamed to the
safekeeper yet.
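
On the primary side this is ordinary physical streaming replication; a sketch
of the kind of setup involved, in SQL (the slot and standby names are
assumptions, not necessarily what this repository uses):

    -- Create a physical replication slot so WAL is retained until the
    -- safekeeper has received it, and treat the proxy as a synchronous standby.
    SELECT pg_create_physical_replication_slot('safekeeper_slot');
    ALTER SYSTEM SET synchronous_standby_names = 'safekeeper_proxy';
    SELECT pg_reload_conf();
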
The primary connects to the WAL safekeeper, so it works in a "push"
fashion. That's different from how streaming replication usually
works, where the replica initiates the connection. To do that, there
is a component called "safekeeper_proxy". The safekeeper_proxy runs on
the same host as the primary Postgres server and connects to it to do
streaming replication. It also connects to the WAL safekeeper, and
forwards all the WAL. (PostgreSQL's archive_command works in the
"push" style, but it operates on WAL segment granularity. If
PostgreSQL had a push-style API for streaming, we wouldn't need the
proxy.)

The Page Server connects to the WAL safekeeper, using the same
streaming replication protocol that's used between a Postgres primary
and a standby. You can also connect the Page Server directly to a
primary PostgreSQL node for testing.

In a production installation, there are multiple WAL safekeepers
running on different nodes, and there is a quorum mechanism using the
Paxos algorithm to ensure that a piece of WAL is considered durable
only after it has been flushed to disk on more than half of the WAL
safekeepers. The Paxos and crash recovery algorithm ensures that only
one primary node can be actively streaming WAL to the quorum of
safekeepers.
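
To illustrate the quorum rule (a sketch of the idea only, not the repository's
actual Paxos code): if each safekeeper reports the LSN it has flushed to disk,
the highest LSN that a majority has reached is the (n/2 + 1)-th largest of
those values.

    // Sketch only: given one flushed-up-to LSN per safekeeper, return the
    // highest LSN that more than half of them have durably flushed.
    fn quorum_flush_lsn(mut flush_lsns: Vec<u64>) -> Option<u64> {
        if flush_lsns.is_empty() {
            return None;
        }
        flush_lsns.sort_unstable_by(|a, b| b.cmp(a)); // descending
        Some(flush_lsns[flush_lsns.len() / 2])        // (n/2 + 1)-th largest
    }

    // quorum_flush_lsn(vec![0x30, 0x10, 0x20]) == Some(0x20): two of the
    // three safekeepers have flushed at least up to 0x20.
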
See vendor/postgres/src/bin/safekeeper/README.md for a more detailed
description of the consensus protocol. (TODO: move the text here?)


@@ -585,7 +585,7 @@ impl Connection {
     fn set_system(&mut self, id: SystemId) -> Result<()> {
         let mut systems = SYSTEMS.lock().unwrap();
         if id == 0 {
-            // non-multitenant configuration: just sigle instance
+            // non-multitenant configuration: just a single instance
             if let Some(system) = systems.values().next() {
                 self.system = Some(system.clone());
                 return Ok(());
@@ -937,6 +937,10 @@ impl Connection {
         /*
          * Always start streaming at the beginning of a segment
+         *
+         * FIXME: It is common practice to start streaming at the beginning of
+         * the segment, but it should be up to the client to decide that. We
+         * shouldn't enforce that here.
          */
         start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64;
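
For reference, the subtraction in the last line rounds the start position down
to a WAL segment boundary: XLogSegmentOffset(pos, seg_size) is the offset of
pos within its segment, i.e. pos modulo the segment size. A standalone sketch
(align_to_segment is an illustrative name):

    // Sketch only: align a WAL position down to the start of its segment.
    fn align_to_segment(start_pos: u64, wal_seg_size: u64) -> u64 {
        start_pos - (start_pos % wal_seg_size)
    }

    // With the default 16 MB segments:
    // align_to_segment(0x0169_3C20, 16 * 1024 * 1024) == 0x0100_0000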