Compare commits

...

4 Commits

Author               SHA1        Message                                        Date
Konstantin Knizhnik  24b1b412ee  Launch multiple wal-redo postgres instances    2021-12-08 13:02:57 +03:00
Konstantin Knizhnik  0ce4dca05a  Diable relish upload and backpressure          2021-12-08 10:30:45 +03:00
Konstantin Knizhnik  1530143e00  Merge branch 'main' into main_local            2021-12-06 17:22:20 +03:00
Konstantin Knizhnik  f77c9c987f  Use different default values                   2021-12-06 17:21:48 +03:00
5 changed files with 25 additions and 19 deletions

View File

@@ -287,14 +287,15 @@ impl PostgresNode {
         conf.append("max_replication_slots", "10");
         conf.append("hot_standby", "on");
         conf.append("shared_buffers", "1MB");
         conf.append("max_wal_size", "100GB");
         conf.append("fsync", "off");
         conf.append("max_connections", "100");
         conf.append("wal_level", "replica");
         // wal_sender_timeout is the maximum time to wait for WAL replication.
         // It also defines how often the walreciever will send a feedback message to the wal sender.
-        conf.append("wal_sender_timeout", "5s");
-        conf.append("max_replication_flush_lag", "160MB");
-        conf.append("max_replication_apply_lag", "1500MB");
+        //conf.append("wal_sender_timeout", "5s");
+        //conf.append("max_replication_flush_lag", "160MB");
+        //conf.append("max_replication_apply_lag", "1500MB");
         conf.append("listen_addresses", &self.address.ip().to_string());
         conf.append("port", &self.address.port().to_string());

View File

@@ -39,10 +39,10 @@ pub mod defaults {
     // would be more appropriate. But a low value forces the code to be exercised more,
     // which is good for now to trigger bugs.
     pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
+    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
+    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
     pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
     pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
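
The checkpoint period is relaxed from 1 s to 10 s and the GC period tightened from 100 s to 10 s. Below is a rough sketch of how such period constants typically drive a periodic background pass; the loop is illustrative only, not the pageserver's actual checkpointer/GC scheduler.

```rust
use std::thread;
use std::time::Duration;

// Values as set by the hunk above.
const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);

// Illustrative only: a background pass that runs once per period. The period
// constant directly bounds how often checkpoint/GC work happens, which is
// what the 1 s -> 10 s and 100 s -> 10 s changes tune.
fn run_periodic(period: Duration, mut pass: impl FnMut()) {
    loop {
        thread::sleep(period);
        pass();
    }
}

fn main() {
    thread::spawn(|| run_periodic(DEFAULT_CHECKPOINT_PERIOD, || println!("checkpoint pass")));
    run_periodic(DEFAULT_GC_PERIOD, || println!("gc pass"));
}
```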

View File

@@ -106,7 +106,7 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
         conf,
         Arc::new(walredo_mgr),
         tenant_id,
-        true,
+        false,
     ));
     let mut m = access_tenants();
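
The literal passed to the repository constructor flips from `true` to `false`. The parameter's name is not visible in this hunk; judging by commit 0ce4dca05a ("Diable relish upload and backpressure") it is presumably the switch that uploads relishes to remote storage. The sketch below shows how such a flag might be threaded through; all names are hypothetical, not the repository's real API.

```rust
// Hypothetical: `upload_relishes` stands in for the boolean flipped above;
// the real constructor signature is not shown in this diff.
struct Repository {
    upload_relishes: bool,
}

impl Repository {
    fn new(upload_relishes: bool) -> Repository {
        Repository { upload_relishes }
    }

    fn after_checkpoint(&self, new_layer_files: &[String]) {
        if self.upload_relishes {
            // schedule the freshly written files for upload to remote storage
            for f in new_layer_files {
                println!("uploading {}", f);
            }
        }
        // with `false`, the data stays local to the pageserver
    }
}

fn main() {
    let repo = Repository::new(false);
    repo.after_checkpoint(&["layer-0001".to_string()]);
}
```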

View File

@@ -32,6 +32,7 @@ use std::os::unix::io::AsRawFd;
 use std::path::PathBuf;
 use std::process::Stdio;
 use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
@@ -53,6 +54,8 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
 use postgres_ffi::pg_constants;
 use postgres_ffi::XLogRecord;
 
+const N_WAL_REDO_PROCS: usize = 1;
+
 ///
 /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
 ///
@@ -139,7 +142,8 @@ pub struct PostgresRedoManager {
     tenantid: ZTenantId,
     conf: &'static PageServerConf,
 
-    process: Mutex<Option<PostgresRedoProcess>>,
+    round_robin: AtomicUsize,
+    processes: [Mutex<Option<PostgresRedoProcess>>; N_WAL_REDO_PROCS],
 }
 
 #[derive(Debug)]
@@ -209,12 +213,13 @@ impl WalRedoManager for PostgresRedoManager {
             end_time = Instant::now();
             WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
         } else {
-            let mut process_guard = self.process.lock().unwrap();
+            let rr = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_WAL_REDO_PROCS;
+            let mut process_guard = self.processes[rr].lock().unwrap();
             let lock_time = Instant::now();
 
             // launch the WAL redo process on first use
             if process_guard.is_none() {
-                let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?;
+                let p = PostgresRedoProcess::launch(self.conf, &self.tenantid, rr)?;
                 *process_guard = Some(p);
             }
             let process = process_guard.as_mut().unwrap();
@@ -246,7 +251,8 @@ impl PostgresRedoManager {
         PostgresRedoManager {
             tenantid,
             conf,
 
-            process: Mutex::new(None),
+            round_robin: AtomicUsize::new(0),
+            processes: [(); N_WAL_REDO_PROCS].map(|_| Mutex::new(None)),
         }
     }
@@ -472,11 +478,17 @@ impl PostgresRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
-    fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<PostgresRedoProcess, Error> {
+    fn launch(
+        conf: &PageServerConf,
+        tenantid: &ZTenantId,
+        id: usize,
+    ) -> Result<PostgresRedoProcess, Error> {
         // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
         // just create one with constant name. That fails if you try to launch more than
         // one WAL redo manager concurrently.
-        let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
+        let datadir = conf
+            .tenant_path(tenantid)
+            .join(format! {"wal-redo-datadir-{}", id});
 
         // Create empty data directory for wal-redo postgres, deleting old one first.
         if datadir.exists() {
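
Taken together, the walredo hunks replace the single `process` slot with an array of `N_WAL_REDO_PROCS` mutex-protected slots plus an atomic round-robin counter, and each slot lazily launches its own wal-redo postgres in its own `wal-redo-datadir-{id}`. The condensed sketch below shows the same pattern in isolation; `Process`, `apply`, and `request_redo` are simplified stand-ins for `PostgresRedoProcess` and the `WalRedoManager` trait, not the real signatures.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Mutex;

// The diff ships with N_WAL_REDO_PROCS = 1; raising it enables the fan-out.
const N_WAL_REDO_PROCS: usize = 4;

// Stand-in for PostgresRedoProcess: launched lazily, one data directory per slot.
struct Process {
    id: usize,
}

impl Process {
    fn launch(id: usize) -> Process {
        // the real code starts `postgres` in WAL-redo mode in "wal-redo-datadir-{id}"
        Process { id }
    }

    fn apply(&mut self, record: &[u8]) -> Vec<u8> {
        // the real code pipes the WAL record to the child and reads back the page
        println!("process {} applying {} bytes", self.id, record.len());
        vec![0u8; 8192]
    }
}

struct RedoManager {
    round_robin: AtomicUsize,
    processes: [Mutex<Option<Process>>; N_WAL_REDO_PROCS],
}

impl RedoManager {
    fn new() -> RedoManager {
        RedoManager {
            round_robin: AtomicUsize::new(0),
            // `[(); N].map(..)` builds an array of non-Copy Mutexes without Default
            processes: [(); N_WAL_REDO_PROCS].map(|_| Mutex::new(None)),
        }
    }

    fn request_redo(&self, record: &[u8]) -> Vec<u8> {
        // Pick a slot round-robin; callers landing on different slots redo in parallel.
        let rr = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_WAL_REDO_PROCS;
        let mut guard = self.processes[rr].lock().unwrap();
        // Launch the wal-redo process for this slot on first use, as the diff does.
        let process = guard.get_or_insert_with(|| Process::launch(rr));
        process.apply(record)
    }
}

fn main() {
    let mgr = RedoManager::new();
    let page = mgr.request_redo(b"dummy wal record");
    println!("redone page: {} bytes", page.len());
}
```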

View File

@@ -36,17 +36,10 @@ pg_port = {pageserver_pg_port}
 http_port = {pageserver_http_port}
 auth_type = '{pageserver_auth_type}'
-[[safekeepers]]
-name = '{safekeeper_name}'
-pg_port = {safekeeper_pg_port}
-http_port = {safekeeper_http_port}
 "#,
         pageserver_pg_port = DEFAULT_PAGESERVER_PG_PORT,
         pageserver_http_port = DEFAULT_PAGESERVER_HTTP_PORT,
         pageserver_auth_type = AuthType::Trust,
-        safekeeper_name = DEFAULT_SAFEKEEPER_NAME,
-        safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT,
-        safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT,
     )
 }
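
After this change, the generated default config describes only the pageserver; a safekeeper entry has to be added explicitly where one is wanted. Below is a rough sketch of appending such an entry to an already-generated config string; the name 'single' and the literal port numbers are placeholders, not values taken from the diff.

```rust
// Illustrative only: append a [[safekeepers]] block to an existing config
// string. The name and ports below are placeholders.
fn with_safekeeper(mut config: String, name: &str, pg_port: u16, http_port: u16) -> String {
    config.push_str(&format!(
        r#"
[[safekeepers]]
name = '{name}'
pg_port = {pg_port}
http_port = {http_port}
"#,
        name = name,
        pg_port = pg_port,
        http_port = http_port,
    ));
    config
}

fn main() {
    let base = String::from("pg_port = 64000\nhttp_port = 9898\nauth_type = 'Trust'\n");
    println!("{}", with_safekeeper(base, "single", 5454, 7676));
}
```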