diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs index c1b28e62f5..d0ba2bd815 100644 --- a/integration_tests/tests/control_plane/mod.rs +++ b/integration_tests/tests/control_plane/mod.rs @@ -7,19 +7,19 @@ // local installations. // +use std::fs::File; use std::fs::{self, OpenOptions}; use std::path::{Path, PathBuf}; use std::process::Command; use std::str; +use std::sync::Arc; use std::{ io::Write, net::{IpAddr, Ipv4Addr, SocketAddr}, }; -use std::sync::Arc; -use std::fs::File; -use postgres::{Client, NoTls}; use lazy_static::lazy_static; +use postgres::{Client, NoTls}; lazy_static! { // postgres would be there if it was build by 'make postgres' here in the repo @@ -34,7 +34,6 @@ lazy_static! { .join("tmp_check"); } - // Find the directory where the binaries were put (i.e. target/debug/) pub fn cargo_bin_dir() -> PathBuf { let mut pathbuf = std::env::current_exe().ok().unwrap(); @@ -66,7 +65,7 @@ impl StorageControlPlane { let pserver = PageServerNode { page_service_addr: "127.0.0.1:65200".parse().unwrap(), - data_dir: TEST_WORKDIR.join("pageserver") + data_dir: TEST_WORKDIR.join("pageserver"), }; pserver.init(); pserver.start(); @@ -75,27 +74,28 @@ impl StorageControlPlane { cplane } - - pub fn fault_tolerant(redundancy : usize) -> StorageControlPlane { + pub fn fault_tolerant(redundancy: usize) -> StorageControlPlane { let mut cplane = StorageControlPlane { wal_acceptors: Vec::new(), page_servers: Vec::new(), }; - const WAL_ACCEPTOR_PORT : usize = 54321; + const WAL_ACCEPTOR_PORT: usize = 54321; - for i in 0..redundancy { - let wal_acceptor = WalAcceptorNode { - listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i).parse().unwrap(), - data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)) - }; - wal_acceptor.init(); - wal_acceptor.start(); - cplane.wal_acceptors.push(wal_acceptor); - } - cplane - } + for i in 0..redundancy { + let wal_acceptor = WalAcceptorNode { + listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i) + .parse() + .unwrap(), + data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)), + }; + wal_acceptor.init(); + wal_acceptor.start(); + cplane.wal_acceptors.push(wal_acceptor); + } + cplane + } - // // postgres <-> wal_acceptor x3 <-> page_server + // // postgres <-> wal_acceptor x3 <-> page_server // fn local(&mut self) -> StorageControlPlane { // } @@ -104,8 +104,12 @@ impl StorageControlPlane { } pub fn get_wal_acceptor_conn_info(&self) -> String { - self.wal_acceptors.iter().map(|wa|wa.listen.to_string().to_string()).collect::>().join(",") - } + self.wal_acceptors + .iter() + .map(|wa| wa.listen.to_string().to_string()) + .collect::>() + .join(",") + } pub fn page_server_psql(&self, sql: &str) -> Vec { let addr = &self.page_servers[0].page_service_addr; @@ -206,7 +210,11 @@ impl WalAcceptorNode { } pub fn start(&self) { - println!("Starting wal_acceptor in {} listening '{}'", self.data_dir.to_str().unwrap(), self.listen); + println!( + "Starting wal_acceptor in {} listening '{}'", + self.data_dir.to_str().unwrap(), + self.listen + ); let status = Command::new(BIN_DIR.join("wal_acceptor")) .args(&["-D", self.data_dir.to_str().unwrap()]) @@ -224,16 +232,15 @@ impl WalAcceptorNode { pub fn stop(&self) { let pidfile = self.data_dir.join("wal_acceptor.pid"); if let Ok(pid) = fs::read_to_string(pidfile) { - let _status = Command::new("kill") - .arg(pid) - .env_clear() - .status() - .expect("failed to execute kill"); - } - } + let _status = Command::new("kill") + .arg(pid) + .env_clear() + .status() + .expect("failed to execute kill"); + } + } } - impl Drop for WalAcceptorNode { fn drop(&mut self) { self.stop(); @@ -255,8 +262,7 @@ pub struct ComputeControlPlane<'a> { } impl ComputeControlPlane<'_> { - - pub fn local(storage_cplane : &StorageControlPlane) -> ComputeControlPlane { + pub fn local(storage_cplane: &StorageControlPlane) -> ComputeControlPlane { ComputeControlPlane { pg_bin_dir: PG_BIN_DIR.to_path_buf(), work_dir: TEST_WORKDIR.to_path_buf(), @@ -311,7 +317,8 @@ impl ComputeControlPlane<'_> { // listen for selected port node.append_conf( "postgresql.conf", - format!("\ + format!( + "\ max_wal_senders = 10\n\ max_replication_slots = 10\n\ hot_standby = on\n\ @@ -320,14 +327,19 @@ impl ComputeControlPlane<'_> { wal_level = replica\n\ listen_addresses = '{address}'\n\ port = {port}\n\ - ", address = node.ip, port = node.port).as_str()); + ", + address = node.ip, + port = node.port + ) + .as_str(), + ); node } - // Init compute node without files, only datadir structure - // use initdb --compute-node flag and GUC 'computenode_mode' - // to distinguish the node + // Init compute node without files, only datadir structure + // use initdb --compute-node flag and GUC 'computenode_mode' + // to distinguish the node pub fn new_minimal_node(&mut self) -> &PostgresNode { // allocate new node entry with generated port let node_id = self.nodes.len() + 1; @@ -362,7 +374,8 @@ impl ComputeControlPlane<'_> { // listen for selected port node.append_conf( "postgresql.conf", - format!("\ + format!( + "\ max_wal_senders = 10\n\ max_replication_slots = 10\n\ hot_standby = on\n\ @@ -372,7 +385,12 @@ impl ComputeControlPlane<'_> { listen_addresses = '{address}'\n\ port = {port}\n\ computenode_mode = true\n\ - ", address = node.ip, port = node.port).as_str()); + ", + address = node.ip, + port = node.port + ) + .as_str(), + ); node } @@ -384,18 +402,29 @@ impl ComputeControlPlane<'_> { let pserver = storage_cplane.page_server_addr(); // Configure that node to take pages from pageserver - node.append_conf("postgresql.conf", format!("\ + node.append_conf( + "postgresql.conf", + format!( + "\ page_server_connstring = 'host={} port={}'\n\ - ", pserver.ip(), pserver.port()).as_str()); + ", + pserver.ip(), + pserver.port() + ) + .as_str(), + ); node.clone() } - pub fn new_master_node(&mut self) -> Arc { + pub fn new_master_node(&mut self) -> Arc { let node = self.new_vanilla_node(); - node.append_conf("postgresql.conf", "synchronous_standby_names = 'safekeeper_proxy'\n\ - "); + node.append_conf( + "postgresql.conf", + "synchronous_standby_names = 'safekeeper_proxy'\n\ + ", + ); node.clone() } } @@ -403,7 +432,7 @@ impl ComputeControlPlane<'_> { /////////////////////////////////////////////////////////////////////////////// pub struct WalProposerNode { - pid: u32 + pid: u32, } impl WalProposerNode { @@ -423,7 +452,7 @@ impl WalProposerNode { impl Drop for WalProposerNode { fn drop(&mut self) { self.stop(); - } + } } /////////////////////////////////////////////////////////////////////////////// @@ -449,12 +478,18 @@ impl PostgresNode { fn pg_ctl(&self, args: &[&str], check_ok: bool) { let pg_ctl_path = self.pg_bin_dir.join("pg_ctl"); let pg_ctl = Command::new(pg_ctl_path) - .args([&[ - "-D", - self.pgdata.to_str().unwrap(), - "-l", - self.pgdata.join("log").to_str().unwrap(), - ], args].concat()) + .args( + [ + &[ + "-D", + self.pgdata.to_str().unwrap(), + "-l", + self.pgdata.join("log").to_str().unwrap(), + ], + args, + ] + .concat(), + ) .env_clear() .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap()) .status() @@ -466,10 +501,10 @@ impl PostgresNode { } pub fn start(&self, storage_cplane: &StorageControlPlane) { - if storage_cplane.page_servers.len() != 0 { - let _res = storage_cplane - .page_server_psql(format!("callmemaybe {}", self.connstr()).as_str()); - } + if storage_cplane.page_servers.len() != 0 { + let _res = + storage_cplane.page_server_psql(format!("callmemaybe {}", self.connstr()).as_str()); + } println!("Starting postgres node at '{}'", self.connstr()); self.pg_ctl(&["start"], true); } @@ -509,7 +544,7 @@ impl PostgresNode { ); let mut client = Client::connect(connstring.as_str(), NoTls).unwrap(); - println!("Running {}", sql); + println!("Running {}", sql); client.query(sql, &[]).unwrap() } @@ -524,14 +559,12 @@ impl PostgresNode { Client::connect(connstring.as_str(), NoTls).unwrap() } - pub fn get_pgdata(&self) -> Option<&str> - { + pub fn get_pgdata(&self) -> Option<&str> { self.pgdata.to_str() } /* Create stub controlfile and respective xlog to start computenode */ - pub fn setup_controlfile(&self) - { + pub fn setup_controlfile(&self) { let filepath = format!("{}/global/pg_control", self.pgdata.to_str().unwrap()); { @@ -541,34 +574,32 @@ impl PostgresNode { let pg_resetwal_path = self.pg_bin_dir.join("pg_resetwal"); let pg_resetwal = Command::new(pg_resetwal_path) - .args(&["-D", self.pgdata.to_str().unwrap()]) - .arg("-f") - // TODO probably we will have to modify pg_resetwal - // .arg("--compute-node") - .status() - .expect("failed to execute pg_resetwal"); + .args(&["-D", self.pgdata.to_str().unwrap()]) + .arg("-f") + // TODO probably we will have to modify pg_resetwal + // .arg("--compute-node") + .status() + .expect("failed to execute pg_resetwal"); if !pg_resetwal.success() { panic!("pg_resetwal failed"); } } - pub fn start_proxy(&self, wal_acceptors : String) -> WalProposerNode { + pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode { let proxy_path = PG_BIN_DIR.join("safekeeper_proxy"); - match Command::new(proxy_path.as_path()) - .args(&["-s", &wal_acceptors]) - .args(&["-h", &self.ip.to_string()]) - .args(&["-p", &self.port.to_string()]) - .arg("-v") - .stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap()) - .spawn() - { - Ok(child) => - WalProposerNode { pid: child.id() }, - Err(e) => - panic!("Failed to launch {:?}: {}", proxy_path, e) - } - } + match Command::new(proxy_path.as_path()) + .args(&["-s", &wal_acceptors]) + .args(&["-h", &self.ip.to_string()]) + .args(&["-p", &self.port.to_string()]) + .arg("-v") + .stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap()) + .spawn() + { + Ok(child) => WalProposerNode { pid: child.id() }, + Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e), + } + } // TODO pub fn pg_bench() {} @@ -585,19 +616,17 @@ impl Drop for PostgresNode { } } -pub fn regress_check(pg : &PostgresNode) { - +pub fn regress_check(pg: &PostgresNode) { pg.safe_psql("postgres", "CREATE DATABASE regression"); - let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("tmp_check/regress"); + let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress"); fs::create_dir_all(regress_run_path.clone()).unwrap(); std::env::set_current_dir(regress_run_path).unwrap(); - let regress_build_path = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("../tmp_install/build/src/test/regress"); - let regress_src_path = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("../vendor/postgres/src/test/regress"); + let regress_build_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress"); + let regress_src_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress"); let _regress_check = Command::new(regress_build_path.join("pg_regress")) .args(&[ @@ -605,7 +634,11 @@ pub fn regress_check(pg : &PostgresNode) { "--use-existing", format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(), format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(), - format!("--schedule={}", regress_src_path.join("parallel_schedule").to_str().unwrap()).as_str(), + format!( + "--schedule={}", + regress_src_path.join("parallel_schedule").to_str().unwrap() + ) + .as_str(), format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), ]) .env_clear() diff --git a/integration_tests/tests/test_control_plane.rs b/integration_tests/tests/test_control_plane.rs index 8357070a36..481cd3d8b3 100644 --- a/integration_tests/tests/test_control_plane.rs +++ b/integration_tests/tests/test_control_plane.rs @@ -1,8 +1,5 @@ +#[test] +fn test_actions() {} #[test] -fn test_actions() { -} - -#[test] -fn test_regress() { -} \ No newline at end of file +fn test_regress() {} diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 5a2e2478fa..8adacb3c54 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -1,4 +1,3 @@ - #[allow(dead_code)] mod control_plane; @@ -21,8 +20,14 @@ fn test_redo_cases() { node.start(&storage_cplane); // check basic work with table - node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - node.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100000), 'payload'"); + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + node.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ); let count: i64 = node .safe_psql("postgres", "SELECT sum(key) FROM t") .first() @@ -70,8 +75,14 @@ fn test_pageserver_multitenancy() { node2.start(&storage_cplane); // check node1 - node1.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - node1.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100000), 'payload'"); + node1.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + node1.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ); let count: i64 = node1 .safe_psql("postgres", "SELECT sum(key) FROM t") .first() @@ -81,8 +92,14 @@ fn test_pageserver_multitenancy() { assert_eq!(count, 5000050000); // check node2 - node2.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - node2.safe_psql("postgres", "INSERT INTO t SELECT generate_series(100000,200000), 'payload'"); + node2.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + node2.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(100000,200000), 'payload'", + ); let count: i64 = node2 .safe_psql("postgres", "SELECT sum(key) FROM t") .first() diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index b7e273bdda..7683768280 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -5,28 +5,34 @@ use control_plane::ComputeControlPlane; use control_plane::StorageControlPlane; use rand::Rng; -use std::{thread, time}; use std::sync::Arc; use std::time::SystemTime; +use std::{thread, time}; #[test] fn test_acceptors_normal_work() { // Start pageserver that reads WAL directly from that postgres - const REDUNDANCY : usize = 3; + const REDUNDANCY: usize = 3; let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY); let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); - let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); + let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgre let node = compute_cplane.new_master_node(); node.start(&storage_cplane); - // start proxy - let _proxy = node.start_proxy(wal_acceptors); + // start proxy + let _proxy = node.start_proxy(wal_acceptors); // check basic work with table - node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - node.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100000), 'payload'"); + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + node.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ); let count: i64 = node .safe_psql("postgres", "SELECT sum(key) FROM t") .first() @@ -41,39 +47,43 @@ fn test_acceptors_normal_work() { #[test] fn test_acceptors_restarts() { // Start pageserver that reads WAL directly from that postgres - const REDUNDANCY : usize = 3; - const FAULT_PROBABILITY : f32 = 0.01; + const REDUNDANCY: usize = 3; + const FAULT_PROBABILITY: f32 = 0.01; let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY); let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); - let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); - let mut rng = rand::thread_rng(); + let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); + let mut rng = rand::thread_rng(); // start postgre let node = compute_cplane.new_master_node(); node.start(&storage_cplane); - // start proxy - let _proxy = node.start_proxy(wal_acceptors); - let mut failed_node : Option = None; + // start proxy + let _proxy = node.start_proxy(wal_acceptors); + let mut failed_node: Option = None; // check basic work with table - node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - let mut psql = node.open_psql("postgres"); - for i in 1..=1000 { - psql.execute("INSERT INTO t values ($1, 'payload')", &[&i]).unwrap(); - let prob : f32 = rng.gen(); - if prob <= FAULT_PROBABILITY { - if let Some(node) = failed_node { - storage_cplane.wal_acceptors[node].start(); - failed_node = None; - } else { - let node : usize = rng.gen_range(0..REDUNDANCY); - failed_node = Some(node); - storage_cplane.wal_acceptors[node].stop(); - } - } - } + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + let mut psql = node.open_psql("postgres"); + for i in 1..=1000 { + psql.execute("INSERT INTO t values ($1, 'payload')", &[&i]) + .unwrap(); + let prob: f32 = rng.gen(); + if prob <= FAULT_PROBABILITY { + if let Some(node) = failed_node { + storage_cplane.wal_acceptors[node].start(); + failed_node = None; + } else { + let node: usize = rng.gen_range(0..REDUNDANCY); + failed_node = Some(node); + storage_cplane.wal_acceptors[node].stop(); + } + } + } let count: i64 = node .safe_psql("postgres", "SELECT sum(key) FROM t") .first() @@ -83,12 +93,12 @@ fn test_acceptors_restarts() { assert_eq!(count, 500500); } -fn start_acceptor(cplane : &Arc, no : usize) { - let cp = cplane.clone(); - thread::spawn(move || { - thread::sleep(time::Duration::from_secs(1)); - cp.wal_acceptors[no].start(); - }); +fn start_acceptor(cplane: &Arc, no: usize) { + let cp = cplane.clone(); + thread::spawn(move || { + thread::sleep(time::Duration::from_secs(1)); + cp.wal_acceptors[no].start(); + }); } // Stop majority of acceptors while compute is under the load. Boot @@ -97,38 +107,46 @@ fn start_acceptor(cplane : &Arc, no : usize) { #[test] fn test_acceptors_unavalability() { // Start pageserver that reads WAL directly from that postgres - const REDUNDANCY : usize = 2; + const REDUNDANCY: usize = 2; let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY); let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); - let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); + let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgre let node = compute_cplane.new_master_node(); node.start(&storage_cplane); - // start proxy - let _proxy = node.start_proxy(wal_acceptors); + // start proxy + let _proxy = node.start_proxy(wal_acceptors); // check basic work with table - node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - let mut psql = node.open_psql("postgres"); - psql.execute("INSERT INTO t values (1, 'payload')", &[]).unwrap(); + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + let mut psql = node.open_psql("postgres"); + psql.execute("INSERT INTO t values (1, 'payload')", &[]) + .unwrap(); - storage_cplane.wal_acceptors[0].stop(); - let ap = Arc::new(storage_cplane); - start_acceptor(&ap, 0); - let now = SystemTime::now(); - psql.execute("INSERT INTO t values (2, 'payload')", &[]).unwrap(); - assert!(now.elapsed().unwrap().as_secs() > 1); - psql.execute("INSERT INTO t values (3, 'payload')", &[]).unwrap(); + storage_cplane.wal_acceptors[0].stop(); + let ap = Arc::new(storage_cplane); + start_acceptor(&ap, 0); + let now = SystemTime::now(); + psql.execute("INSERT INTO t values (2, 'payload')", &[]) + .unwrap(); + assert!(now.elapsed().unwrap().as_secs() > 1); + psql.execute("INSERT INTO t values (3, 'payload')", &[]) + .unwrap(); - ap.wal_acceptors[1].stop(); - start_acceptor(&ap, 1); - psql.execute("INSERT INTO t values (4, 'payload')", &[]).unwrap(); - assert!(now.elapsed().unwrap().as_secs() > 2); + ap.wal_acceptors[1].stop(); + start_acceptor(&ap, 1); + psql.execute("INSERT INTO t values (4, 'payload')", &[]) + .unwrap(); + assert!(now.elapsed().unwrap().as_secs() > 2); - psql.execute("INSERT INTO t values (5, 'payload')", &[]).unwrap(); + psql.execute("INSERT INTO t values (5, 'payload')", &[]) + .unwrap(); let count: i64 = node .safe_psql("postgres", "SELECT sum(key) FROM t") @@ -139,55 +157,59 @@ fn test_acceptors_unavalability() { assert_eq!(count, 15); } -fn simulate_failures(cplane : &Arc) { - let mut rng = rand::thread_rng(); - let n_acceptors = cplane.wal_acceptors.len(); - let failure_period = time::Duration::from_secs(1); - loop { - thread::sleep(failure_period); - let mask : u32 = rng.gen_range(0..(1 << n_acceptors)); - for i in 0..n_acceptors { - if (mask & (1 << i)) != 0 { - cplane.wal_acceptors[i].stop(); - } - } - thread::sleep(failure_period); - for i in 0..n_acceptors { - if (mask & (1 << i)) != 0 { - cplane.wal_acceptors[i].start(); - } - } - } +fn simulate_failures(cplane: &Arc) { + let mut rng = rand::thread_rng(); + let n_acceptors = cplane.wal_acceptors.len(); + let failure_period = time::Duration::from_secs(1); + loop { + thread::sleep(failure_period); + let mask: u32 = rng.gen_range(0..(1 << n_acceptors)); + for i in 0..n_acceptors { + if (mask & (1 << i)) != 0 { + cplane.wal_acceptors[i].stop(); + } + } + thread::sleep(failure_period); + for i in 0..n_acceptors { + if (mask & (1 << i)) != 0 { + cplane.wal_acceptors[i].start(); + } + } + } } // Race condition test #[test] fn test_race_conditions() { // Start pageserver that reads WAL directly from that postgres - const REDUNDANCY : usize = 3; + const REDUNDANCY: usize = 3; let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY); let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); - let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); + let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgre let node = compute_cplane.new_master_node(); node.start(&storage_cplane); - // start proxy - let _proxy = node.start_proxy(wal_acceptors); + // start proxy + let _proxy = node.start_proxy(wal_acceptors); // check basic work with table - node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)"); - let cp = Arc::new(storage_cplane); - thread::spawn(move || { - simulate_failures(&cp); - }); + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + let cp = Arc::new(storage_cplane); + thread::spawn(move || { + simulate_failures(&cp); + }); - let mut psql = node.open_psql("postgres"); - for i in 1..=1000 { - psql.execute("INSERT INTO t values ($1, 'payload')", &[&i]).unwrap(); - } + let mut psql = node.open_psql("postgres"); + for i in 1..=1000 { + psql.execute("INSERT INTO t values ($1, 'payload')", &[&i]) + .unwrap(); + } let count: i64 = node .safe_psql("postgres", "SELECT sum(key) FROM t") .first() diff --git a/pageserver/src/bin/cli/main.rs b/pageserver/src/bin/cli/main.rs index fffdfb9a97..4aa3269c09 100644 --- a/pageserver/src/bin/cli/main.rs +++ b/pageserver/src/bin/cli/main.rs @@ -1,13 +1,12 @@ +use anyhow::Result; use clap::{App, AppSettings}; -use anyhow::{Result}; -mod subcommand; pub mod pg; -pub mod storage; pub mod snapshot; +pub mod storage; +mod subcommand; fn main() -> Result<()> { - let cli_commands = subcommand::ClapCommands { commands: vec![ Box::new(pg::PgCmd { @@ -22,7 +21,6 @@ fn main() -> Result<()> { ], }; - let matches = App::new("zenith") .about("Zenith CLI") .version("1.0") @@ -30,7 +28,6 @@ fn main() -> Result<()> { .subcommands(cli_commands.generate()) .get_matches(); - if let Some(subcommand) = matches.subcommand_name() { println!("'git {}' was used", subcommand); } diff --git a/pageserver/src/bin/cli/pg.rs b/pageserver/src/bin/cli/pg.rs index a2444d1c0d..7fe2f86d6c 100644 --- a/pageserver/src/bin/cli/pg.rs +++ b/pageserver/src/bin/cli/pg.rs @@ -1,10 +1,8 @@ +use anyhow::Result; use clap::{App, AppSettings, Arg}; -use anyhow::{Result}; use crate::subcommand; - - pub struct PgCmd<'a> { pub clap_cmd: clap::App<'a, 'a>, } @@ -13,81 +11,95 @@ impl subcommand::SubCommand for PgCmd<'_> { fn gen_clap_command(&self) -> clap::App { let c = self.clap_cmd.clone(); c.about("Operations with zenith compute nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand( - App::new("list") - .about("List existing compute nodes") - ) - .subcommand( - App::new("create") - .about("Create (init) new data directory using given storage and start postgres") - .arg(Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node")) - .arg(Arg::with_name("storage") - .short("s") - .long("storage") - .takes_value(true) - .help("Name of the storage node to use")) - //TODO should it be just name of uploaded snapshot or some path? - .arg(Arg::with_name("snapshot") - .long("snapshot") - .takes_value(true) - .help("Name of the snapshot to use")) - .arg(Arg::with_name("nostart") - .long("no-start") - .takes_value(false) - .help("Don't start postgres on the created node")) - ) - .subcommand( - App::new("destroy") - .about("Stop postgres and destroy node's data directory") - .arg(Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node")) - ) - .subcommand( - App::new("start") - .about("Start postgres on the given node") - .arg(Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node")) - .arg(Arg::with_name("replica") - .long("replica") - .takes_value(false) - .help("Start the compute node as replica")) - ) - .subcommand( - App::new("stop") - .about("Stop postgres on the given node") - .arg(Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node")) - - ) - .subcommand( - App::new("show") - .about("Show info about the given node") - .arg(Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node")) - - ) + .setting(AppSettings::SubcommandRequiredElseHelp) + .subcommand(App::new("list").about("List existing compute nodes")) + .subcommand( + App::new("create") + .about( + "Create (init) new data directory using given storage and start postgres", + ) + .arg( + Arg::with_name("name") + .short("n") + .long("name") + .takes_value(true) + .help("Name of the compute node"), + ) + .arg( + Arg::with_name("storage") + .short("s") + .long("storage") + .takes_value(true) + .help("Name of the storage node to use"), + ) + //TODO should it be just name of uploaded snapshot or some path? + .arg( + Arg::with_name("snapshot") + .long("snapshot") + .takes_value(true) + .help("Name of the snapshot to use"), + ) + .arg( + Arg::with_name("nostart") + .long("no-start") + .takes_value(false) + .help("Don't start postgres on the created node"), + ), + ) + .subcommand( + App::new("destroy") + .about("Stop postgres and destroy node's data directory") + .arg( + Arg::with_name("name") + .short("n") + .long("name") + .takes_value(true) + .help("Name of the compute node"), + ), + ) + .subcommand( + App::new("start") + .about("Start postgres on the given node") + .arg( + Arg::with_name("name") + .short("n") + .long("name") + .takes_value(true) + .help("Name of the compute node"), + ) + .arg( + Arg::with_name("replica") + .long("replica") + .takes_value(false) + .help("Start the compute node as replica"), + ), + ) + .subcommand( + App::new("stop") + .about("Stop postgres on the given node") + .arg( + Arg::with_name("name") + .short("n") + .long("name") + .takes_value(true) + .help("Name of the compute node"), + ), + ) + .subcommand( + App::new("show") + .about("Show info about the given node") + .arg( + Arg::with_name("name") + .short("n") + .long("name") + .takes_value(true) + .help("Name of the compute node"), + ), + ) } fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run PgCmd with args {:?}", args); Ok(()) } -} \ No newline at end of file +} diff --git a/pageserver/src/bin/cli/snapshot.rs b/pageserver/src/bin/cli/snapshot.rs index c149d7318e..47e608b8e2 100644 --- a/pageserver/src/bin/cli/snapshot.rs +++ b/pageserver/src/bin/cli/snapshot.rs @@ -1,5 +1,5 @@ +use anyhow::Result; use clap::{App, AppSettings, Arg}; -use anyhow::{Result}; use crate::subcommand; @@ -11,31 +11,17 @@ impl subcommand::SubCommand for SnapshotCmd<'_> { fn gen_clap_command(&self) -> clap::App { let c = self.clap_cmd.clone(); c.about("Operations with zenith snapshots") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand( - App::new("list") - ) - .subcommand( - App::new("create") - .arg(Arg::with_name("pgdata").required(true)), - ) - .subcommand( - App::new("destroy") - ) - .subcommand( - App::new("start") - ) - .subcommand( - App::new("stop") - ) - .subcommand( - App::new("show") - ) + .setting(AppSettings::SubcommandRequiredElseHelp) + .subcommand(App::new("list")) + .subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true))) + .subcommand(App::new("destroy")) + .subcommand(App::new("start")) + .subcommand(App::new("stop")) + .subcommand(App::new("show")) } fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run SnapshotCmd with args {:?}", args); Ok(()) } -} \ No newline at end of file +} diff --git a/pageserver/src/bin/cli/storage.rs b/pageserver/src/bin/cli/storage.rs index 9f37c38ac7..71ca61e905 100644 --- a/pageserver/src/bin/cli/storage.rs +++ b/pageserver/src/bin/cli/storage.rs @@ -1,5 +1,5 @@ +use anyhow::Result; use clap::{App, AppSettings}; -use anyhow::{Result}; use crate::subcommand; @@ -11,24 +11,15 @@ impl subcommand::SubCommand for StorageCmd<'_> { fn gen_clap_command(&self) -> clap::App { let c = self.clap_cmd.clone(); c.about("Operations with zenith storage nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand( - App::new("list") - ) - .subcommand( - App::new("attach") - ) - .subcommand( - App::new("detach") - ) - .subcommand( - App::new("show") - ) + .setting(AppSettings::SubcommandRequiredElseHelp) + .subcommand(App::new("list")) + .subcommand(App::new("attach")) + .subcommand(App::new("detach")) + .subcommand(App::new("show")) } fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run StorageCmd with args {:?}", args); Ok(()) } -} \ No newline at end of file +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d9d04abfd9..9fdf405145 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,19 +3,19 @@ // use log::*; -use std::{fs::File, str::FromStr, fs::OpenOptions}; +use std::fs; use std::io; use std::path::PathBuf; use std::thread; -use std::fs; +use std::{fs::File, fs::OpenOptions, str::FromStr}; use clap::{App, Arg}; use daemonize::Daemonize; use slog; -use slog_stdlog; -use slog_scope; use slog::Drain; +use slog_scope; +use slog_stdlog; use pageserver::page_service; use pageserver::restore_s3; @@ -129,8 +129,16 @@ fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> { // There should'n be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fpritf's or backtraces. - let stdout = OpenOptions::new().create(true).append(true).open(conf.data_dir.join("pageserver.log")).unwrap(); - let stderr = OpenOptions::new().create(true).append(true).open(conf.data_dir.join("pageserver.log")).unwrap(); + let stdout = OpenOptions::new() + .create(true) + .append(true) + .open(conf.data_dir.join("pageserver.log")) + .unwrap(); + let stderr = OpenOptions::new() + .create(true) + .append(true) + .open(conf.data_dir.join("pageserver.log")) + .unwrap(); let daemonize = Daemonize::new() .pid_file(conf.data_dir.join("pageserver.pid")) @@ -157,13 +165,13 @@ fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> { // Create directory for wal-redo datadirs match fs::create_dir(conf.data_dir.join("wal-redo")) { - Ok(_) => {}, + Ok(_) => {} Err(e) => match e.kind() { io::ErrorKind::AlreadyExists => {} _ => { panic!("Failed to create wal-redo data directory: {}", e); } - } + }, } // Launch the WAL receiver thread if pageserver was started with --wal-producer diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d893185987..b504308e6b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,12 +4,12 @@ use std::path::PathBuf; pub mod page_cache; pub mod page_service; pub mod restore_s3; -pub mod waldecoder; -pub mod walreceiver; -pub mod walredo; pub mod tui; pub mod tui_event; mod tui_logger; +pub mod waldecoder; +pub mod walreceiver; +pub mod walredo; #[allow(dead_code)] #[derive(Debug, Clone)] diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 918f7c6108..8f360450a7 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -7,24 +7,24 @@ // use core::ops::Bound::Included; -use std::{convert::TryInto, ops::AddAssign}; use std::collections::{BTreeMap, HashMap}; use std::error::Error; -use std::sync::{Arc,Condvar, Mutex}; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; -use std::time::Duration; +use std::sync::{Arc, Condvar, Mutex}; use std::thread; +use std::time::Duration; +use std::{convert::TryInto, ops::AddAssign}; // use tokio::sync::RwLock; use bytes::Bytes; use lazy_static::lazy_static; -use rand::Rng; use log::*; +use rand::Rng; -use crate::{PageServerConf, walredo}; +use crate::{walredo, PageServerConf}; use crossbeam_channel::unbounded; -use crossbeam_channel::{Sender, Receiver}; +use crossbeam_channel::{Receiver, Sender}; // Timeout when waiting or WAL receiver to catch up to an LSN given in a GetPage@LSN call. static TIMEOUT: Duration = Duration::from_secs(60); @@ -63,7 +63,6 @@ pub struct PageCacheStats { } impl AddAssign for PageCacheStats { - fn add_assign(&mut self, other: Self) { *self = Self { num_entries: self.num_entries + other.num_entries, @@ -81,7 +80,6 @@ impl AddAssign for PageCacheStats { // Shared data structure, holding page cache and related auxiliary information // struct PageCacheShared { - // The actual page cache pagecache: BTreeMap>, @@ -112,10 +110,10 @@ struct PageCacheShared { } lazy_static! { - pub static ref PAGECACHES : Mutex>> = Mutex::new(HashMap::new()); + pub static ref PAGECACHES: Mutex>> = Mutex::new(HashMap::new()); } -pub fn get_pagecahe(conf: PageServerConf, sys_id : u64) -> Arc { +pub fn get_pagecahe(conf: PageServerConf, sys_id: u64) -> Arc { let mut pcaches = PAGECACHES.lock().unwrap(); if !pcaches.contains_key(&sys_id) { @@ -137,20 +135,18 @@ pub fn get_pagecahe(conf: PageServerConf, sys_id : u64) -> Arc { pcaches.get(&sys_id).unwrap().clone() } -fn init_page_cache() -> PageCache -{ +fn init_page_cache() -> PageCache { // Initialize the channel between the page cache and the WAL applicator let (s, r) = unbounded(); PageCache { - shared: Mutex::new( - PageCacheShared { - pagecache: BTreeMap::new(), - relsize_cache: HashMap::new(), - first_valid_lsn: 0, - last_valid_lsn: 0, - last_record_lsn: 0, - }), + shared: Mutex::new(PageCacheShared { + pagecache: BTreeMap::new(), + relsize_cache: HashMap::new(), + first_valid_lsn: 0, + last_valid_lsn: 0, + last_record_lsn: 0, + }), valid_lsn_condvar: Condvar::new(), walredo_sender: s, @@ -165,10 +161,8 @@ fn init_page_cache() -> PageCache last_valid_lsn: AtomicU64::new(0), last_record_lsn: AtomicU64::new(0), } - } - // // We store two kinds of entries in the page cache: // @@ -185,7 +179,7 @@ fn init_page_cache() -> PageCache #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] pub struct CacheKey { pub tag: BufferTag, - pub lsn: u64 + pub lsn: u64, } pub struct CacheEntry { @@ -198,7 +192,7 @@ pub struct CacheEntry { // // FIXME: this takes quite a lot of space. Consider using parking_lot::Condvar // or something else. - pub walredo_condvar: Condvar + pub walredo_condvar: Condvar, } pub struct CacheEntryContent { @@ -221,7 +215,6 @@ impl CacheEntry { } } - #[derive(Eq, PartialEq, Hash, Clone, Copy)] pub struct RelTag { pub spcnode: u32, @@ -241,415 +234,409 @@ pub struct BufferTag { #[derive(Clone)] pub struct WALRecord { - pub lsn: u64, // LSN at the *end* of the record + pub lsn: u64, // LSN at the *end* of the record pub will_init: bool, - pub rec: Bytes + pub rec: Bytes, } - // Public interface functions impl PageCache { - -// -// GetPage@LSN -// -// Returns an 8k page image -// -pub fn get_page_at_lsn(&self, tag: BufferTag, lsn: u64) -> Result> -{ - self.num_getpage_requests.fetch_add(1, Ordering::Relaxed); - - // Look up cache entry. If it's a page image, return that. If it's a WAL record, - // ask the WAL redo service to reconstruct the page image from the WAL records. - let minkey = CacheKey { tag: tag, lsn: 0 }; - let maxkey = CacheKey { tag: tag, lsn: lsn }; - - let entry_rc: Arc; - { - let mut shared = self.shared.lock().unwrap(); - let mut waited = false; - - while lsn > shared.last_valid_lsn { - // TODO: Wait for the WAL receiver to catch up - waited = true; - trace!("not caught up yet: {}, requested {}", shared.last_valid_lsn, lsn); - let wait_result = self.valid_lsn_condvar.wait_timeout(shared, TIMEOUT).unwrap(); - - shared = wait_result.0; - if wait_result.1.timed_out() { - return Err(format!("Timed out while waiting for WAL record at LSN {} to arrive", lsn))?; - } - } - if waited { - trace!("caught up now, continuing"); - } - - if lsn < shared.first_valid_lsn { - return Err(format!("LSN {} has already been removed", lsn))?; - } - - let pagecache = &shared.pagecache; - - let mut entries = pagecache.range((Included(&minkey), Included(&maxkey))); - - let entry_opt = entries.next_back(); - - if entry_opt.is_none() { - static ZERO_PAGE:[u8; 8192] = [0 as u8; 8192]; - return Ok(Bytes::from_static(&ZERO_PAGE)); - /* return Err("could not find page image")?; */ - } - let (_key, entry) = entry_opt.unwrap(); - entry_rc = entry.clone(); - - // Now that we have a reference to the cache entry, drop the lock on the map. - // It's important to do this before waiting on the condition variable below, - // and better to do it as soon as possible to maximize concurrency. - } - - // Lock the cache entry and dig the page image out of it. - let page_img: Bytes; - { - let mut entry_content = entry_rc.content.lock().unwrap(); - - if let Some(img) = &entry_content.page_image { - assert!(!entry_content.apply_pending); - page_img = img.clone(); - } else if entry_content.wal_record.is_some() { - - // - // If this page needs to be reconstructed by applying some WAL, - // send a request to the WAL redo thread. - // - if !entry_content.apply_pending { - assert!(!entry_content.apply_pending); - entry_content.apply_pending = true; - - let s = &self.walredo_sender; - s.send(entry_rc.clone())?; - } - - while entry_content.apply_pending { - entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap(); - } - - // We should now have a page image. If we don't, it means that WAL redo - // failed to reconstruct it. WAL redo should've logged that error already. - page_img = match &entry_content.page_image { - Some(p) => p.clone(), - None => { - error!("could not apply WAL to reconstruct page image for GetPage@LSN request"); - return Err("could not apply WAL to reconstruct page image".into()); - } - }; - - } else { - // No base image, and no WAL record. Huh? - return Err(format!("no page image or WAL record for requested page"))?; - } - } - - // FIXME: assumes little-endian. Only used for the debugging log though - let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap()); - let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap()); - trace!("Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}", page_lsn_hi, page_lsn_lo, - tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum); - - return Ok(page_img); -} - -// -// Collect all the WAL records that are needed to reconstruct a page -// image for the given cache entry. -// -// Returns an old page image (if any), and a vector of WAL records to apply -// over it. -// -pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option, Vec) -{ - // Scan the BTreeMap backwards, starting from the given entry. - let shared = self.shared.lock().unwrap(); - let pagecache = &shared.pagecache; - - let minkey = CacheKey { - tag: entry.key.tag, - lsn: 0 - }; - let maxkey = CacheKey { - tag: entry.key.tag, - lsn: entry.key.lsn - }; - let entries = pagecache.range((Included(&minkey), Included(&maxkey))); - - // the last entry in the range should be the CacheEntry we were given - //let _last_entry = entries.next_back(); - //assert!(last_entry == entry); - - let mut base_img: Option = None; - let mut records: Vec = Vec::new(); - - // Scan backwards, collecting the WAL records, until we hit an - // old page image. - for (_key, e) in entries.rev() { - let e = e.content.lock().unwrap(); - - if let Some(img) = &e.page_image { - // We have a base image. No need to dig deeper into the list of - // records - base_img = Some(img.clone()); - break; - } else if let Some(rec) = &e.wal_record { - - records.push(rec.clone()); - - // If this WAL record initializes the page, no need to dig deeper. - if rec.will_init { - break; - } - } else { - panic!("no base image and no WAL record on cache entry"); - } - } - - records.reverse(); - return (base_img, records); -} - - -// -// Adds a WAL record to the page cache -// -pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) -{ - let key = CacheKey { - tag: tag, - lsn: rec.lsn - }; - - let entry = CacheEntry::new(key.clone()); - entry.content.lock().unwrap().wal_record = Some(rec); - - let mut shared = self.shared.lock().unwrap(); - - let rel_tag = RelTag { - spcnode: tag.spcnode, - dbnode: tag.dbnode, - relnode: tag.relnode, - forknum: tag.forknum, - }; - let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0); - if tag.blknum >= *rel_entry { - *rel_entry = tag.blknum + 1; - } - - trace!("put_wal_record lsn: {}", key.lsn); - - let oldentry = shared.pagecache.insert(key, Arc::new(entry)); - self.num_entries.fetch_add(1, Ordering::Relaxed); - - if !oldentry.is_none() { - error!("overwriting WAL record in page cache"); - } - - self.num_wal_records.fetch_add(1, Ordering::Relaxed); -} - -// -// Memorize a full image of a page version -// -pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) -{ - let key = CacheKey { - tag: tag, - lsn: lsn - }; - - let entry = CacheEntry::new(key.clone()); - entry.content.lock().unwrap().page_image = Some(img); - - let mut shared = self.shared.lock().unwrap(); - let pagecache = &mut shared.pagecache; - - let oldentry = pagecache.insert(key, Arc::new(entry)); - self.num_entries.fetch_add(1, Ordering::Relaxed); - assert!(oldentry.is_none()); - - //debug!("inserted page image for {}/{}/{}_{} blk {} at {}", - // tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn); - - self.num_page_images.fetch_add(1, Ordering::Relaxed); -} - -// -pub fn advance_last_valid_lsn(&self, lsn: u64) -{ - let mut shared = self.shared.lock().unwrap(); - - // Can't move backwards. - assert!(lsn >= shared.last_valid_lsn); - - shared.last_valid_lsn = lsn; - self.valid_lsn_condvar.notify_all(); - - self.last_valid_lsn.store(lsn, Ordering::Relaxed); -} - -// -// NOTE: this updates last_valid_lsn as well. -// -pub fn advance_last_record_lsn(&self, lsn: u64) -{ - let mut shared = self.shared.lock().unwrap(); - - // Can't move backwards. - assert!(lsn >= shared.last_valid_lsn); - assert!(lsn >= shared.last_record_lsn); - - shared.last_valid_lsn = lsn; - shared.last_record_lsn = lsn; - self.valid_lsn_condvar.notify_all(); - - self.last_valid_lsn.store(lsn, Ordering::Relaxed); - self.last_valid_lsn.store(lsn, Ordering::Relaxed); -} - -// -pub fn _advance_first_valid_lsn(&self, lsn: u64) -{ - let mut shared = self.shared.lock().unwrap(); - - // Can't move backwards. - assert!(lsn >= shared.first_valid_lsn); - - // Can't overtake last_valid_lsn (except when we're - // initializing the system and last_valid_lsn hasn't been set yet. - assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn); - - shared.first_valid_lsn = lsn; - self.first_valid_lsn.store(lsn, Ordering::Relaxed); -} - -pub fn init_valid_lsn(&self, lsn: u64) -{ - let mut shared = self.shared.lock().unwrap(); - - assert!(shared.first_valid_lsn == 0); - assert!(shared.last_valid_lsn == 0); - assert!(shared.last_record_lsn == 0); - - shared.first_valid_lsn = lsn; - shared.last_valid_lsn = lsn; - shared.last_record_lsn = lsn; - - self.first_valid_lsn.store(lsn, Ordering::Relaxed); - self.last_valid_lsn.store(lsn, Ordering::Relaxed); - self.last_record_lsn.store(lsn, Ordering::Relaxed); -} - -pub fn get_last_valid_lsn(&self) -> u64 -{ - let shared = self.shared.lock().unwrap(); - - return shared.last_record_lsn; -} - -// -// Simple test function for the WAL redo code: -// -// 1. Pick a page from the page cache at random. -// 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version) -// -// -pub fn _test_get_page_at_lsn(&self) -{ - // for quick testing of the get_page_at_lsn() funcion. // - // Get a random page from the page cache. Apply all its WAL, by requesting - // that page at the highest lsn. + // GetPage@LSN + // + // Returns an 8k page image + // + pub fn get_page_at_lsn(&self, tag: BufferTag, lsn: u64) -> Result> { + self.num_getpage_requests.fetch_add(1, Ordering::Relaxed); - let mut tag: Option = None; + // Look up cache entry. If it's a page image, return that. If it's a WAL record, + // ask the WAL redo service to reconstruct the page image from the WAL records. + let minkey = CacheKey { tag: tag, lsn: 0 }; + let maxkey = CacheKey { tag: tag, lsn: lsn }; - { + let entry_rc: Arc; + { + let mut shared = self.shared.lock().unwrap(); + let mut waited = false; + + while lsn > shared.last_valid_lsn { + // TODO: Wait for the WAL receiver to catch up + waited = true; + trace!( + "not caught up yet: {}, requested {}", + shared.last_valid_lsn, + lsn + ); + let wait_result = self + .valid_lsn_condvar + .wait_timeout(shared, TIMEOUT) + .unwrap(); + + shared = wait_result.0; + if wait_result.1.timed_out() { + return Err(format!( + "Timed out while waiting for WAL record at LSN {} to arrive", + lsn + ))?; + } + } + if waited { + trace!("caught up now, continuing"); + } + + if lsn < shared.first_valid_lsn { + return Err(format!("LSN {} has already been removed", lsn))?; + } + + let pagecache = &shared.pagecache; + + let mut entries = pagecache.range((Included(&minkey), Included(&maxkey))); + + let entry_opt = entries.next_back(); + + if entry_opt.is_none() { + static ZERO_PAGE: [u8; 8192] = [0 as u8; 8192]; + return Ok(Bytes::from_static(&ZERO_PAGE)); + /* return Err("could not find page image")?; */ + } + let (_key, entry) = entry_opt.unwrap(); + entry_rc = entry.clone(); + + // Now that we have a reference to the cache entry, drop the lock on the map. + // It's important to do this before waiting on the condition variable below, + // and better to do it as soon as possible to maximize concurrency. + } + + // Lock the cache entry and dig the page image out of it. + let page_img: Bytes; + { + let mut entry_content = entry_rc.content.lock().unwrap(); + + if let Some(img) = &entry_content.page_image { + assert!(!entry_content.apply_pending); + page_img = img.clone(); + } else if entry_content.wal_record.is_some() { + // + // If this page needs to be reconstructed by applying some WAL, + // send a request to the WAL redo thread. + // + if !entry_content.apply_pending { + assert!(!entry_content.apply_pending); + entry_content.apply_pending = true; + + let s = &self.walredo_sender; + s.send(entry_rc.clone())?; + } + + while entry_content.apply_pending { + entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap(); + } + + // We should now have a page image. If we don't, it means that WAL redo + // failed to reconstruct it. WAL redo should've logged that error already. + page_img = match &entry_content.page_image { + Some(p) => p.clone(), + None => { + error!( + "could not apply WAL to reconstruct page image for GetPage@LSN request" + ); + return Err("could not apply WAL to reconstruct page image".into()); + } + }; + } else { + // No base image, and no WAL record. Huh? + return Err(format!("no page image or WAL record for requested page"))?; + } + } + + // FIXME: assumes little-endian. Only used for the debugging log though + let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap()); + let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap()); + trace!( + "Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}", + page_lsn_hi, + page_lsn_lo, + tag.spcnode, + tag.dbnode, + tag.relnode, + tag.forknum, + tag.blknum + ); + + return Ok(page_img); + } + + // + // Collect all the WAL records that are needed to reconstruct a page + // image for the given cache entry. + // + // Returns an old page image (if any), and a vector of WAL records to apply + // over it. + // + pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option, Vec) { + // Scan the BTreeMap backwards, starting from the given entry. let shared = self.shared.lock().unwrap(); let pagecache = &shared.pagecache; - if pagecache.is_empty() { - info!("page cache is empty"); - return; - } + let minkey = CacheKey { + tag: entry.key.tag, + lsn: 0, + }; + let maxkey = CacheKey { + tag: entry.key.tag, + lsn: entry.key.lsn, + }; + let entries = pagecache.range((Included(&minkey), Included(&maxkey))); - // Find nth entry in the map, where n is picked at random - let n = rand::thread_rng().gen_range(0..pagecache.len()); - let mut i = 0; - for (key, _e) in pagecache.iter() { - if i == n { - tag = Some(key.tag); + // the last entry in the range should be the CacheEntry we were given + //let _last_entry = entries.next_back(); + //assert!(last_entry == entry); + + let mut base_img: Option = None; + let mut records: Vec = Vec::new(); + + // Scan backwards, collecting the WAL records, until we hit an + // old page image. + for (_key, e) in entries.rev() { + let e = e.content.lock().unwrap(); + + if let Some(img) = &e.page_image { + // We have a base image. No need to dig deeper into the list of + // records + base_img = Some(img.clone()); break; + } else if let Some(rec) = &e.wal_record { + records.push(rec.clone()); + + // If this WAL record initializes the page, no need to dig deeper. + if rec.will_init { + break; + } + } else { + panic!("no base image and no WAL record on cache entry"); + } + } + + records.reverse(); + return (base_img, records); + } + + // + // Adds a WAL record to the page cache + // + pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) { + let key = CacheKey { + tag: tag, + lsn: rec.lsn, + }; + + let entry = CacheEntry::new(key.clone()); + entry.content.lock().unwrap().wal_record = Some(rec); + + let mut shared = self.shared.lock().unwrap(); + + let rel_tag = RelTag { + spcnode: tag.spcnode, + dbnode: tag.dbnode, + relnode: tag.relnode, + forknum: tag.forknum, + }; + let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0); + if tag.blknum >= *rel_entry { + *rel_entry = tag.blknum + 1; + } + + trace!("put_wal_record lsn: {}", key.lsn); + + let oldentry = shared.pagecache.insert(key, Arc::new(entry)); + self.num_entries.fetch_add(1, Ordering::Relaxed); + + if !oldentry.is_none() { + error!("overwriting WAL record in page cache"); + } + + self.num_wal_records.fetch_add(1, Ordering::Relaxed); + } + + // + // Memorize a full image of a page version + // + pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) { + let key = CacheKey { tag: tag, lsn: lsn }; + + let entry = CacheEntry::new(key.clone()); + entry.content.lock().unwrap().page_image = Some(img); + + let mut shared = self.shared.lock().unwrap(); + let pagecache = &mut shared.pagecache; + + let oldentry = pagecache.insert(key, Arc::new(entry)); + self.num_entries.fetch_add(1, Ordering::Relaxed); + assert!(oldentry.is_none()); + + //debug!("inserted page image for {}/{}/{}_{} blk {} at {}", + // tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn); + + self.num_page_images.fetch_add(1, Ordering::Relaxed); + } + + // + pub fn advance_last_valid_lsn(&self, lsn: u64) { + let mut shared = self.shared.lock().unwrap(); + + // Can't move backwards. + assert!(lsn >= shared.last_valid_lsn); + + shared.last_valid_lsn = lsn; + self.valid_lsn_condvar.notify_all(); + + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + } + + // + // NOTE: this updates last_valid_lsn as well. + // + pub fn advance_last_record_lsn(&self, lsn: u64) { + let mut shared = self.shared.lock().unwrap(); + + // Can't move backwards. + assert!(lsn >= shared.last_valid_lsn); + assert!(lsn >= shared.last_record_lsn); + + shared.last_valid_lsn = lsn; + shared.last_record_lsn = lsn; + self.valid_lsn_condvar.notify_all(); + + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + } + + // + pub fn _advance_first_valid_lsn(&self, lsn: u64) { + let mut shared = self.shared.lock().unwrap(); + + // Can't move backwards. + assert!(lsn >= shared.first_valid_lsn); + + // Can't overtake last_valid_lsn (except when we're + // initializing the system and last_valid_lsn hasn't been set yet. + assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn); + + shared.first_valid_lsn = lsn; + self.first_valid_lsn.store(lsn, Ordering::Relaxed); + } + + pub fn init_valid_lsn(&self, lsn: u64) { + let mut shared = self.shared.lock().unwrap(); + + assert!(shared.first_valid_lsn == 0); + assert!(shared.last_valid_lsn == 0); + assert!(shared.last_record_lsn == 0); + + shared.first_valid_lsn = lsn; + shared.last_valid_lsn = lsn; + shared.last_record_lsn = lsn; + + self.first_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_record_lsn.store(lsn, Ordering::Relaxed); + } + + pub fn get_last_valid_lsn(&self) -> u64 { + let shared = self.shared.lock().unwrap(); + + return shared.last_record_lsn; + } + + // + // Simple test function for the WAL redo code: + // + // 1. Pick a page from the page cache at random. + // 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version) + // + // + pub fn _test_get_page_at_lsn(&self) { + // for quick testing of the get_page_at_lsn() funcion. + // + // Get a random page from the page cache. Apply all its WAL, by requesting + // that page at the highest lsn. + + let mut tag: Option = None; + + { + let shared = self.shared.lock().unwrap(); + let pagecache = &shared.pagecache; + + if pagecache.is_empty() { + info!("page cache is empty"); + return; + } + + // Find nth entry in the map, where n is picked at random + let n = rand::thread_rng().gen_range(0..pagecache.len()); + let mut i = 0; + for (key, _e) in pagecache.iter() { + if i == n { + tag = Some(key.tag); + break; + } + i += 1; + } + } + + info!("testing GetPage@LSN for block {}", tag.unwrap().blknum); + match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) { + Ok(_img) => { + // This prints out the whole page image. + //println!("{:X?}", img); + } + Err(error) => { + error!("GetPage@LSN failed: {}", error); } - i += 1; } } - info!("testing GetPage@LSN for block {}", tag.unwrap().blknum); - match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) { - Ok(_img) => { - // This prints out the whole page image. - //println!("{:X?}", img); - }, - Err(error) => { - error!("GetPage@LSN failed: {}", error); + // FIXME: Shouldn't relation size also be tracked with an LSN? + // If a replica is lagging behind, it needs to get the size as it was on + // the replica's current replay LSN. + pub fn relsize_inc(&self, rel: &RelTag, to: Option) { + let mut shared = self.shared.lock().unwrap(); + let entry = shared.relsize_cache.entry(*rel).or_insert(0); + + if let Some(to) = to { + if to >= *entry { + *entry = to + 1; + } + } + } + + pub fn relsize_get(&self, rel: &RelTag) -> u32 { + let mut shared = self.shared.lock().unwrap(); + let entry = shared.relsize_cache.entry(*rel).or_insert(0); + *entry + } + + pub fn relsize_exist(&self, rel: &RelTag) -> bool { + let shared = self.shared.lock().unwrap(); + let relsize_cache = &shared.relsize_cache; + relsize_cache.contains_key(rel) + } + + pub fn get_stats(&self) -> PageCacheStats { + PageCacheStats { + num_entries: self.num_entries.load(Ordering::Relaxed), + num_page_images: self.num_page_images.load(Ordering::Relaxed), + num_wal_records: self.num_wal_records.load(Ordering::Relaxed), + num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed), + first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed), + last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed), + last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed), } } } - -// FIXME: Shouldn't relation size also be tracked with an LSN? -// If a replica is lagging behind, it needs to get the size as it was on -// the replica's current replay LSN. -pub fn relsize_inc(&self, rel: &RelTag, to: Option) -{ - let mut shared = self.shared.lock().unwrap(); - let entry = shared.relsize_cache.entry(*rel).or_insert(0); - - if let Some(to) = to { - if to >= *entry { - *entry = to + 1; - } - } -} - -pub fn relsize_get(&self, rel: &RelTag) -> u32 -{ - let mut shared = self.shared.lock().unwrap(); - let entry = shared.relsize_cache.entry(*rel).or_insert(0); - *entry -} - -pub fn relsize_exist(&self, rel: &RelTag) -> bool -{ - let shared = self.shared.lock().unwrap(); - let relsize_cache = &shared.relsize_cache; - relsize_cache.contains_key(rel) -} - -pub fn get_stats(&self) -> PageCacheStats -{ - PageCacheStats { - num_entries: self.num_entries.load(Ordering::Relaxed), - num_page_images: self.num_page_images.load(Ordering::Relaxed), - num_wal_records: self.num_wal_records.load(Ordering::Relaxed), - num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed), - first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed), - last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed), - last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed), - } -} - -} - -pub fn get_stats() -> PageCacheStats -{ +pub fn get_stats() -> PageCacheStats { let pcaches = PAGECACHES.lock().unwrap(); let mut stats = PageCacheStats { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 91fe69a553..a5015d4e97 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,19 +10,19 @@ // *callmemaybe $url* -- ask pageserver to start walreceiver on $url // -use tokio::net::{TcpListener, TcpStream}; -use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; -use tokio::runtime; -use tokio::task; use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, Bytes, BytesMut}; -use std::{io}; -use std::thread; use log::*; +use std::io; +use std::thread; +use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::runtime; +use tokio::task; use crate::page_cache; -use crate::PageServerConf; use crate::walreceiver; +use crate::PageServerConf; type Result = std::result::Result; @@ -81,7 +81,7 @@ struct ZenithStatusResponse { struct ZenithReadResponse { ok: bool, n_blocks: u32, - page: Bytes + page: Bytes, } #[derive(Debug)] @@ -95,15 +95,15 @@ enum StartupRequestCode { Cancel, NegotiateSsl, NegotiateGss, - Normal + Normal, } impl FeStartupMessage { pub fn parse(buf: &mut BytesMut) -> Result> { const MAX_STARTUP_PACKET_LENGTH: u32 = 10000; const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678; - const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679; - const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680; + const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679; + const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680; if buf.len() < 4 { return Ok(None); @@ -123,11 +123,14 @@ impl FeStartupMessage { CANCEL_REQUEST_CODE => StartupRequestCode::Cancel, NEGOTIATE_SSL_CODE => StartupRequestCode::NegotiateSsl, NEGOTIATE_GSS_CODE => StartupRequestCode::NegotiateGss, - _ => StartupRequestCode::Normal + _ => StartupRequestCode::Normal, }; buf.advance(len as usize); - Ok(Some(FeMessage::StartupMessage(FeStartupMessage{version, kind}))) + Ok(Some(FeMessage::StartupMessage(FeStartupMessage { + version, + kind, + }))) } } @@ -139,7 +142,7 @@ struct Buffer { #[derive(Debug)] struct FeQueryMessage { - body: Bytes + body: Bytes, } impl FeMessage { @@ -171,7 +174,9 @@ impl FeMessage { body.advance(5); match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage{body:body.freeze()}))), + b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { + body: body.freeze(), + }))), b'X' => Ok(Some(FeMessage::Terminate)), b'd' => { let smgr_tag = body.get_u8(); @@ -197,15 +202,13 @@ impl FeMessage { _ => Err(io::Error::new( io::ErrorKind::InvalidInput, format!("unknown smgr message tag: {},'{:?}'", smgr_tag, buf), - )) + )), } - }, - tag => { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("unknown message tag: {},'{:?}'", tag, buf), - )) } + tag => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown message tag: {},'{:?}'", tag, buf), + )), } } } @@ -213,13 +216,15 @@ impl FeMessage { /////////////////////////////////////////////////////////////////////////////// pub fn thread_main(conf: PageServerConf) { - // Create a new thread pool // // FIXME: keep it single-threaded for now, make it easier to debug with gdb, // and we're not concerned with performance yet. //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap(); + let runtime = runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); info!("Starting page server on {}", conf.listen_addr); @@ -249,13 +254,12 @@ struct Connection { } impl Connection { - pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection { Connection { stream: BufWriter::new(socket), buffer: BytesMut::with_capacity(10 * 1024), init_done: false, - conf: conf + conf: conf, } } @@ -272,24 +276,24 @@ impl Connection { if self.buffer.is_empty() { return Ok(None); } else { - return Err(io::Error::new(io::ErrorKind::Other,"connection reset by peer")); + return Err(io::Error::new( + io::ErrorKind::Other, + "connection reset by peer", + )); } } } } fn parse_message(&mut self) -> Result> { - if !self.init_done { FeStartupMessage::parse(&mut self.buffer) } else { FeMessage::parse(&mut self.buffer) } - } async fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<()> { - match message { BeMessage::AuthenticationOk => { self.stream.write_u8(b'R').await?; @@ -308,16 +312,18 @@ impl Connection { let mut b = Bytes::from("data\0"); self.stream.write_u8(b'T').await?; - self.stream.write_i32(4 + 2 + b.len() as i32 + 3*(4 + 2)).await?; + self.stream + .write_i32(4 + 2 + b.len() as i32 + 3 * (4 + 2)) + .await?; self.stream.write_i16(1).await?; self.stream.write_buf(&mut b).await?; - self.stream.write_i32(0).await?; /* table oid */ - self.stream.write_i16(0).await?; /* attnum */ - self.stream.write_i32(25).await?; /* TEXTOID */ - self.stream.write_i16(-1).await?; /* typlen */ - self.stream.write_i32(0).await?; /* typmod */ - self.stream.write_i16(0).await?; /* format code */ + self.stream.write_i32(0).await?; /* table oid */ + self.stream.write_i16(0).await?; /* attnum */ + self.stream.write_i32(25).await?; /* TEXTOID */ + self.stream.write_i16(-1).await?; /* typlen */ + self.stream.write_i32(0).await?; /* typmod */ + self.stream.write_i16(0).await?; /* format code */ } // XXX: accept some text data @@ -371,7 +377,9 @@ impl Connection { BeMessage::ZenithReadResponse(resp) => { self.stream.write_u8(b'd').await?; - self.stream.write_u32(4 + 1 + 1 + 4 + resp.page.len() as u32).await?; + self.stream + .write_u32(4 + 1 + 1 + 4 + resp.page.len() as u32) + .await?; self.stream.write_u8(102).await?; /* tag from pagestore_client.h */ self.stream.write_u8(resp.ok as u8).await?; self.stream.write_u32(resp.n_blocks).await?; @@ -388,9 +396,7 @@ impl Connection { } async fn run(&mut self) -> Result<()> { - loop { - match self.read_message().await? { Some(FeMessage::StartupMessage(m)) => { trace!("got message {:?}", m); @@ -402,16 +408,17 @@ impl Connection { self.stream.flush().await?; } StartupRequestCode::Normal => { - self.write_message_noflush(&BeMessage::AuthenticationOk).await?; + self.write_message_noflush(&BeMessage::AuthenticationOk) + .await?; self.write_message(&BeMessage::ReadyForQuery).await?; self.init_done = true; - }, - StartupRequestCode::Cancel => return Ok(()) + } + StartupRequestCode::Cancel => return Ok(()), } - }, + } Some(FeMessage::Query(m)) => { self.process_query(&m).await?; - }, + } Some(FeMessage::Terminate) => { break; } @@ -420,7 +427,7 @@ impl Connection { break; } _ => { - return Err(io::Error::new(io::ErrorKind::Other,"unexpected message")); + return Err(io::Error::new(io::ErrorKind::Other, "unexpected message")); } } } @@ -428,25 +435,21 @@ impl Connection { Ok(()) } - async fn process_query(&mut self, q : &FeQueryMessage) -> Result<()> { + async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> { trace!("got query {:?}", q.body); if q.body.starts_with(b"controlfile") { - self.handle_controlfile().await - } else if q.body.starts_with(b"pagestream ") { - let (_l,r) = q.body.split_at("pagestream ".len()); + let (_l, r) = q.body.split_at("pagestream ".len()); let mut r = r.to_vec(); r.pop(); let sysid = String::from_utf8(r).unwrap().trim().to_string(); let sysid: u64 = sysid.parse().unwrap(); // XXX self.handle_pagerequests(sysid).await - } else if q.body.starts_with(b"callmemaybe ") { - - let (_l,r) = q.body.split_at("callmemaybe ".len()); + let (_l, r) = q.body.split_at("callmemaybe ".len()); let mut r = r.to_vec(); r.pop(); let connstr = String::from_utf8(r).unwrap().trim().to_string(); @@ -455,44 +458,49 @@ impl Connection { let _walreceiver_thread = thread::Builder::new() .name("WAL receiver thread".into()) .spawn(move || { - walreceiver::thread_main(conf_copy,&connstr); + walreceiver::thread_main(conf_copy, &connstr); }) .unwrap(); // generick ack: - self.write_message_noflush(&BeMessage::RowDescription).await?; + self.write_message_noflush(&BeMessage::RowDescription) + .await?; self.write_message_noflush(&BeMessage::DataRow).await?; - self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; self.write_message(&BeMessage::ReadyForQuery).await - } else if q.body.starts_with(b"status") { - self.write_message_noflush(&BeMessage::RowDescription).await?; + self.write_message_noflush(&BeMessage::RowDescription) + .await?; self.write_message_noflush(&BeMessage::DataRow).await?; - self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; self.write_message(&BeMessage::ReadyForQuery).await - } else { - self.write_message_noflush(&BeMessage::RowDescription).await?; + self.write_message_noflush(&BeMessage::RowDescription) + .await?; self.write_message_noflush(&BeMessage::DataRow).await?; - self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; self.write_message(&BeMessage::ReadyForQuery).await } } async fn handle_controlfile(&mut self) -> Result<()> { - self.write_message_noflush(&BeMessage::RowDescription).await?; + self.write_message_noflush(&BeMessage::RowDescription) + .await?; self.write_message_noflush(&BeMessage::ControlFile).await?; - self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; self.write_message(&BeMessage::ReadyForQuery).await } async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> { - /* switch client to COPYBOTH */ self.stream.write_u8(b'W').await?; self.stream.write_i32(4 + 1 + 2).await?; - self.stream.write_u8(0).await?; /* copy_is_binary */ - self.stream.write_i16(0).await?; /* numAttributes */ + self.stream.write_u8(0).await?; /* copy_is_binary */ + self.stream.write_i16(0).await?; /* numAttributes */ self.stream.flush().await?; let pcache = page_cache::get_pagecahe(self.conf.clone(), sysid); @@ -511,7 +519,6 @@ impl Connection { match message { Some(FeMessage::ZenithExistsRequest(req)) => { - let tag = page_cache::RelTag { spcnode: req.spcnode, dbnode: req.dbnode, @@ -523,23 +530,25 @@ impl Connection { self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: exist, - n_blocks: 0 - })).await? + n_blocks: 0, + })) + .await? } Some(FeMessage::ZenithTruncRequest(_)) => { self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, - n_blocks: 0 - })).await? + n_blocks: 0, + })) + .await? } Some(FeMessage::ZenithUnlinkRequest(_)) => { self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, - n_blocks: 0 - })).await? + n_blocks: 0, + })) + .await? } Some(FeMessage::ZenithNblocksRequest(req)) => { - let tag = page_cache::RelTag { spcnode: req.spcnode, dbnode: req.dbnode, @@ -551,8 +560,9 @@ impl Connection { self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse { ok: true, - n_blocks: n_blocks - })).await? + n_blocks: n_blocks, + })) + .await? } Some(FeMessage::ZenithReadRequest(req)) => { let buf_tag = page_cache::BufferTag { @@ -560,30 +570,27 @@ impl Connection { dbnode: req.dbnode, relnode: req.relnode, forknum: req.forknum, - blknum: req.blkno + blknum: req.blkno, }; let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn) { - Ok(p) => { - BeMessage::ZenithReadResponse(ZenithReadResponse { - ok: true, - n_blocks: 0, - page: p - }) - }, + Ok(p) => BeMessage::ZenithReadResponse(ZenithReadResponse { + ok: true, + n_blocks: 0, + page: p, + }), Err(e) => { - const ZERO_PAGE:[u8; 8192] = [0; 8192]; + const ZERO_PAGE: [u8; 8192] = [0; 8192]; error!("get_page_at_lsn: {}", e); BeMessage::ZenithReadResponse(ZenithReadResponse { ok: false, n_blocks: 0, - page: Bytes::from_static(&ZERO_PAGE) + page: Bytes::from_static(&ZERO_PAGE), }) } }; self.write_message(&msg).await? - } Some(FeMessage::ZenithCreateRequest(req)) => { let tag = page_cache::RelTag { @@ -597,8 +604,9 @@ impl Connection { self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, - n_blocks: 0 - })).await? + n_blocks: 0, + })) + .await? } Some(FeMessage::ZenithExtendRequest(req)) => { let tag = page_cache::RelTag { @@ -612,14 +620,12 @@ impl Connection { self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, - n_blocks: 0 - })).await? - } - _ => { - + n_blocks: 0, + })) + .await? } + _ => {} } } - } } diff --git a/pageserver/src/restore_s3.rs b/pageserver/src/restore_s3.rs index 17ed726c81..e603f1d1f8 100644 --- a/pageserver/src/restore_s3.rs +++ b/pageserver/src/restore_s3.rs @@ -7,11 +7,11 @@ // is started, it starts streaming from that LSN. // +use bytes::{Buf, BytesMut}; +use log::*; +use regex::Regex; use std::env; use std::fmt; -use regex::Regex; -use bytes::{BytesMut, Buf}; -use log::*; use s3::bucket::Bucket; use s3::creds::Credentials; @@ -22,12 +22,12 @@ use tokio::runtime; use futures::future; -use crate::{PageServerConf, page_cache}; +use crate::{page_cache, PageServerConf}; struct Storage { region: Region, credentials: Credentials, - bucket: String + bucket: String, } pub fn restore_main(conf: &PageServerConf) { @@ -38,7 +38,9 @@ pub fn restore_main(conf: &PageServerConf) { let result = restore_chunk(conf).await; match result { - Ok(_) => { return; }, + Ok(_) => { + return; + } Err(err) => { error!("S3 error: {}", err); return; @@ -56,7 +58,6 @@ pub fn restore_main(conf: &PageServerConf) { // Load it all into the page cache. // async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { - let backend = Storage { region: Region::Custom { region: env::var("S3_REGION").unwrap().into(), @@ -67,8 +68,10 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { Some(&env::var("S3_SECRET").unwrap()), None, None, - None).unwrap(), - bucket: "zenith-testbucket".to_string() + None, + ) + .unwrap(), + bucket: "zenith-testbucket".to_string(), }; info!("Restoring from S3..."); @@ -77,7 +80,9 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?; // List out contents of directory - let results: Vec = bucket.list("relationdata/".to_string(), Some("".to_string())).await?; + let results: Vec = bucket + .list("relationdata/".to_string(), Some("".to_string())) + .await?; // TODO: get that from backup let sys_id: u64 = 42; @@ -86,7 +91,6 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { for result in results { for object in result.contents { - // Download every relation file, slurping them into memory let key = object.key; @@ -104,7 +108,9 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { slurp_futures.push(f); } - Err(e) => { warn!("unrecognized file: {} ({})", relpath, e); } + Err(e) => { + warn!("unrecognized file: {} ({})", relpath, e); + } }; } } @@ -127,29 +133,28 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> { // From pg_tablespace_d.h // // FIXME: we'll probably need these elsewhere too, move to some common location -const DEFAULTTABLESPACE_OID:u32 = 1663; -const GLOBALTABLESPACE_OID:u32 = 1664; +const DEFAULTTABLESPACE_OID: u32 = 1663; +const GLOBALTABLESPACE_OID: u32 = 1664; #[derive(Debug)] struct FilePathError { - msg: String + msg: String, } impl FilePathError { fn new(msg: &str) -> FilePathError { FilePathError { - msg: msg.to_string() + msg: msg.to_string(), } } } - impl From for FilePathError { - fn from(e: core::num::ParseIntError) -> Self { - return FilePathError { msg: format!("invalid filename: {}", e) } + return FilePathError { + msg: format!("invalid filename: {}", e), + }; } - } impl fmt::Display for FilePathError { @@ -158,7 +163,6 @@ impl fmt::Display for FilePathError { } } - fn forkname_to_forknum(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present @@ -166,7 +170,7 @@ fn forkname_to_forknum(forkname: Option<&str>) -> Result { Some("fsm") => Ok(1), Some("vm") => Ok(2), Some("init") => Ok(3), - Some(_) => Err(FilePathError::new("invalid forkname")) + Some(_) => Err(FilePathError::new("invalid forkname")), } } @@ -188,20 +192,29 @@ struct ParsedBaseImageFileName { // _. fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> { - let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?_(?P[[:xdigit:]]{8})(?P[[:xdigit:]]{8})$").unwrap(); - let caps = re.captures(fname).ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + let caps = re + .captures(fname) + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; let relnode_str = caps.name("relnode").unwrap().as_str(); let relnode = u32::from_str_radix(relnode_str, 10)?; let forkname_match = caps.name("forkname"); - let forkname = if forkname_match.is_none() { None } else { Some(forkname_match.unwrap().as_str()) }; + let forkname = if forkname_match.is_none() { + None + } else { + Some(forkname_match.unwrap().as_str()) + }; let forknum = forkname_to_forknum(forkname)?; let segno_match = caps.name("segno"); - let segno = if segno_match.is_none() { 0 } else { u32::from_str_radix(segno_match.unwrap().as_str(), 10)? }; + let segno = if segno_match.is_none() { + 0 + } else { + u32::from_str_radix(segno_match.unwrap().as_str(), 10)? + }; let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?; let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?; @@ -211,7 +224,6 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> { } fn parse_rel_file_path(path: &str) -> Result { - /* * Relation data files can be in one of the following directories: * @@ -238,15 +250,20 @@ fn parse_rel_file_path(path: &str) -> Result Result Result= 8192 { - let tag = page_cache::BufferTag { spcnode: parsed.spcnode, dbnode: parsed.dbnode, relnode: parsed.relnode, forknum: parsed.forknum as u8, - blknum: blknum + blknum: blknum, }; pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192)); diff --git a/pageserver/src/tui.rs b/pageserver/src/tui.rs index f7f9686978..653600b82e 100644 --- a/pageserver/src/tui.rs +++ b/pageserver/src/tui.rs @@ -2,17 +2,17 @@ use crate::tui_event::{Event, Events}; use crate::tui_logger::TuiLogger; use crate::tui_logger::TuiLoggerWidget; -use std::{error::Error, io}; +use lazy_static::lazy_static; use std::sync::Arc; +use std::{error::Error, io}; use termion::{event::Key, input::MouseTerminal, raw::IntoRawMode, screen::AlternateScreen}; use tui::backend::TermionBackend; use tui::buffer::Buffer; -use tui::style::{Color, Style, Modifier}; +use tui::layout::{Constraint, Direction, Layout, Rect}; +use tui::style::{Color, Modifier, Style}; +use tui::text::{Span, Spans, Text}; +use tui::widgets::{Block, BorderType, Borders, Paragraph, Widget}; use tui::Terminal; -use tui::text::{Text, Span, Spans}; -use tui::widgets::{Widget, Block, Borders, BorderType, Paragraph}; -use tui::layout::{Layout, Direction, Constraint, Rect}; -use lazy_static::lazy_static; use slog; use slog::Drain; @@ -25,64 +25,69 @@ lazy_static! { } pub fn init_logging() -> slog_scope::GlobalLoggerGuard { - - let pageservice_drain = slog::Filter::new(PAGESERVICE_DRAIN.as_ref(), - |record: &slog::Record| { - if record.level().is_at_least(slog::Level::Debug) && record.module().starts_with("pageserver::page_service") { + let pageservice_drain = + slog::Filter::new(PAGESERVICE_DRAIN.as_ref(), |record: &slog::Record| { + if record.level().is_at_least(slog::Level::Debug) + && record.module().starts_with("pageserver::page_service") + { return true; } return false; - } - ).fuse(); + }) + .fuse(); - let walredo_drain = slog::Filter::new(WALREDO_DRAIN.as_ref(), - |record: &slog::Record| { - if record.level().is_at_least(slog::Level::Debug) && record.module().starts_with("pageserver::walredo") { + let walredo_drain = slog::Filter::new(WALREDO_DRAIN.as_ref(), |record: &slog::Record| { + if record.level().is_at_least(slog::Level::Debug) + && record.module().starts_with("pageserver::walredo") + { + return true; + } + return false; + }) + .fuse(); + + let walreceiver_drain = + slog::Filter::new(WALRECEIVER_DRAIN.as_ref(), |record: &slog::Record| { + if record.level().is_at_least(slog::Level::Debug) + && record.module().starts_with("pageserver::walreceiver") + { return true; } return false; - } - ).fuse(); + }) + .fuse(); - let walreceiver_drain = slog::Filter::new(WALRECEIVER_DRAIN.as_ref(), - |record: &slog::Record| { - if record.level().is_at_least(slog::Level::Debug) && record.module().starts_with("pageserver::walreceiver") { - return true; - } - return false; + let catchall_drain = slog::Filter::new(CATCHALL_DRAIN.as_ref(), |record: &slog::Record| { + if record.level().is_at_least(slog::Level::Info) { + return true; } - ).fuse(); - - let catchall_drain = slog::Filter::new(CATCHALL_DRAIN.as_ref(), - |record: &slog::Record| { - if record.level().is_at_least(slog::Level::Info) { - return true; - } - if record.level().is_at_least(slog::Level::Debug) && record.module().starts_with("pageserver") { - return true; - } - return false; + if record.level().is_at_least(slog::Level::Debug) + && record.module().starts_with("pageserver") + { + return true; } - ).fuse(); + return false; + }) + .fuse(); let drain = pageservice_drain; let drain = slog::Duplicate::new(drain, walreceiver_drain).fuse(); let drain = slog::Duplicate::new(drain, walredo_drain).fuse(); let drain = slog::Duplicate::new(drain, catchall_drain).fuse(); let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse(); - let drain = slog::Filter::new(drain, - |record: &slog::Record| { + let drain = slog::Filter::new(drain, |record: &slog::Record| { + if record.level().is_at_least(slog::Level::Info) { + return true; + } + if record.level().is_at_least(slog::Level::Debug) + && record.module().starts_with("pageserver") + { + return true; + } - if record.level().is_at_least(slog::Level::Info) { - return true; - } - if record.level().is_at_least(slog::Level::Debug) && record.module().starts_with("pageserver") { - return true; - } - - return false; - } - ).fuse(); + return false; + }) + .fuse(); let logger = slog::Logger::root(drain, slog::o!()); return slog_scope::set_global_logger(logger); } @@ -143,21 +148,27 @@ pub fn ui_main<'b>() -> Result<(), Box> { let top_top_right_chunk = c[0]; let top_bot_right_chunk = c[1]; - f.render_widget(LogWidget::new(PAGESERVICE_DRAIN.as_ref(),"Page Service"), - top_top_left_chunk); + f.render_widget( + LogWidget::new(PAGESERVICE_DRAIN.as_ref(), "Page Service"), + top_top_left_chunk, + ); - f.render_widget(LogWidget::new(WALREDO_DRAIN.as_ref(), "WAL Redo"), - top_bot_left_chunk); + f.render_widget( + LogWidget::new(WALREDO_DRAIN.as_ref(), "WAL Redo"), + top_bot_left_chunk, + ); - f.render_widget(LogWidget::new(WALRECEIVER_DRAIN.as_ref(), "WAL Receiver"), - top_top_right_chunk); + f.render_widget( + LogWidget::new(WALRECEIVER_DRAIN.as_ref(), "WAL Receiver"), + top_top_right_chunk, + ); f.render_widget(MetricsWidget {}, top_bot_right_chunk); - f.render_widget(LogWidget::new(CATCHALL_DRAIN.as_ref(), "All Log") - .show_module(true), - bottom_chunk); - + f.render_widget( + LogWidget::new(CATCHALL_DRAIN.as_ref(), "All Log").show_module(true), + bottom_chunk, + ); })?; // If ther user presses 'q', quit. @@ -177,7 +188,6 @@ pub fn ui_main<'b>() -> Result<(), Box> { Ok(()) } - struct LogWidget<'a> { logger: &'a TuiLogger, title: &'a str, @@ -186,7 +196,11 @@ struct LogWidget<'a> { impl<'a> LogWidget<'a> { fn new(logger: &'a TuiLogger, title: &'a str) -> LogWidget<'a> { - LogWidget { logger, title, show_module: false } + LogWidget { + logger, + title, + show_module: false, + } } fn show_module(mut self, b: bool) -> LogWidget<'a> { @@ -196,14 +210,14 @@ impl<'a> LogWidget<'a> { } impl<'a> Widget for LogWidget<'a> { - fn render(self, area: Rect, buf: &mut Buffer) { - let w = TuiLoggerWidget::default(self.logger) - .block(Block::default() - .borders(Borders::ALL) - .title(self.title) - .border_type(BorderType::Rounded)) + .block( + Block::default() + .borders(Borders::ALL) + .title(self.title) + .border_type(BorderType::Rounded), + ) .show_module(true) .style_error(Style::default().fg(Color::Red)) .style_warn(Style::default().fg(Color::Yellow)) @@ -213,14 +227,16 @@ impl<'a> Widget for LogWidget<'a> { } // Render a widget to show some metrics -struct MetricsWidget { -} +struct MetricsWidget {} fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> { Spans::from(vec![ Span::styled(format!("{:<20}", title), Style::default()), Span::raw(": "), - Span::styled(value.to_string(), Style::default().add_modifier(Modifier::BOLD)), + Span::styled( + value.to_string(), + Style::default().add_modifier(Modifier::BOLD), + ), ]) } @@ -235,21 +251,16 @@ fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> { // FIXME: We really should define a datatype for LSNs, with Display trait and // helper functions. There's one in tokio-postgres, but I don't think we want // to rely on that. -fn format_lsn(lsn: u64) -> String -{ - return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff) +fn format_lsn(lsn: u64) -> String { + return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff); } impl tui::widgets::Widget for MetricsWidget { - - fn render(self, area: Rect, buf: &mut Buffer) { - - let block = Block::default() - .borders(Borders::ALL) - .title("Page Cache Metrics") - .border_type(BorderType::Rounded); + .borders(Borders::ALL) + .title("Page Cache Metrics") + .border_type(BorderType::Rounded); let inner_area = block.inner(area); block.render(area, buf); @@ -257,17 +268,30 @@ impl tui::widgets::Widget for MetricsWidget { let mut lines: Vec = Vec::new(); let page_cache_stats = crate::page_cache::get_stats(); - let lsnrange = format!("{} - {}", - format_lsn(page_cache_stats.first_valid_lsn), - format_lsn(page_cache_stats.last_valid_lsn)); - let last_valid_recordlsn_str = - format_lsn(page_cache_stats.last_record_lsn); + let lsnrange = format!( + "{} - {}", + format_lsn(page_cache_stats.first_valid_lsn), + format_lsn(page_cache_stats.last_valid_lsn) + ); + let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn); lines.push(get_metric_str("Valid LSN range", &lsnrange)); lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str)); - lines.push(get_metric_u64("# of cache entries", page_cache_stats.num_entries)); - lines.push(get_metric_u64("# of page images", page_cache_stats.num_page_images)); - lines.push(get_metric_u64("# of WAL records", page_cache_stats.num_wal_records)); - lines.push(get_metric_u64("# of GetPage@LSN calls", page_cache_stats.num_getpage_requests)); + lines.push(get_metric_u64( + "# of cache entries", + page_cache_stats.num_entries, + )); + lines.push(get_metric_u64( + "# of page images", + page_cache_stats.num_page_images, + )); + lines.push(get_metric_u64( + "# of WAL records", + page_cache_stats.num_wal_records, + )); + lines.push(get_metric_u64( + "# of GetPage@LSN calls", + page_cache_stats.num_getpage_requests, + )); let text = Text::from(lines); diff --git a/pageserver/src/tui_logger.rs b/pageserver/src/tui_logger.rs index c1a563cf90..0b49dcc388 100644 --- a/pageserver/src/tui_logger.rs +++ b/pageserver/src/tui_logger.rs @@ -8,19 +8,19 @@ // Also, I didn't do any of the "hot log" stuff that gin66's implementation had, you can use an // AsyncDrain to buffer and handle overflow if desired. // -use std::collections::VecDeque; -use std::sync::Mutex; -use std::time::SystemTime; use chrono::offset::Local; use chrono::DateTime; use slog; -use slog::{Drain, OwnedKVList, Record, Level}; +use slog::{Drain, Level, OwnedKVList, Record}; use slog_async::AsyncRecord; +use std::collections::VecDeque; +use std::sync::Mutex; +use std::time::SystemTime; use tui::buffer::Buffer; -use tui::layout::{Rect}; -use tui::style::{Style, Modifier}; +use tui::layout::Rect; +use tui::style::{Modifier, Style}; use tui::text::{Span, Spans}; -use tui::widgets::{Block, Widget, Paragraph, Wrap}; +use tui::widgets::{Block, Paragraph, Widget, Wrap}; // Size of the log ring buffer, in # of records static BUFFER_SIZE: usize = 1000; @@ -41,11 +41,7 @@ impl Drain for TuiLogger { type Ok = (); type Err = slog::Error; - fn log(&self, - record: &Record, - values: &OwnedKVList) - -> Result { - + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { let mut events = self.events.lock().unwrap(); let now = SystemTime::now(); @@ -129,7 +125,6 @@ impl<'b> TuiLoggerWidget<'b> { } impl<'b> Widget for TuiLoggerWidget<'b> { fn render(mut self, area: Rect, buf: &mut Buffer) { - buf.set_style(area, self.style); let list_area = match self.block.take() { Some(b) => { @@ -156,7 +151,6 @@ impl<'b> Widget for TuiLoggerWidget<'b> { let events = self.logger.events.lock().unwrap(); for evt in events.iter() { - let (timestamp, rec) = evt; rec.as_record_values(|rec, _kwlist| { @@ -200,7 +194,7 @@ impl<'b> Widget for TuiLoggerWidget<'b> { let text = tui::text::Text::from(lines); Paragraph::new(text) - .wrap(Wrap { trim: true } ) + .wrap(Wrap { trim: true }) .render(list_area, buf); } } diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index e44af1de7f..1f1a5dfc99 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -13,45 +13,41 @@ use log::*; const XLOG_BLCKSZ: u32 = 8192; // FIXME: this is configurable in PostgreSQL, 16 MB is the default -const WAL_SEGMENT_SIZE: u64 = 16*1024*1024; - +const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; // From PostgreSQL headers #[repr(C)] #[derive(Debug)] -struct XLogPageHeaderData -{ - xlp_magic: u16, /* magic value for correctness checks */ - xlp_info: u16, /* flag bits, see below */ - xlp_tli: u32, /* TimeLineID of first record on page */ - xlp_pageaddr: u64, /* XLOG address of this page */ - xlp_rem_len: u32, /* total len of remaining data for record */ +struct XLogPageHeaderData { + xlp_magic: u16, /* magic value for correctness checks */ + xlp_info: u16, /* flag bits, see below */ + xlp_tli: u32, /* TimeLineID of first record on page */ + xlp_pageaddr: u64, /* XLOG address of this page */ + xlp_rem_len: u32, /* total len of remaining data for record */ } // FIXME: this assumes MAXIMUM_ALIGNOF 8. There are 4 padding bytes at end #[allow(non_upper_case_globals)] -const SizeOfXLogShortPHD: usize = 2+2+4+8+4 + 4; +const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4; #[repr(C)] #[derive(Debug)] -struct XLogLongPageHeaderData -{ - std: XLogPageHeaderData, /* standard header fields */ - xlp_sysid: u64, /* system identifier from pg_control */ - xlp_seg_size: u32, /* just as a cross-check */ - xlp_xlog_blcksz: u32, /* just as a cross-check */ +struct XLogLongPageHeaderData { + std: XLogPageHeaderData, /* standard header fields */ + xlp_sysid: u64, /* system identifier from pg_control */ + xlp_seg_size: u32, /* just as a cross-check */ + xlp_xlog_blcksz: u32, /* just as a cross-check */ } // FIXME: this assumes MAXIMUM_ALIGNOF 8. #[allow(non_upper_case_globals)] -const SizeOfXLogLongPHD: usize = (2+2+4+8+4) + 4 + 8 + 4 + 4; +const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4; pub struct WalStreamDecoder { - lsn: u64, - startlsn: u64, // LSN where this record starts + startlsn: u64, // LSN where this record starts contlen: u32, padlen: u32, @@ -65,7 +61,6 @@ pub struct WalStreamDecoder { // FIXME: This isn't a proper rust stream // impl WalStreamDecoder { - pub fn new(lsn: u64) -> WalStreamDecoder { WalStreamDecoder { lsn: lsn, @@ -86,7 +81,6 @@ impl WalStreamDecoder { // Returns a tuple: // (end LSN, record) pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> { - loop { // parse and verify page boundaries as we go if self.lsn % WAL_SEGMENT_SIZE == 0 { @@ -115,9 +109,7 @@ impl WalStreamDecoder { // TODO: verify the fields in the header continue; - } - else if self.padlen > 0 - { + } else if self.padlen > 0 { if self.inputbuf.remaining() < self.padlen as usize { return None; } @@ -126,9 +118,7 @@ impl WalStreamDecoder { self.inputbuf.advance(self.padlen as usize); self.lsn += self.padlen as u64; self.padlen = 0; - } - else if self.contlen == 0 - { + } else if self.contlen == 0 { // need to have at least the xl_tot_len field if self.inputbuf.remaining() < 4 { @@ -139,8 +129,12 @@ impl WalStreamDecoder { self.startlsn = self.lsn; let xl_tot_len = self.inputbuf.get_u32_le(); if xl_tot_len < SizeOfXLogRecord { - error!("invalid xl_tot_len {} at {:X}/{:X}", xl_tot_len, - self.lsn >> 32, self.lsn & 0xffffffff); + error!( + "invalid xl_tot_len {} at {:X}/{:X}", + xl_tot_len, + self.lsn >> 32, + self.lsn & 0xffffffff + ); panic!(); } self.lsn += 4; @@ -151,11 +145,9 @@ impl WalStreamDecoder { self.contlen = xl_tot_len - 4; continue; - } - else - { + } else { // we're continuing a record, possibly from previous page. - let pageleft:u32 = XLOG_BLCKSZ - (self.lsn % (XLOG_BLCKSZ as u64)) as u32; + let pageleft: u32 = XLOG_BLCKSZ - (self.lsn % (XLOG_BLCKSZ as u64)) as u32; // read the rest of the record, or as much as fits on this page. let n = min(self.contlen, pageleft) as usize; @@ -176,8 +168,11 @@ impl WalStreamDecoder { // XLOG_SWITCH records are special. If we see one, we need to skip // to the next WAL segment. if is_xlog_switch_record(&recordbuf) { - trace!("saw xlog switch record at {:X}/{:X}", - (self.lsn >> 32), self.lsn & 0xffffffff); + trace!( + "saw xlog switch record at {:X}/{:X}", + (self.lsn >> 32), + self.lsn & 0xffffffff + ); self.padlen = (WAL_SEGMENT_SIZE - (self.lsn % WAL_SEGMENT_SIZE)) as u32; } @@ -195,24 +190,21 @@ impl WalStreamDecoder { // deal with continuation records - // deal with xlog_switch records } - #[allow(non_snake_case)] fn decode_XLogPageHeaderData(&mut self) -> XLogPageHeaderData { - let buf = &mut self.inputbuf; // FIXME: Assume little-endian - let hdr : XLogPageHeaderData = XLogPageHeaderData { + let hdr: XLogPageHeaderData = XLogPageHeaderData { xlp_magic: buf.get_u16_le(), xlp_info: buf.get_u16_le(), xlp_tli: buf.get_u32_le(), xlp_pageaddr: buf.get_u64_le(), - xlp_rem_len: buf.get_u32_le() + xlp_rem_len: buf.get_u32_le(), }; // 4 bytes of padding, on 64-bit systems buf.advance(4); @@ -225,8 +217,7 @@ impl WalStreamDecoder { #[allow(non_snake_case)] fn decode_XLogLongPageHeaderData(&mut self) -> XLogLongPageHeaderData { - - let hdr : XLogLongPageHeaderData = XLogLongPageHeaderData { + let hdr: XLogLongPageHeaderData = XLogLongPageHeaderData { std: self.decode_XLogPageHeaderData(), xlp_sysid: self.inputbuf.get_u64_le(), xlp_seg_size: self.inputbuf.get_u32_le(), @@ -238,30 +229,29 @@ impl WalStreamDecoder { } // FIXME: -const BLCKSZ:u16 = 8192; +const BLCKSZ: u16 = 8192; // // Constants from xlogrecord.h // -const XLR_MAX_BLOCK_ID:u8 = 32; +const XLR_MAX_BLOCK_ID: u8 = 32; -const XLR_BLOCK_ID_DATA_SHORT:u8 = 255; -const XLR_BLOCK_ID_DATA_LONG:u8 = 254; -const XLR_BLOCK_ID_ORIGIN:u8 = 253; -const XLR_BLOCK_ID_TOPLEVEL_XID:u8 = 252; +const XLR_BLOCK_ID_DATA_SHORT: u8 = 255; +const XLR_BLOCK_ID_DATA_LONG: u8 = 254; +const XLR_BLOCK_ID_ORIGIN: u8 = 253; +const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252; -const BKPBLOCK_FORK_MASK:u8 = 0x0F; -const _BKPBLOCK_FLAG_MASK:u8 = 0xF0; -const BKPBLOCK_HAS_IMAGE:u8 = 0x10; /* block data is an XLogRecordBlockImage */ -const BKPBLOCK_HAS_DATA:u8 = 0x20; -const BKPBLOCK_WILL_INIT:u8 = 0x40; /* redo will re-init the page */ -const BKPBLOCK_SAME_REL:u8 = 0x80; /* RelFileNode omitted, same as previous */ +const BKPBLOCK_FORK_MASK: u8 = 0x0F; +const _BKPBLOCK_FLAG_MASK: u8 = 0xF0; +const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */ +const BKPBLOCK_HAS_DATA: u8 = 0x20; +const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */ +const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */ /* Information stored in bimg_info */ -const BKPIMAGE_HAS_HOLE:u8 = 0x01; /* page image has "hole" */ -const BKPIMAGE_IS_COMPRESSED:u8 = 0x02; /* page image is compressed */ -const BKPIMAGE_APPLY:u8 = 0x04; /* page image should be restored during replay */ - +const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ +const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub struct DecodedBkpBlock { /* Is this block ref in use? */ @@ -278,8 +268,8 @@ pub struct DecodedBkpBlock { flags: u8, /* Information on full-page image, if any */ - has_image: bool, /* has image, even for consistency checking */ - pub apply_image: bool, /* has image that should be restored */ + has_image: bool, /* has image, even for consistency checking */ + pub apply_image: bool, /* has image that should be restored */ pub will_init: bool, //char *bkp_image; hole_offset: u16, @@ -290,22 +280,22 @@ pub struct DecodedBkpBlock { /* Buffer holding the rmgr-specific data associated with this block */ has_data: bool, //char *data; - data_len:u16, + data_len: u16, } #[allow(non_upper_case_globals)] -const SizeOfXLogRecord:u32 = 24; +const SizeOfXLogRecord: u32 = 24; pub struct DecodedWALRecord { - pub lsn: u64, // LSN at the *end* of the record - pub record: Bytes, // raw XLogRecord + pub lsn: u64, // LSN at the *end* of the record + pub record: Bytes, // raw XLogRecord - pub blocks: Vec + pub blocks: Vec, } // From pg_control.h and rmgrlist.h -const XLOG_SWITCH:u8 = 0x40; -const RM_XLOG_ID:u8 = 0; +const XLOG_SWITCH: u8 = 0x40; +const RM_XLOG_ID: u8 = 0; // Is this record an XLOG_SWITCH record? They need some special processing, // so we need to check for that before the rest of the parsing. @@ -320,7 +310,7 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool { let _xl_prev = buf.get_u64_le(); let xl_info = buf.get_u8(); let xl_rmid = buf.get_u8(); - buf.advance(2); // 2 bytes of padding + buf.advance(2); // 2 bytes of padding let _xl_crc = buf.get_u32_le(); return xl_info == XLOG_SWITCH && xl_rmid == RM_XLOG_ID; @@ -330,8 +320,12 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool { // Routines to decode a WAL record and figure out which blocks are modified // pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { - - trace!("decoding record with LSN {:08X}/{:08X} ({} bytes)", lsn >> 32, lsn & 0xffff_ffff, rec.remaining()); + trace!( + "decoding record with LSN {:08X}/{:08X} ({} bytes)", + lsn >> 32, + lsn & 0xffff_ffff, + rec.remaining() + ); let mut buf = rec.clone(); @@ -341,7 +335,7 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { let _xl_prev = buf.get_u64_le(); let _xl_info = buf.get_u8(); let _xl_rmid = buf.get_u8(); - buf.advance(2); // 2 bytes of padding + buf.advance(2); // 2 bytes of padding let _xl_crc = buf.get_u32_le(); let remaining = xl_tot_len - SizeOfXLogRecord; @@ -365,31 +359,31 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { match block_id { XLR_BLOCK_ID_DATA_SHORT => { - /* XLogRecordDataHeaderShort */ - let main_data_len = buf.get_u8() as u32; + /* XLogRecordDataHeaderShort */ + let main_data_len = buf.get_u8() as u32; datatotal += main_data_len; } XLR_BLOCK_ID_DATA_LONG => { - /* XLogRecordDataHeaderShort */ - let main_data_len = buf.get_u32(); + /* XLogRecordDataHeaderShort */ + let main_data_len = buf.get_u32(); datatotal += main_data_len; } - XLR_BLOCK_ID_ORIGIN => { + XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 buf.advance(2); } - XLR_BLOCK_ID_TOPLEVEL_XID => { + XLR_BLOCK_ID_TOPLEVEL_XID => { // TransactionId is uint32 buf.advance(4); } - 0 ..= XLR_MAX_BLOCK_ID => { - /* XLogRecordBlockHeader */ + 0..=XLR_MAX_BLOCK_ID => { + /* XLogRecordBlockHeader */ let mut blk = DecodedBkpBlock { rnode_spcnode: 0, rnode_dbnode: 0, @@ -407,168 +401,157 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { bimg_info: 0, has_data: false, - data_len: 0 + data_len: 0, }; let fork_flags: u8; - if block_id <= max_block_id { + if block_id <= max_block_id { // TODO - //report_invalid_record(state, - // "out-of-order block_id %u at %X/%X", - // block_id, - // (uint32) (state->ReadRecPtr >> 32), - // (uint32) state->ReadRecPtr); - // goto err; - } - max_block_id = block_id; + //report_invalid_record(state, + // "out-of-order block_id %u at %X/%X", + // block_id, + // (uint32) (state->ReadRecPtr >> 32), + // (uint32) state->ReadRecPtr); + // goto err; + } + max_block_id = block_id; fork_flags = buf.get_u8(); blk.forknum = fork_flags & BKPBLOCK_FORK_MASK; blk.flags = fork_flags; - blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0; - blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0; - blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0; + blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0; + blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0; + blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0; blk.data_len = buf.get_u16_le(); - /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ + /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ // TODO /* - if (blk->has_data && blk->data_len == 0) - { - report_invalid_record(state, - "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - } - if (!blk->has_data && blk->data_len != 0) - { - report_invalid_record(state, - "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", - (unsigned int) blk->data_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - } - */ - datatotal += blk.data_len as u32; - - if blk.has_image { + if (blk->has_data && blk->data_len == 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + if (!blk->has_data && blk->data_len != 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + */ + datatotal += blk.data_len as u32; + if blk.has_image { blk.bimg_len = buf.get_u16_le(); blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & BKPIMAGE_APPLY) != 0; + blk.apply_image = (blk.bimg_info & BKPIMAGE_APPLY) != 0; - if blk.bimg_info & BKPIMAGE_IS_COMPRESSED != 0 - { - if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 { - blk.hole_length = buf.get_u16_le(); - } else { - blk.hole_length = 0; + if blk.bimg_info & BKPIMAGE_IS_COMPRESSED != 0 { + if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 { + blk.hole_length = buf.get_u16_le(); + } else { + blk.hole_length = 0; } - } - else { - blk.hole_length = BLCKSZ - blk.bimg_len; + } else { + blk.hole_length = BLCKSZ - blk.bimg_len; } - datatotal += blk.bimg_len as u32; + datatotal += blk.bimg_len as u32; - /* - * cross-check that hole_offset > 0, hole_length > 0 and - * bimg_len < BLCKSZ if the HAS_HOLE flag is set. - */ - if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 && - (blk.hole_offset == 0 || - blk.hole_length == 0 || - blk.bimg_len == BLCKSZ) - { + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + */ + if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) + { // TODO /* - report_invalid_record(state, - "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } - /* - * cross-check that hole_offset == 0 and hole_length == 0 if - * the HAS_HOLE flag is not set. - */ - if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0 && - (blk.hole_offset != 0 || blk.hole_length != 0) - { + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0 + && (blk.hole_offset != 0 || blk.hole_length != 0) + { // TODO /* - report_invalid_record(state, - "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } - /* - * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED - * flag is set. - */ - if (blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0) && - blk.bimg_len == BLCKSZ - { + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. + */ + if (blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0) && blk.bimg_len == BLCKSZ { // TODO /* - report_invalid_record(state, - "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } + report_invalid_record(state, + "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } - /* - * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor - * IS_COMPRESSED flag is set. - */ - if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0 && - blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0 && - blk.bimg_len != BLCKSZ - { + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. + */ + if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0 + && blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0 + && blk.bimg_len != BLCKSZ + { // TODO /* - report_invalid_record(state, - "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", - (unsigned int) blk->data_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } + report_invalid_record(state, + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } } - if fork_flags & BKPBLOCK_SAME_REL == 0 - { - rnode_spcnode = buf.get_u32_le(); + if fork_flags & BKPBLOCK_SAME_REL == 0 { + rnode_spcnode = buf.get_u32_le(); rnode_dbnode = buf.get_u32_le(); rnode_relnode = buf.get_u32_le(); - //rnode = &blk->rnode; + //rnode = &blk->rnode; got_rnode = true; - } - else - { - if !got_rnode - { + } else { + if !got_rnode { // TODO /* - report_invalid_record(state, - "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } + report_invalid_record(state, + "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } - //blk->rnode = *rnode; - } + //blk->rnode = *rnode; + } blk.rnode_spcnode = rnode_spcnode; blk.rnode_dbnode = rnode_dbnode; blk.rnode_relnode = rnode_relnode; @@ -601,6 +584,6 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { return DecodedWALRecord { lsn: lsn, record: rec, - blocks: blocks - } + blocks: blocks, + }; } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 814721a541..54b57aaa11 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -7,23 +7,22 @@ // use log::*; -use tokio_stream::StreamExt; use tokio::runtime; use tokio::time::{sleep, Duration}; +use tokio_stream::StreamExt; -use crate::waldecoder::WalStreamDecoder; use crate::page_cache; use crate::page_cache::BufferTag; +use crate::waldecoder::WalStreamDecoder; use crate::PageServerConf; -use tokio_postgres::{connect_replication, NoTls, Error, ReplicationMode}; use postgres_protocol::message::backend::ReplicationMessage; +use tokio_postgres::{connect_replication, Error, NoTls, ReplicationMode}; // // This is the entry point for the WAL receiver thread. // pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) { - info!("WAL receiver thread started: '{}'", wal_producer_connstr); let runtime = runtime::Builder::new_current_thread() @@ -31,26 +30,32 @@ pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) { .build() .unwrap(); - runtime.block_on( async { + runtime.block_on(async { loop { let _res = walreceiver_main(conf.clone(), wal_producer_connstr).await; // TODO: print/log the error - info!("WAL streaming connection failed, retrying in 1 second...: {:?}", _res); + info!( + "WAL streaming connection failed, retrying in 1 second...: {:?}", + _res + ); sleep(Duration::from_secs(1)).await; } }); } -async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) -> Result<(), Error> { - +async fn walreceiver_main( + conf: PageServerConf, + wal_producer_connstr: &String, +) -> Result<(), Error> { // Connect to the database in replication mode. debug!("connecting to {}...", wal_producer_connstr); let (mut rclient, connection) = connect_replication( wal_producer_connstr.as_str(), NoTls, - ReplicationMode::Physical - ).await?; + ReplicationMode::Physical, + ) + .await?; debug!("connected!"); // The connection object performs the actual communication with the database, @@ -65,7 +70,7 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - let end_of_wal = u64::from(identify_system.xlogpos()); let mut caught_up = false; - let sysid : u64 = identify_system.systemid().parse().unwrap(); + let sysid: u64 = identify_system.systemid().parse().unwrap(); let pcache = page_cache::get_pagecahe(conf, sysid); // @@ -93,9 +98,13 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - startpoint += 8 - (startpoint % 8); } } - debug!("starting replication from {:X}/{:X}, server is at {:X}/{:X}...", - (startpoint >> 32), (startpoint & 0xffffffff), - (end_of_wal >> 32), (end_of_wal & 0xffffffff)); + debug!( + "starting replication from {:X}/{:X}, server is at {:X}/{:X}...", + (startpoint >> 32), + (startpoint & 0xffffffff), + (end_of_wal >> 32), + (end_of_wal & 0xffffffff) + ); let startpoint = tokio_postgres::types::Lsn::from(startpoint); let mut physical_stream = rclient .start_physical_replication(None, startpoint, None) @@ -105,23 +114,26 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - while let Some(replication_message) = physical_stream.next().await { match replication_message? { ReplicationMessage::XLogData(xlog_data) => { - // Pass the WAL data to the decoder, and see if we can decode // more records as a result. let data = xlog_data.data(); let startlsn = xlog_data.wal_start(); let endlsn = startlsn + data.len() as u64; - trace!("received XLogData between {:X}/{:X} and {:X}/{:X}", - (startlsn >> 32), (startlsn & 0xffffffff), - (endlsn >> 32), (endlsn & 0xffffffff)); + trace!( + "received XLogData between {:X}/{:X} and {:X}/{:X}", + (startlsn >> 32), + (startlsn & 0xffffffff), + (endlsn >> 32), + (endlsn & 0xffffffff) + ); waldecoder.feed_bytes(data); loop { if let Some((lsn, recdata)) = waldecoder.poll_decode() { - - let decoded = crate::waldecoder::decode_wal_record(startlsn, recdata.clone()); + let decoded = + crate::waldecoder::decode_wal_record(startlsn, recdata.clone()); // Put the WAL record to the page cache. We make a separate copy of // it for every block it modifies. (The actual WAL record is kept in @@ -133,13 +145,13 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum as u8, - blknum: blk.blkno + blknum: blk.blkno, }; let rec = page_cache::WALRecord { lsn: lsn, will_init: blk.will_init || blk.apply_image, - rec: recdata.clone() + rec: recdata.clone(), }; pcache.put_wal_record(tag, rec); @@ -148,7 +160,6 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - // Now that this record has been handled, let the page cache know that // it is up-to-date to this LSN pcache.advance_last_valid_lsn(lsn); - } else { break; } @@ -163,7 +174,11 @@ async fn walreceiver_main(conf: PageServerConf, wal_producer_connstr: &String) - pcache.advance_last_valid_lsn(endlsn); if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {:X}/{:X}", (endlsn >> 32), (endlsn & 0xffffffff)); + info!( + "caught up at LSN {:X}/{:X}", + (endlsn >> 32), + (endlsn & 0xffffffff) + ); caught_up = true; } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5f11c25a9a..77f69e8fcb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -14,48 +14,49 @@ // TODO: Even though the postgres code runs in a separate process, // it's not a secure sandbox. // -use tokio::runtime::Runtime; -use tokio::process::{Command, Child, ChildStdin, ChildStdout}; -use std::{path::PathBuf, process::Stdio}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::io::AsyncBufReadExt; -use tokio::time::timeout; -use std::io::Error; -use std::cell::RefCell; -use std::assert; -use std::sync::{Arc}; -use std::fs; use log::*; -use std::time::Instant; +use std::assert; +use std::cell::RefCell; +use std::fs; +use std::io::Error; +use std::sync::Arc; use std::time::Duration; +use std::time::Instant; +use std::{path::PathBuf, process::Stdio}; +use tokio::io::AsyncBufReadExt; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::process::{Child, ChildStdin, ChildStdout, Command}; +use tokio::runtime::Runtime; +use tokio::time::timeout; -use bytes::{Bytes, BytesMut, BufMut}; +use bytes::{BufMut, Bytes, BytesMut}; -use crate::{PageServerConf, page_cache::BufferTag}; +use crate::page_cache; use crate::page_cache::CacheEntry; use crate::page_cache::WALRecord; -use crate::page_cache; +use crate::{page_cache::BufferTag, PageServerConf}; static TIMEOUT: Duration = Duration::from_secs(20); // // Main entry point for the WAL applicator thread. // -pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) -{ +pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) { info!("WAL redo thread started {}", sys_id); // We block on waiting for requests on the walredo request channel, but // use async I/O to communicate with the child process. Initialize the // runtime for the async part. - let runtime = tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap(); + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); let pcache = page_cache::get_pagecahe(conf.clone(), sys_id); // Loop forever, handling requests as they come. let walredo_channel_receiver = &pcache.walredo_receiver; loop { - let mut process: WalRedoProcess; let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id)); @@ -87,8 +88,12 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) } } -fn handle_apply_request(pcache: &page_cache::PageCache, process: &WalRedoProcess, runtime: &Runtime, entry_rc: Arc) -> Result<(), Error> -{ +fn handle_apply_request( + pcache: &page_cache::PageCache, + process: &WalRedoProcess, + runtime: &Runtime, + entry_rc: Arc, +) -> Result<(), Error> { let tag = entry_rc.key.tag; let lsn = entry_rc.key.lsn; let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref()); @@ -104,16 +109,22 @@ fn handle_apply_request(pcache: &page_cache::PageCache, process: &WalRedoProcess let result; - debug!("applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}", - nrecords, duration.as_millis(), - lsn >> 32, lsn & 0xffff_ffff); + debug!( + "applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}", + nrecords, + duration.as_millis(), + lsn >> 32, + lsn & 0xffff_ffff + ); if let Err(e) = apply_result { error!("could not apply WAL records: {}", e); result = Err(e); } else { entry.page_image = Some(apply_result.unwrap()); - pcache.num_page_images.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + pcache + .num_page_images + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); result = Ok(()); } @@ -130,7 +141,6 @@ struct WalRedoProcess { } impl WalRedoProcess { - // // Start postgres binary in special WAL redo mode. // @@ -138,22 +148,23 @@ impl WalRedoProcess { // and PG_LIB_DIR so that WalRedo would start right postgres. We may later // switch to setting same things in pageserver config file. fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result { - // Create empty data directory for wal-redo postgres deleting old one. fs::remove_dir_all(datadir.to_str().unwrap()).ok(); - let initdb = runtime.block_on(Command::new("initdb") - .args(&["-D", datadir.to_str().unwrap()]) - .arg("-N") - .status() - ).expect("failed to execute initdb"); + let initdb = runtime + .block_on( + Command::new("initdb") + .args(&["-D", datadir.to_str().unwrap()]) + .arg("-N") + .status(), + ) + .expect("failed to execute initdb"); if !initdb.success() { panic!("initdb failed"); } // Start postgres itself - let mut child = - Command::new("postgres") + let mut child = Command::new("postgres") .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) @@ -162,7 +173,10 @@ impl WalRedoProcess { .spawn() .expect("postgres --wal-redo command failed to start"); - info!("launched WAL redo postgres process on {}", datadir.to_str().unwrap()); + info!( + "launched WAL redo postgres process on {}", + datadir.to_str().unwrap() + ); let stdin = child.stdin.take().expect("failed to open child's stdin"); let stderr = child.stderr.take().expect("failed to open child's stderr"); @@ -200,12 +214,16 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // - fn apply_wal_records(&self, runtime: &Runtime, tag: BufferTag, base_img: Option, records: Vec) -> Result - { + fn apply_wal_records( + &self, + runtime: &Runtime, + tag: BufferTag, + base_img: Option, + records: Vec, + ) -> Result { let mut stdin = self.stdin.borrow_mut(); let mut stdout = self.stdout.borrow_mut(); return runtime.block_on(async { - // // This async block sends all the commands to the process. // @@ -216,16 +234,26 @@ impl WalRedoProcess { let f_stdin = async { // Send base image, if any. (If the record initializes the page, previous page // version is not needed.) - timeout(TIMEOUT, stdin.write_all(&build_begin_redo_for_block_msg(tag))).await??; + timeout( + TIMEOUT, + stdin.write_all(&build_begin_redo_for_block_msg(tag)), + ) + .await??; if base_img.is_some() { - timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, base_img.unwrap()))).await??; + timeout( + TIMEOUT, + stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())), + ) + .await??; } // Send WAL records. for rec in records.iter() { let r = rec.clone(); - stdin.write_all(&build_apply_record_msg(r.lsn, r.rec)).await?; + stdin + .write_all(&build_apply_record_msg(r.lsn, r.rec)) + .await?; //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}", // r.lsn >> 32, r.lsn & 0xffff_ffff); @@ -246,7 +274,7 @@ impl WalRedoProcess { timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??; //debug!("got response for {}", tag.blknum); - Ok::<[u8;8192], Error>(buf) + Ok::<[u8; 8192], Error>(buf) }; // Kill the process. This closes its stdin, which should signal the process @@ -262,9 +290,8 @@ impl WalRedoProcess { } } -fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes -{ - let len = 4 + 5*4; +fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes { + let len = 4 + 5 * 4; let mut buf = BytesMut::with_capacity(1 + len); buf.put_u8('B' as u8); @@ -280,11 +307,10 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes return buf.freeze(); } -fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes -{ +fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes { assert!(base_img.len() == 8192); - let len = 4 + 5*4 + base_img.len(); + let len = 4 + 5 * 4 + base_img.len(); let mut buf = BytesMut::with_capacity(1 + len); buf.put_u8('P' as u8); @@ -302,7 +328,6 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes } fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes { - let len = 4 + 8 + rec.len(); let mut buf = BytesMut::with_capacity(1 + len); @@ -317,7 +342,7 @@ fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes { } fn build_get_page_msg(tag: BufferTag) -> Bytes { - let len = 4 + 5*4; + let len = 4 + 5 * 4; let mut buf = BytesMut::with_capacity(1 + len); buf.put_u8('G' as u8); diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index 37ce11f628..c4ba59ced9 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -1,20 +1,20 @@ // // Main entry point for the wal_acceptor executable // +use daemonize::Daemonize; use log::*; -use std::{fs::File, fs::OpenOptions}; use std::io; +use std::path::Path; use std::path::PathBuf; use std::thread; -use daemonize::Daemonize; -use std::path::Path; +use std::{fs::File, fs::OpenOptions}; use clap::{App, Arg}; use slog; -use slog_stdlog; -use slog_scope; use slog::Drain; +use slog_scope; +use slog_stdlog; use walkeeper::wal_service; use walkeeper::WalAcceptorConf; @@ -22,33 +22,41 @@ use walkeeper::WalAcceptorConf; fn main() -> Result<(), io::Error> { let arg_matches = App::new("Zenith wal_acceptor") .about("Store WAL stream to local file system and push it to WAL receivers") - .arg(Arg::with_name("datadir") - .short("D") - .long("dir") - .takes_value(true) - .help("Path to the page server data directory")) - .arg(Arg::with_name("listen") - .short("l") - .long("listen") - .takes_value(true) - .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)")) - .arg(Arg::with_name("daemonize") - .short("d") - .long("daemonize") - .takes_value(false) - .help("Run in the background")) - .arg(Arg::with_name("no-sync") - .short("n") - .long("no-sync") - .takes_value(false) - .help("Do not wait for changes to be written safely to disk")) + .arg( + Arg::with_name("datadir") + .short("D") + .long("dir") + .takes_value(true) + .help("Path to the page server data directory"), + ) + .arg( + Arg::with_name("listen") + .short("l") + .long("listen") + .takes_value(true) + .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"), + ) + .arg( + Arg::with_name("daemonize") + .short("d") + .long("daemonize") + .takes_value(false) + .help("Run in the background"), + ) + .arg( + Arg::with_name("no-sync") + .short("n") + .long("no-sync") + .takes_value(false) + .help("Do not wait for changes to be written safely to disk"), + ) .get_matches(); - let mut conf = WalAcceptorConf { + let mut conf = WalAcceptorConf { data_dir: PathBuf::from("./"), daemonize: false, no_sync: false, - listen_addr: "127.0.0.1:5454".parse().unwrap() + listen_addr: "127.0.0.1:5454".parse().unwrap(), }; if let Some(dir) = arg_matches.value_of("datadir") { @@ -67,7 +75,7 @@ fn main() -> Result<(), io::Error> { conf.listen_addr = addr.parse().unwrap(); } - start_wal_acceptor(conf) + start_wal_acceptor(conf) } fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { @@ -82,8 +90,16 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { // There should'n be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fpritf's or backtraces. - let stdout = OpenOptions::new().create(true).append(true).open(conf.data_dir.join("wal_acceptor.log")).unwrap(); - let stderr = OpenOptions::new().create(true).append(true).open(conf.data_dir.join("wal_acceptor.log")).unwrap(); + let stdout = OpenOptions::new() + .create(true) + .append(true) + .open(conf.data_dir.join("wal_acceptor.log")) + .unwrap(); + let stderr = OpenOptions::new() + .create(true) + .append(true) + .open(conf.data_dir.join("wal_acceptor.log")) + .unwrap(); let daemonize = Daemonize::new() .pid_file(conf.data_dir.join("wal_acceptor.pid")) @@ -97,8 +113,8 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { } } - let mut threads = Vec::new(); - let wal_acceptor_thread = thread::Builder::new() + let mut threads = Vec::new(); + let wal_acceptor_thread = thread::Builder::new() .name("WAL acceptor thread".into()) .spawn(|| { // thread code @@ -114,7 +130,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { } fn init_logging(conf: &WalAcceptorConf) -> slog_scope::GlobalLoggerGuard { - if conf.daemonize { + if conf.daemonize { let log = conf.data_dir.join("wal_acceptor.log"); let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file")); let decorator = slog_term::PlainSyncDecorator::new(log_file); @@ -122,11 +138,11 @@ fn init_logging(conf: &WalAcceptorConf) -> slog_scope::GlobalLoggerGuard { let drain = std::sync::Mutex::new(drain).fuse(); let logger = slog::Logger::root(drain, slog::o!()); slog_scope::set_global_logger(logger) - } else { - let decorator = slog_term::TermDecorator::new().build(); - let drain = slog_term::FullFormat::new(decorator).build().fuse(); - let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse(); - let logger = slog::Logger::root(drain, slog::o!()); - return slog_scope::set_global_logger(logger); - } + } else { + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse(); + let logger = slog::Logger::root(drain, slog::o!()); + return slog_scope::set_global_logger(logger); + } } diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 9dd1bc35fe..28fe52ae01 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,9 +2,9 @@ use std::net::SocketAddr; use std::path::PathBuf; +mod pq_protocol; pub mod wal_service; pub mod xlog_utils; -mod pq_protocol; #[allow(dead_code)] #[derive(Debug, Clone)] diff --git a/walkeeper/src/pq_protocol.rs b/walkeeper/src/pq_protocol.rs index ce7b282eb6..1e2f7902ba 100644 --- a/walkeeper/src/pq_protocol.rs +++ b/walkeeper/src/pq_protocol.rs @@ -1,6 +1,6 @@ -use std::io; -use bytes::{Buf, Bytes, BytesMut, BufMut}; use byteorder::{BigEndian, ByteOrder}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use std::io; pub type Oid = u32; pub type Result = std::result::Result; @@ -10,14 +10,14 @@ pub enum FeMessage { StartupMessage(FeStartupMessage), Query(FeQueryMessage), Terminate, - CopyData(FeCopyData) + CopyData(FeCopyData), } #[derive(Debug)] pub struct RowDescriptor { - pub typoid : Oid, - pub typlen : i16, - pub name: &'static [u8], + pub typoid: Oid, + pub typlen: i16, + pub name: &'static [u8], } #[derive(Debug)] @@ -27,8 +27,8 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor]), DataRow(&'a [Option<&'a [u8]>]), CommandComplete(&'a [u8]), - Negotiate, - Copy + Negotiate, + Copy, } #[derive(Debug)] @@ -42,15 +42,15 @@ pub enum StartupRequestCode { Cancel, NegotiateSsl, NegotiateGss, - Normal + Normal, } impl FeStartupMessage { pub fn parse(buf: &mut BytesMut) -> Result> { const MAX_STARTUP_PACKET_LENGTH: usize = 10000; const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678; - const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679; - const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680; + const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679; + const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680; if buf.len() < 4 { return Ok(None); @@ -73,26 +73,29 @@ impl FeStartupMessage { CANCEL_REQUEST_CODE => StartupRequestCode::Cancel, NEGOTIATE_SSL_CODE => StartupRequestCode::NegotiateSsl, NEGOTIATE_GSS_CODE => StartupRequestCode::NegotiateGss, - _ => StartupRequestCode::Normal + _ => StartupRequestCode::Normal, }; buf.advance(len as usize); - Ok(Some(FeMessage::StartupMessage(FeStartupMessage{version, kind}))) + Ok(Some(FeMessage::StartupMessage(FeStartupMessage { + version, + kind, + }))) } } #[derive(Debug)] pub struct FeQueryMessage { - pub body: Bytes + pub body: Bytes, } #[derive(Debug)] pub struct FeCopyData { - pub body: Bytes + pub body: Bytes, } impl<'a> BeMessage<'a> { - pub fn write(buf : &mut BytesMut, message: &BeMessage) { + pub fn write(buf: &mut BytesMut, message: &BeMessage) { match message { BeMessage::AuthenticationOk => { buf.put_u8(b'R'); @@ -106,47 +109,51 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'I'); } - BeMessage::Negotiate => { - buf.put_u8(b'N'); - } + BeMessage::Negotiate => { + buf.put_u8(b'N'); + } BeMessage::Copy => { - buf.put_u8(b'W'); - buf.put_i32(7); - buf.put_u8(b'\0'); - buf.put_u8(b'\0'); - buf.put_u8(b'\0'); + buf.put_u8(b'W'); + buf.put_i32(7); + buf.put_u8(b'\0'); + buf.put_u8(b'\0'); + buf.put_u8(b'\0'); } BeMessage::RowDescription(rows) => { buf.put_u8(b'T'); - let total_len:u32 = rows.iter().fold(0, |acc,row| acc + row.name.len() as u32 + 3*(4 + 2)); - buf.put_u32(4 + 2 + total_len); - for row in rows.iter() { - buf.put_i16(row.name.len() as i16); - buf.put_slice(row.name); - buf.put_i32(0); /* table oid */ - buf.put_i16(0); /* attnum */ - buf.put_u32(row.typoid); - buf.put_i16(row.typlen); - buf.put_i32(-1); /* typmod */ - buf.put_i16(0); /* format code */ - } + let total_len: u32 = rows + .iter() + .fold(0, |acc, row| acc + row.name.len() as u32 + 3 * (4 + 2)); + buf.put_u32(4 + 2 + total_len); + for row in rows.iter() { + buf.put_i16(row.name.len() as i16); + buf.put_slice(row.name); + buf.put_i32(0); /* table oid */ + buf.put_i16(0); /* attnum */ + buf.put_u32(row.typoid); + buf.put_i16(row.typlen); + buf.put_i32(-1); /* typmod */ + buf.put_i16(0); /* format code */ + } } BeMessage::DataRow(vals) => { buf.put_u8(b'D'); - let total_len:usize = vals.iter().fold(0, |acc, row| acc + 4 + row.map_or(0, |s| s.len())); + let total_len: usize = vals + .iter() + .fold(0, |acc, row| acc + 4 + row.map_or(0, |s| s.len())); buf.put_u32(4 + 2 + total_len as u32); buf.put_u16(vals.len() as u16); - for val_opt in vals.iter() { - if let Some(val) = val_opt { - buf.put_u32(val.len() as u32); - buf.put_slice(val); - } else { - buf.put_i32(-1); - } - } + for val_opt in vals.iter() { + if let Some(val) = val_opt { + buf.put_u32(val.len() as u32); + buf.put_slice(val); + } else { + buf.put_i32(-1); + } + } } BeMessage::CommandComplete(cmd) => { @@ -155,7 +162,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(cmd); } } - } + } } impl FeMessage { @@ -187,15 +194,17 @@ impl FeMessage { body.advance(5); match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage{body:body.freeze()}))), - b'd' => Ok(Some(FeMessage::CopyData(FeCopyData{body:body.freeze()}))), + b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { + body: body.freeze(), + }))), + b'd' => Ok(Some(FeMessage::CopyData(FeCopyData { + body: body.freeze(), + }))), b'X' => Ok(Some(FeMessage::Terminate)), - tag => { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("unknown message tag: {},'{:?}'", tag, buf), - )) - } + tag => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown message tag: {},'{:?}'", tag, buf), + )), } } } diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index cea05f1e35..0f14cd2ae8 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -5,28 +5,28 @@ extern crate fs2; -use tokio::net::{TcpListener, TcpStream}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::runtime; -use tokio::task; use byteorder::{BigEndian, ByteOrder}; -use bytes::{Buf, Bytes, BytesMut, BufMut}; -use tokio::sync::Notify; -use std::sync::Mutex; -use std::io; -use std::fs; -use std::str; +use bytes::{Buf, BufMut, Bytes, BytesMut}; use fs2::FileExt; -use std::fs::File; -use std::io::SeekFrom; -use std::mem; +use lazy_static::lazy_static; use log::*; use regex::Regex; -use std::fs::OpenOptions; -use std::io::prelude::*; -use std::cmp::min; use std::cmp::max; -use lazy_static::lazy_static; +use std::cmp::min; +use std::fs; +use std::fs::File; +use std::fs::OpenOptions; +use std::io; +use std::io::prelude::*; +use std::io::SeekFrom; +use std::mem; +use std::str; +use std::sync::Mutex; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::runtime; +use tokio::sync::Notify; +use tokio::task; use crate::pq_protocol::*; use crate::xlog_utils::*; @@ -35,37 +35,37 @@ use crate::WalAcceptorConf; type FullTransactionId = u64; const SK_MAGIC: u32 = 0xCafeCeefu32; -const SK_FORMAT_VERSION : u32 = 1; -const SK_PROTOCOL_VERSION : u32 = 1; -const UNKNOWN_SERVER_VERSION : u32 = 0; -const END_REPLICATION_MARKER : u64 = u64::MAX; -const MAX_SEND_SIZE : usize = XLOG_BLCKSZ * 16; -const XLOG_HDR_SIZE : usize = 1+8*3; /* 'w' + startPos + walEnd + timestamp */ -const LIBPQ_HDR_SIZE : usize = 5; /* 1 byte with message type + 4 bytes length */ -const LIBPQ_MSG_SIZE_OFFS : usize = 1; -const CONTROL_FILE_NAME : &str = "safekeeper.control"; -const END_OF_STREAM : XLogRecPtr = 0; +const SK_FORMAT_VERSION: u32 = 1; +const SK_PROTOCOL_VERSION: u32 = 1; +const UNKNOWN_SERVER_VERSION: u32 = 0; +const END_REPLICATION_MARKER: u64 = u64::MAX; +const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; +const XLOG_HDR_SIZE: usize = 1 + 8 * 3; /* 'w' + startPos + walEnd + timestamp */ +const LIBPQ_HDR_SIZE: usize = 5; /* 1 byte with message type + 4 bytes length */ +const LIBPQ_MSG_SIZE_OFFS: usize = 1; +const CONTROL_FILE_NAME: &str = "safekeeper.control"; +const END_OF_STREAM: XLogRecPtr = 0; /* * Unique node identifier used by Paxos */ #[repr(C)] -#[derive(Debug,Clone,Copy,Ord,PartialOrd,PartialEq,Eq)] +#[derive(Debug, Clone, Copy, Ord, PartialOrd, PartialEq, Eq)] struct NodeId { - term : u64, - uuid : u128, + term: u64, + uuid: u128, } #[repr(C)] -#[derive(Debug,Clone,Copy)] +#[derive(Debug, Clone, Copy)] struct ServerInfo { - protocol_version : u32, /* proxy-safekeeper protocol version */ - pg_version : u32, /* Postgres server version */ - node_id : NodeId, - system_id: u64, /* Postgres system identifier */ - wal_end : XLogRecPtr, - timeline : TimeLineID, - wal_seg_size : u32, + protocol_version: u32, /* proxy-safekeeper protocol version */ + pg_version: u32, /* Postgres server version */ + node_id: NodeId, + system_id: u64, /* Postgres system identifier */ + wal_end: XLogRecPtr, + timeline: TimeLineID, + wal_seg_size: u32, } /* @@ -73,39 +73,36 @@ struct ServerInfo { */ #[repr(C)] #[derive(Debug)] -struct RequestVote -{ - node_id : NodeId, - vcl : XLogRecPtr, /* volume commit LSN */ - epoch : u64, /* new epoch when safekeeper reaches vcl */ +struct RequestVote { + node_id: NodeId, + vcl: XLogRecPtr, /* volume commit LSN */ + epoch: u64, /* new epoch when safekeeper reaches vcl */ } /* * Information of about storage node */ #[repr(C)] -#[derive(Debug,Clone,Copy)] -struct SafeKeeperInfo -{ - magic : u32, /* magic for verifying content the control file */ - format_version : u32, /* safekeeper format version */ - epoch : u64, /* safekeeper's epoch */ - server : ServerInfo, /* information about server */ - commit_lsn : XLogRecPtr, /* part of WAL acknowledged by quorum */ - flush_lsn : XLogRecPtr, /* locally flushed part of WAL */ - restart_lsn : XLogRecPtr, /* minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers */ +#[derive(Debug, Clone, Copy)] +struct SafeKeeperInfo { + magic: u32, /* magic for verifying content the control file */ + format_version: u32, /* safekeeper format version */ + epoch: u64, /* safekeeper's epoch */ + server: ServerInfo, /* information about server */ + commit_lsn: XLogRecPtr, /* part of WAL acknowledged by quorum */ + flush_lsn: XLogRecPtr, /* locally flushed part of WAL */ + restart_lsn: XLogRecPtr, /* minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers */ } /* * Hot standby feedback received from replica */ #[repr(C)] -#[derive(Debug,Copy,Clone)] -struct HotStandbyFeedback -{ - ts: TimestampTz, - xmin : FullTransactionId, - catalog_xmin : FullTransactionId, +#[derive(Debug, Copy, Clone)] +struct HotStandbyFeedback { + ts: TimestampTz, + xmin: FullTransactionId, + catalog_xmin: FullTransactionId, } /* @@ -113,13 +110,12 @@ struct HotStandbyFeedback */ #[repr(C)] #[derive(Debug)] -struct SafeKeeperRequest -{ - sender_id : NodeId, /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ - begin_lsn : XLogRecPtr, /* start position of message in WAL */ - end_lsn : XLogRecPtr, /* end position of message in WAL */ - restart_lsn : XLogRecPtr, /* restart LSN position (minimal LSN which may be needed by proxy to perform recovery) */ - commit_lsn : XLogRecPtr, /* LSN committed by quorum of safekeepers */ +struct SafeKeeperRequest { + sender_id: NodeId, /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ + begin_lsn: XLogRecPtr, /* start position of message in WAL */ + end_lsn: XLogRecPtr, /* end position of message in WAL */ + restart_lsn: XLogRecPtr, /* restart LSN position (minimal LSN which may be needed by proxy to perform recovery) */ + commit_lsn: XLogRecPtr, /* LSN committed by quorum of safekeepers */ } /* @@ -127,11 +123,10 @@ struct SafeKeeperRequest */ #[repr(C)] #[derive(Debug)] -struct SafeKeeperResponse -{ - epoch : u64, - flush_lsn : XLogRecPtr, - hs_feedback : HotStandbyFeedback, +struct SafeKeeperResponse { + epoch: u64, + flush_lsn: XLogRecPtr, + hs_feedback: HotStandbyFeedback, } /* @@ -139,532 +134,569 @@ struct SafeKeeperResponse */ #[derive(Debug)] struct SharedState { - commit_lsn : XLogRecPtr, /* quorum commit LSN */ - info : SafeKeeperInfo, /* information about this safekeeper */ - control_file : Option, /* opened file control file handle (needed to hold exlusive file lock */ - hs_feedback : HotStandbyFeedback /* combined hot standby feedback from all replicas */ + commit_lsn: XLogRecPtr, /* quorum commit LSN */ + info: SafeKeeperInfo, /* information about this safekeeper */ + control_file: Option, /* opened file control file handle (needed to hold exlusive file lock */ + hs_feedback: HotStandbyFeedback, /* combined hot standby feedback from all replicas */ } - /* * Static data */ #[derive(Debug)] -pub struct WalAcceptor -{ - mutex : Mutex, /* mutext for protecting shared state */ - cond : Notify, /* conditional variable used to notify wal senders */ +pub struct WalAcceptor { + mutex: Mutex, /* mutext for protecting shared state */ + cond: Notify, /* conditional variable used to notify wal senders */ } /* * Private data */ #[derive(Debug)] -struct Connection -{ - acceptor : &'static WalAcceptor, - stream : TcpStream, /* Postgres connection */ - inbuf : BytesMut, /* input buffer */ - outbuf : BytesMut, /* output buffer */ - init_done : bool, /* startup packet proceeded */ - conf : WalAcceptorConf, /* wal acceptor configuration */ - +struct Connection { + acceptor: &'static WalAcceptor, + stream: TcpStream, /* Postgres connection */ + inbuf: BytesMut, /* input buffer */ + outbuf: BytesMut, /* output buffer */ + init_done: bool, /* startup packet proceeded */ + conf: WalAcceptorConf, /* wal acceptor configuration */ } /* * Customer serializer API (TODO: use protobuf?) */ trait Serializer { - fn pack(&self, buf : &mut BytesMut); - fn unpack(buf : &mut BytesMut) -> Self; + fn pack(&self, buf: &mut BytesMut); + fn unpack(buf: &mut BytesMut) -> Self; } // // Implementations // - //Report and return IO error */ macro_rules! io_error { ($($arg:tt)*) => (error!($($arg)*); return Err(io::Error::new(io::ErrorKind::Other,format!($($arg)*)))) } - // Safe hex string parser returning proper result -fn parse_hex_str(s : &str) -> Result { - if let Ok(val) = u32::from_str_radix(s, 16) { - Ok(val as u64) - } else { - io_error!("Invalid hex number {}", s); - } +fn parse_hex_str(s: &str) -> Result { + if let Ok(val) = u32::from_str_radix(s, 16) { + Ok(val as u64) + } else { + io_error!("Invalid hex number {}", s); + } } - impl Serializer for NodeId { - fn pack(&self, buf : &mut BytesMut) { - buf.put_u128_le(self.uuid); - buf.put_u64(self.term); // use big endian to provide compatibility with memcmp - } + fn pack(&self, buf: &mut BytesMut) { + buf.put_u128_le(self.uuid); + buf.put_u64(self.term); // use big endian to provide compatibility with memcmp + } - fn unpack(buf : &mut BytesMut) -> NodeId { - NodeId { - uuid: buf.get_u128_le(), - term: buf.get_u64(), // use big endian to provide compatibility with memcmp - } - } + fn unpack(buf: &mut BytesMut) -> NodeId { + NodeId { + uuid: buf.get_u128_le(), + term: buf.get_u64(), // use big endian to provide compatibility with memcmp + } + } } impl Serializer for ServerInfo { - fn pack(&self, buf : &mut BytesMut) { - buf.put_u32_le(self.protocol_version); - buf.put_u32_le(self.pg_version); - self.node_id.pack(buf); - buf.put_u64_le(self.system_id); - buf.put_u64_le(self.wal_end); - buf.put_u32_le(self.timeline); - buf.put_u32_le(self.wal_seg_size); - } - fn unpack(buf : &mut BytesMut) -> ServerInfo { - ServerInfo { - protocol_version: buf.get_u32_le(), - pg_version: buf.get_u32_le(), - node_id: NodeId::unpack(buf), - system_id: buf.get_u64_le(), - wal_end: buf.get_u64_le(), - timeline: buf.get_u32_le(), - wal_seg_size: buf.get_u32_le(), - } - } + fn pack(&self, buf: &mut BytesMut) { + buf.put_u32_le(self.protocol_version); + buf.put_u32_le(self.pg_version); + self.node_id.pack(buf); + buf.put_u64_le(self.system_id); + buf.put_u64_le(self.wal_end); + buf.put_u32_le(self.timeline); + buf.put_u32_le(self.wal_seg_size); + } + fn unpack(buf: &mut BytesMut) -> ServerInfo { + ServerInfo { + protocol_version: buf.get_u32_le(), + pg_version: buf.get_u32_le(), + node_id: NodeId::unpack(buf), + system_id: buf.get_u64_le(), + wal_end: buf.get_u64_le(), + timeline: buf.get_u32_le(), + wal_seg_size: buf.get_u32_le(), + } + } } impl Serializer for RequestVote { - fn pack(&self, buf : &mut BytesMut) { - self.node_id.pack(buf); - buf.put_u64_le(self.vcl); - buf.put_u64_le(self.epoch); - } + fn pack(&self, buf: &mut BytesMut) { + self.node_id.pack(buf); + buf.put_u64_le(self.vcl); + buf.put_u64_le(self.epoch); + } - fn unpack(buf : &mut BytesMut) -> RequestVote { - RequestVote { - node_id: NodeId::unpack(buf), - vcl: buf.get_u64_le(), - epoch: buf.get_u64_le(), - } - } + fn unpack(buf: &mut BytesMut) -> RequestVote { + RequestVote { + node_id: NodeId::unpack(buf), + vcl: buf.get_u64_le(), + epoch: buf.get_u64_le(), + } + } } impl Serializer for SafeKeeperInfo { - fn pack(&self, buf : &mut BytesMut) { - buf.put_u32_le(self.magic); - buf.put_u32_le(self.format_version); - buf.put_u64_le(self.epoch); - self.server.pack(buf); - buf.put_u64_le(self.commit_lsn); - buf.put_u64_le(self.flush_lsn); - buf.put_u64_le(self.restart_lsn); - } - fn unpack(buf : &mut BytesMut) -> SafeKeeperInfo { - SafeKeeperInfo { - magic: buf.get_u32_le(), - format_version: buf.get_u32_le(), - epoch: buf.get_u64_le(), - server: ServerInfo::unpack(buf), - commit_lsn: buf.get_u64_le(), - flush_lsn: buf.get_u64_le(), - restart_lsn: buf.get_u64_le(), - } - } + fn pack(&self, buf: &mut BytesMut) { + buf.put_u32_le(self.magic); + buf.put_u32_le(self.format_version); + buf.put_u64_le(self.epoch); + self.server.pack(buf); + buf.put_u64_le(self.commit_lsn); + buf.put_u64_le(self.flush_lsn); + buf.put_u64_le(self.restart_lsn); + } + fn unpack(buf: &mut BytesMut) -> SafeKeeperInfo { + SafeKeeperInfo { + magic: buf.get_u32_le(), + format_version: buf.get_u32_le(), + epoch: buf.get_u64_le(), + server: ServerInfo::unpack(buf), + commit_lsn: buf.get_u64_le(), + flush_lsn: buf.get_u64_le(), + restart_lsn: buf.get_u64_le(), + } + } } impl SafeKeeperInfo { - fn new() -> SafeKeeperInfo { - SafeKeeperInfo { - magic : SK_MAGIC, - format_version : SK_FORMAT_VERSION, - epoch : 0, - server : ServerInfo { - protocol_version : SK_PROTOCOL_VERSION, /* proxy-safekeeper protocol version */ - pg_version : UNKNOWN_SERVER_VERSION, /* Postgres server version */ - node_id : NodeId { term: 0, uuid: 0}, - system_id: 0, /* Postgres system identifier */ - wal_end : 0, - timeline : 0, - wal_seg_size : 0}, - commit_lsn : 0, /* part of WAL acknowledged by quorum */ - flush_lsn : 0, /* locally flushed part of WAL */ - restart_lsn : 0, /* minimal LSN which may be needed for recovery of some safekeeper */ - } - } + fn new() -> SafeKeeperInfo { + SafeKeeperInfo { + magic: SK_MAGIC, + format_version: SK_FORMAT_VERSION, + epoch: 0, + server: ServerInfo { + protocol_version: SK_PROTOCOL_VERSION, /* proxy-safekeeper protocol version */ + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + node_id: NodeId { term: 0, uuid: 0 }, + system_id: 0, /* Postgres system identifier */ + wal_end: 0, + timeline: 0, + wal_seg_size: 0, + }, + commit_lsn: 0, /* part of WAL acknowledged by quorum */ + flush_lsn: 0, /* locally flushed part of WAL */ + restart_lsn: 0, /* minimal LSN which may be needed for recovery of some safekeeper */ + } + } } impl Serializer for HotStandbyFeedback { - fn pack(&self, buf : &mut BytesMut) { - buf.put_u64_le(self.ts); - buf.put_u64_le(self.xmin); - buf.put_u64_le(self.catalog_xmin); - } - fn unpack(buf : &mut BytesMut) -> HotStandbyFeedback { - HotStandbyFeedback { - ts: buf.get_u64_le(), - xmin: buf.get_u64_le(), - catalog_xmin: buf.get_u64_le(), - } - } + fn pack(&self, buf: &mut BytesMut) { + buf.put_u64_le(self.ts); + buf.put_u64_le(self.xmin); + buf.put_u64_le(self.catalog_xmin); + } + fn unpack(buf: &mut BytesMut) -> HotStandbyFeedback { + HotStandbyFeedback { + ts: buf.get_u64_le(), + xmin: buf.get_u64_le(), + catalog_xmin: buf.get_u64_le(), + } + } } impl HotStandbyFeedback { - fn parse(body : &Bytes) -> HotStandbyFeedback { - HotStandbyFeedback { - ts: BigEndian::read_u64(&body[0..8]), - xmin: BigEndian::read_u64(&body[8..16]), - catalog_xmin: BigEndian::read_u64(&body[16..24]) - } - } + fn parse(body: &Bytes) -> HotStandbyFeedback { + HotStandbyFeedback { + ts: BigEndian::read_u64(&body[0..8]), + xmin: BigEndian::read_u64(&body[8..16]), + catalog_xmin: BigEndian::read_u64(&body[16..24]), + } + } } impl Serializer for SafeKeeperRequest { - fn pack(&self, buf : &mut BytesMut) { - self.sender_id.pack(buf); - buf.put_u64_le(self.begin_lsn); - buf.put_u64_le(self.end_lsn); - buf.put_u64_le(self.restart_lsn); - buf.put_u64_le(self.commit_lsn); - } - fn unpack(buf : &mut BytesMut) -> SafeKeeperRequest { - SafeKeeperRequest { - sender_id: NodeId::unpack(buf), - begin_lsn: buf.get_u64_le(), - end_lsn: buf.get_u64_le(), - restart_lsn: buf.get_u64_le(), - commit_lsn: buf.get_u64_le(), - } - } + fn pack(&self, buf: &mut BytesMut) { + self.sender_id.pack(buf); + buf.put_u64_le(self.begin_lsn); + buf.put_u64_le(self.end_lsn); + buf.put_u64_le(self.restart_lsn); + buf.put_u64_le(self.commit_lsn); + } + fn unpack(buf: &mut BytesMut) -> SafeKeeperRequest { + SafeKeeperRequest { + sender_id: NodeId::unpack(buf), + begin_lsn: buf.get_u64_le(), + end_lsn: buf.get_u64_le(), + restart_lsn: buf.get_u64_le(), + commit_lsn: buf.get_u64_le(), + } + } } impl Serializer for SafeKeeperResponse { - fn pack(&self, buf : &mut BytesMut) { - buf.put_u64_le(self.epoch); - buf.put_u64_le(self.flush_lsn); - self.hs_feedback.pack(buf); - } - fn unpack(buf : &mut BytesMut) -> SafeKeeperResponse { - SafeKeeperResponse { - epoch: buf.get_u64_le(), - flush_lsn: buf.get_u64_le(), - hs_feedback: HotStandbyFeedback::unpack(buf), - } - } + fn pack(&self, buf: &mut BytesMut) { + buf.put_u64_le(self.epoch); + buf.put_u64_le(self.flush_lsn); + self.hs_feedback.pack(buf); + } + fn unpack(buf: &mut BytesMut) -> SafeKeeperResponse { + SafeKeeperResponse { + epoch: buf.get_u64_le(), + flush_lsn: buf.get_u64_le(), + hs_feedback: HotStandbyFeedback::unpack(buf), + } + } } lazy_static! { - pub static ref SELF : WalAcceptor = WalAcceptor::new(); + pub static ref SELF: WalAcceptor = WalAcceptor::new(); } pub fn thread_main(conf: WalAcceptorConf) { - // Create a new thread pool // // FIXME: keep it single-threaded for now, make it easier to debug with gdb, // and we're not concerned with performance yet. //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap(); + let runtime = runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); info!("Starting wal acceptor on {}", conf.listen_addr); - SELF.load_control_file(&conf); + SELF.load_control_file(&conf); runtime.block_on(async { - let _unused = SELF.main_loop(&conf).await; + let _unused = SELF.main_loop(&conf).await; }); } -impl WalAcceptor -{ - pub fn new() -> WalAcceptor { - let shared_state = SharedState { - commit_lsn: 0, - info: SafeKeeperInfo::new(), - control_file : None, - hs_feedback : HotStandbyFeedback { ts: 0, xmin: u64::MAX, catalog_xmin: u64::MAX } - }; - WalAcceptor { - mutex : Mutex::new(shared_state), - cond : Notify::new() - } - } +impl WalAcceptor { + pub fn new() -> WalAcceptor { + let shared_state = SharedState { + commit_lsn: 0, + info: SafeKeeperInfo::new(), + control_file: None, + hs_feedback: HotStandbyFeedback { + ts: 0, + xmin: u64::MAX, + catalog_xmin: u64::MAX, + }, + }; + WalAcceptor { + mutex: Mutex::new(shared_state), + cond: Notify::new(), + } + } - // Notify caught-up WAL senders about new WAL data received - fn notify_wal_senders(&self, commit_lsn : XLogRecPtr) { - let mut shared_state = self.mutex.lock().unwrap(); - if shared_state.commit_lsn < commit_lsn { - shared_state.commit_lsn = commit_lsn; - self.cond.notify_waiters(); - } - } + // Notify caught-up WAL senders about new WAL data received + fn notify_wal_senders(&self, commit_lsn: XLogRecPtr) { + let mut shared_state = self.mutex.lock().unwrap(); + if shared_state.commit_lsn < commit_lsn { + shared_state.commit_lsn = commit_lsn; + self.cond.notify_waiters(); + } + } - fn _stop_wal_senders(&self) { - self.notify_wal_senders(END_REPLICATION_MARKER); - } + fn _stop_wal_senders(&self) { + self.notify_wal_senders(END_REPLICATION_MARKER); + } - fn get_info(&self) -> SafeKeeperInfo { - return self.mutex.lock().unwrap().info; - } + fn get_info(&self) -> SafeKeeperInfo { + return self.mutex.lock().unwrap().info; + } - fn set_info(&self, info : &SafeKeeperInfo) { - self.mutex.lock().unwrap().info = *info; - } + fn set_info(&self, info: &SafeKeeperInfo) { + self.mutex.lock().unwrap().info = *info; + } - // Accumulate hot standby feedbacks from replicas - fn add_hs_feedback(&self, feedback : HotStandbyFeedback) { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.hs_feedback.xmin = min(shared_state.hs_feedback.xmin, feedback.xmin); - shared_state.hs_feedback.catalog_xmin = min(shared_state.hs_feedback.catalog_xmin, feedback.catalog_xmin); - shared_state.hs_feedback.ts = max(shared_state.hs_feedback.ts, feedback.ts); - } + // Accumulate hot standby feedbacks from replicas + fn add_hs_feedback(&self, feedback: HotStandbyFeedback) { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.hs_feedback.xmin = min(shared_state.hs_feedback.xmin, feedback.xmin); + shared_state.hs_feedback.catalog_xmin = + min(shared_state.hs_feedback.catalog_xmin, feedback.catalog_xmin); + shared_state.hs_feedback.ts = max(shared_state.hs_feedback.ts, feedback.ts); + } - fn get_hs_feedback(&self) -> HotStandbyFeedback { - let shared_state = self.mutex.lock().unwrap(); - return shared_state.hs_feedback; - } + fn get_hs_feedback(&self) -> HotStandbyFeedback { + let shared_state = self.mutex.lock().unwrap(); + return shared_state.hs_feedback; + } - // Load and lock control file (prevent running more than one instane of safekeeper */ - fn load_control_file(&self, conf: &WalAcceptorConf) { - let control_file_path = conf.data_dir.join(CONTROL_FILE_NAME); - match OpenOptions::new().read(true).write(true).create(true).open(&control_file_path) { - Ok(file) => { - // Lock file to prevent two or more active wal_acceptors - match file.try_lock_exclusive() { - Ok(()) => {}, - Err(e) => { - panic!("Control file {:?} is locked by some other process: {}", - &control_file_path, e); - } - } - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.control_file = Some(file); + // Load and lock control file (prevent running more than one instane of safekeeper */ + fn load_control_file(&self, conf: &WalAcceptorConf) { + let control_file_path = conf.data_dir.join(CONTROL_FILE_NAME); + match OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&control_file_path) + { + Ok(file) => { + // Lock file to prevent two or more active wal_acceptors + match file.try_lock_exclusive() { + Ok(()) => {} + Err(e) => { + panic!( + "Control file {:?} is locked by some other process: {}", + &control_file_path, e + ); + } + } + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.control_file = Some(file); - const SIZE : usize = mem::size_of::(); - let mut buf = [0u8; SIZE]; - if shared_state.control_file.as_mut().unwrap().read_exact(&mut buf).is_ok() { - let mut input = BytesMut::new(); - input.extend_from_slice(&buf); - let my_info = SafeKeeperInfo::unpack(&mut input); + const SIZE: usize = mem::size_of::(); + let mut buf = [0u8; SIZE]; + if shared_state + .control_file + .as_mut() + .unwrap() + .read_exact(&mut buf) + .is_ok() + { + let mut input = BytesMut::new(); + input.extend_from_slice(&buf); + let my_info = SafeKeeperInfo::unpack(&mut input); - if my_info.magic != SK_MAGIC { - panic!("Invalid control file magic: {}", my_info.magic); - } - if my_info.format_version != SK_FORMAT_VERSION { - panic!("Incompatible format version: {} vs. {}", - my_info.format_version, SK_FORMAT_VERSION); - } - shared_state.info = my_info; - } - }, - Err(e) => { - panic!("Failed to open control file {:?}: {}", &control_file_path, e); - } - } - } + if my_info.magic != SK_MAGIC { + panic!("Invalid control file magic: {}", my_info.magic); + } + if my_info.format_version != SK_FORMAT_VERSION { + panic!( + "Incompatible format version: {} vs. {}", + my_info.format_version, SK_FORMAT_VERSION + ); + } + shared_state.info = my_info; + } + } + Err(e) => { + panic!( + "Failed to open control file {:?}: {}", + &control_file_path, e + ); + } + } + } - fn save_control_file(&self, sync : bool) -> Result<()> { - let mut buf = BytesMut::new(); - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.info.pack(&mut buf); + fn save_control_file(&self, sync: bool) -> Result<()> { + let mut buf = BytesMut::new(); + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.info.pack(&mut buf); - let file = shared_state.control_file.as_mut().unwrap(); - file.seek(SeekFrom::Start(0))?; - file.write_all(&mut buf[..])?; - if sync { - file.sync_all()?; - } - Ok(()) - } + let file = shared_state.control_file.as_mut().unwrap(); + file.seek(SeekFrom::Start(0))?; + file.write_all(&mut buf[..])?; + if sync { + file.sync_all()?; + } + Ok(()) + } - async fn main_loop(&'static self, conf : &WalAcceptorConf) -> Result<()> { + async fn main_loop(&'static self, conf: &WalAcceptorConf) -> Result<()> { let listener = TcpListener::bind(conf.listen_addr.to_string().as_str()).await?; loop { match listener.accept().await { - Ok((socket, peer_addr)) => { - debug!("accepted connection from {}", peer_addr); - socket.set_nodelay(true)?; - let mut conn = Connection::new(self, socket, &conf); - task::spawn(async move { - if let Err(err) = conn.run().await { - error!("error: {}", err); - } - }); - }, - Err(e) => error!("Failed to accept connection: {}", e) - } - } - } + Ok((socket, peer_addr)) => { + debug!("accepted connection from {}", peer_addr); + socket.set_nodelay(true)?; + let mut conn = Connection::new(self, socket, &conf); + task::spawn(async move { + if let Err(err) = conn.run().await { + error!("error: {}", err); + } + }); + } + Err(e) => error!("Failed to accept connection: {}", e), + } + } + } } - -impl Connection { - pub fn new(acceptor : &'static WalAcceptor, socket: TcpStream, conf: &WalAcceptorConf) -> Connection { +impl Connection { + pub fn new( + acceptor: &'static WalAcceptor, + socket: TcpStream, + conf: &WalAcceptorConf, + ) -> Connection { Connection { - acceptor: acceptor, + acceptor: acceptor, stream: socket, inbuf: BytesMut::with_capacity(10 * 1024), outbuf: BytesMut::with_capacity(10 * 1024), - init_done: false, - conf: conf.clone() - } - } + init_done: false, + conf: conf.clone(), + } + } async fn run(&mut self) -> Result<()> { - self.inbuf.resize(4, 0u8); - self.stream.read_exact(&mut self.inbuf[0..4]).await?; - let startup_pkg_len = BigEndian::read_u32(&mut self.inbuf[0..4]); - if startup_pkg_len == 0 { - self.receive_wal().await?; // internal protocol between wal_proposer and wal_acceptor - } else { - self.send_wal().await?; // libpq replication protocol between wal_acceptor and replicas/pagers - } - Ok(()) - } + self.inbuf.resize(4, 0u8); + self.stream.read_exact(&mut self.inbuf[0..4]).await?; + let startup_pkg_len = BigEndian::read_u32(&mut self.inbuf[0..4]); + if startup_pkg_len == 0 { + self.receive_wal().await?; // internal protocol between wal_proposer and wal_acceptor + } else { + self.send_wal().await?; // libpq replication protocol between wal_acceptor and replicas/pagers + } + Ok(()) + } - async fn read_req(&mut self) -> Result { - let size = mem::size_of::(); - self.inbuf.resize(size, 0u8); - self.stream.read_exact(&mut self.inbuf[0..size]).await?; - Ok(T::unpack(&mut self.inbuf)) - } + async fn read_req(&mut self) -> Result { + let size = mem::size_of::(); + self.inbuf.resize(size, 0u8); + self.stream.read_exact(&mut self.inbuf[0..size]).await?; + Ok(T::unpack(&mut self.inbuf)) + } - // Receive WAL from wal_proposer - async fn receive_wal(&mut self) -> Result<()> { - let mut my_info = self.acceptor.get_info(); - // Receive information about server - let server_info = self.read_req::().await?; - info!("Start handshake with wal_proposer {}", self.stream.peer_addr()?); + // Receive WAL from wal_proposer + async fn receive_wal(&mut self) -> Result<()> { + let mut my_info = self.acceptor.get_info(); + // Receive information about server + let server_info = self.read_req::().await?; + info!( + "Start handshake with wal_proposer {}", + self.stream.peer_addr()? + ); - /* Check protocol compatibility */ - if server_info.protocol_version != SK_PROTOCOL_VERSION { - io_error!("Incompatible protocol version {} vs. {}", - server_info.protocol_version, SK_PROTOCOL_VERSION); - } - /* Postgres upgrade is not treated as fatal error */ - if server_info.pg_version != my_info.server.pg_version - && my_info.server.pg_version != UNKNOWN_SERVER_VERSION - { - info!("Server version doesn't match {} vs. {}", - server_info.pg_version, my_info.server.pg_version); - } - /* Update information about server, but preserve locally stored node_id */ - let node_id = my_info.server.node_id; - my_info.server = server_info; - my_info.server.node_id = node_id; + /* Check protocol compatibility */ + if server_info.protocol_version != SK_PROTOCOL_VERSION { + io_error!( + "Incompatible protocol version {} vs. {}", + server_info.protocol_version, + SK_PROTOCOL_VERSION + ); + } + /* Postgres upgrade is not treated as fatal error */ + if server_info.pg_version != my_info.server.pg_version + && my_info.server.pg_version != UNKNOWN_SERVER_VERSION + { + info!( + "Server version doesn't match {} vs. {}", + server_info.pg_version, my_info.server.pg_version + ); + } + /* Update information about server, but preserve locally stored node_id */ + let node_id = my_info.server.node_id; + my_info.server = server_info; + my_info.server.node_id = node_id; - /* Calculate WAL end based on local data */ - let (flush_lsn,timeline) = self.find_end_of_wal(true); - my_info.flush_lsn = flush_lsn; - my_info.server.timeline = timeline; + /* Calculate WAL end based on local data */ + let (flush_lsn, timeline) = self.find_end_of_wal(true); + my_info.flush_lsn = flush_lsn; + my_info.server.timeline = timeline; - /* Report my identifier to proxy */ - self.start_sending(); - my_info.pack(&mut self.outbuf); - self.send().await?; + /* Report my identifier to proxy */ + self.start_sending(); + my_info.pack(&mut self.outbuf); + self.send().await?; - /* Wait for vote request */ - let prop = self.read_req::().await?; - /* This is Paxos check which should ensure that only one master can perform commits */ - if prop.node_id < my_info.server.node_id { - /* Send my node-id to inform proxy that it's candidate was rejected */ - self.start_sending(); - my_info.server.node_id.pack(&mut self.outbuf); - self.send().await?; - io_error!("Reject connection attempt with term {} because my term is {}", - prop.node_id.term, my_info.server.node_id.term); - } - my_info.server.node_id = prop.node_id; - self.acceptor.set_info(&my_info); - /* Need to persist our vote first */ - self.acceptor.save_control_file(true)?; + /* Wait for vote request */ + let prop = self.read_req::().await?; + /* This is Paxos check which should ensure that only one master can perform commits */ + if prop.node_id < my_info.server.node_id { + /* Send my node-id to inform proxy that it's candidate was rejected */ + self.start_sending(); + my_info.server.node_id.pack(&mut self.outbuf); + self.send().await?; + io_error!( + "Reject connection attempt with term {} because my term is {}", + prop.node_id.term, + my_info.server.node_id.term + ); + } + my_info.server.node_id = prop.node_id; + self.acceptor.set_info(&my_info); + /* Need to persist our vote first */ + self.acceptor.save_control_file(true)?; - let mut flushed_restart_lsn : XLogRecPtr = 0; - let wal_seg_size = server_info.wal_seg_size as usize; + let mut flushed_restart_lsn: XLogRecPtr = 0; + let wal_seg_size = server_info.wal_seg_size as usize; - /* Acknowledge the proposed candidate by returning it to the proxy */ - self.start_sending(); - prop.node_id.pack(&mut self.outbuf); - self.send().await?; + /* Acknowledge the proposed candidate by returning it to the proxy */ + self.start_sending(); + prop.node_id.pack(&mut self.outbuf); + self.send().await?; - info!("Start streaming from server {} address {:?}", server_info.system_id, self.stream.peer_addr()?); + info!( + "Start streaming from server {} address {:?}", + server_info.system_id, + self.stream.peer_addr()? + ); - // Main loop - loop { - let mut sync_control_file = false; + // Main loop + loop { + let mut sync_control_file = false; - /* Receive message header */ - let req = self.read_req::().await?; - if req.sender_id != my_info.server.node_id { - io_error!("Sender NodeId is changed"); - } - if req.begin_lsn == END_OF_STREAM { - info!("Server stops streaming"); - break; - } - let start_pos = req.begin_lsn; - let end_pos = req.end_lsn; - let rec_size = (end_pos - start_pos) as usize; - assert!(rec_size <= MAX_SEND_SIZE); + /* Receive message header */ + let req = self.read_req::().await?; + if req.sender_id != my_info.server.node_id { + io_error!("Sender NodeId is changed"); + } + if req.begin_lsn == END_OF_STREAM { + info!("Server stops streaming"); + break; + } + let start_pos = req.begin_lsn; + let end_pos = req.end_lsn; + let rec_size = (end_pos - start_pos) as usize; + assert!(rec_size <= MAX_SEND_SIZE); - /* Receive message body */ - self.inbuf.resize(rec_size, 0u8); - self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?; + /* Receive message body */ + self.inbuf.resize(rec_size, 0u8); + self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?; - /* Save message in file */ - self.write_wal_file(start_pos, timeline, wal_seg_size, &self.inbuf[0..rec_size])?; + /* Save message in file */ + self.write_wal_file(start_pos, timeline, wal_seg_size, &self.inbuf[0..rec_size])?; - my_info.restart_lsn = req.restart_lsn; - my_info.commit_lsn = req.commit_lsn; + my_info.restart_lsn = req.restart_lsn; + my_info.commit_lsn = req.commit_lsn; - /* - * Epoch switch happen when written WAL record cross the boundary. - * The boundary is maximum of last WAL position at this node (FlushLSN) and global - * maximum (vcl) determined by safekeeper_proxy during handshake. - * Switching epoch means that node completes recovery and start writing in the WAL new data. - */ - if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn,prop.vcl) { - info!("Switch to new epoch {}", prop.epoch); - my_info.epoch = prop.epoch; /* bump epoch */ - sync_control_file = true; - } - if end_pos > my_info.flush_lsn { - my_info.flush_lsn = end_pos; - } - /* - * Update restart LSN in control file. - * To avoid negative impact on performance of extra fsync, do it only - * when restart_lsn delta exceeds WAL segment size. - */ - sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn; - self.acceptor.save_control_file(sync_control_file)?; + /* + * Epoch switch happen when written WAL record cross the boundary. + * The boundary is maximum of last WAL position at this node (FlushLSN) and global + * maximum (vcl) determined by safekeeper_proxy during handshake. + * Switching epoch means that node completes recovery and start writing in the WAL new data. + */ + if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn, prop.vcl) { + info!("Switch to new epoch {}", prop.epoch); + my_info.epoch = prop.epoch; /* bump epoch */ + sync_control_file = true; + } + if end_pos > my_info.flush_lsn { + my_info.flush_lsn = end_pos; + } + /* + * Update restart LSN in control file. + * To avoid negative impact on performance of extra fsync, do it only + * when restart_lsn delta exceeds WAL segment size. + */ + sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn; + self.acceptor.save_control_file(sync_control_file)?; - if sync_control_file { - flushed_restart_lsn = my_info.restart_lsn; - } + if sync_control_file { + flushed_restart_lsn = my_info.restart_lsn; + } - /* Report flush position */ - //info!("Confirm LSN: {:X}/{:>08X}", (end_pos>>32) as u32, end_pos as u32); - let resp = SafeKeeperResponse { - epoch: my_info.epoch, - flush_lsn: end_pos, - hs_feedback: self.acceptor.get_hs_feedback() - }; - self.start_sending(); - resp.pack(&mut self.outbuf); - self.send().await?; + /* Report flush position */ + //info!("Confirm LSN: {:X}/{:>08X}", (end_pos>>32) as u32, end_pos as u32); + let resp = SafeKeeperResponse { + epoch: my_info.epoch, + flush_lsn: end_pos, + hs_feedback: self.acceptor.get_hs_feedback(), + }; + self.start_sending(); + resp.pack(&mut self.outbuf); + self.send().await?; - /* - * Ping wal sender that new data is available. - * FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper. - */ - self.acceptor.notify_wal_senders(min(req.commit_lsn, end_pos)); - } - Ok(()) - } + /* + * Ping wal sender that new data is available. + * FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper. + */ + self.acceptor + .notify_wal_senders(min(req.commit_lsn, end_pos)); + } + Ok(()) + } // // Read full message or return None if connection is closed @@ -679,15 +711,18 @@ impl Connection { if self.inbuf.is_empty() { return Ok(None); } else { - return Err(io::Error::new(io::ErrorKind::Other,"connection reset by peer")); + return Err(io::Error::new( + io::ErrorKind::Other, + "connection reset by peer", + )); } } } } - // - // Parse libpq message - // + // + // Parse libpq message + // fn parse_message(&mut self) -> Result> { if !self.init_done { FeStartupMessage::parse(&mut self.inbuf) @@ -696,27 +731,27 @@ impl Connection { } } - // - // Reset output buffer to start accumulating data of new message - // - fn start_sending(&mut self) { - self.outbuf.clear(); - } + // + // Reset output buffer to start accumulating data of new message + // + fn start_sending(&mut self) { + self.outbuf.clear(); + } - // - // Send buffered messages - // - async fn send(&mut self) -> Result<()> { - self.stream.write_all(&self.outbuf).await - } + // + // Send buffered messages + // + async fn send(&mut self) -> Result<()> { + self.stream.write_all(&self.outbuf).await + } - // - // Send WAL to replica or WAL sender using standard libpq replication protocol - // - async fn send_wal(&mut self) -> Result<()> { - info!("WAL sender to {:?} is started", self.stream.peer_addr()?); + // + // Send WAL to replica or WAL sender using standard libpq replication protocol + // + async fn send_wal(&mut self) -> Result<()> { + info!("WAL sender to {:?} is started", self.stream.peer_addr()?); loop { - self.start_sending(); + self.start_sending(); match self.read_message().await? { Some(FeMessage::StartupMessage(m)) => { trace!("got message {:?}", m); @@ -724,23 +759,23 @@ impl Connection { match m.kind { StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => { BeMessage::write(&mut self.outbuf, &BeMessage::Negotiate); - info!("SSL requested"); + info!("SSL requested"); self.send().await?; } StartupRequestCode::Normal => { - BeMessage::write(&mut self.outbuf, &BeMessage::AuthenticationOk); - BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); - self.send().await?; + BeMessage::write(&mut self.outbuf, &BeMessage::AuthenticationOk); + BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); + self.send().await?; self.init_done = true; - }, - StartupRequestCode::Cancel => return Ok(()) + } + StartupRequestCode::Cancel => return Ok(()), } - }, + } Some(FeMessage::Query(m)) => { if !self.process_query(&m).await? { - break; - } - }, + break; + } + } Some(FeMessage::Terminate) => { break; } @@ -749,265 +784,304 @@ impl Connection { break; } _ => { - return Err(io::Error::new(io::ErrorKind::Other,"unexpected message")); + return Err(io::Error::new(io::ErrorKind::Other, "unexpected message")); } } } - info!("WAL sender to {:?} is finished", self.stream.peer_addr()?); + info!("WAL sender to {:?} is finished", self.stream.peer_addr()?); Ok(()) } - // - // Handle IDENTIFY_SYSTEM replication command - // - async fn handle_identify_system(&mut self) -> Result { - let (start_pos,timeline) = self.find_end_of_wal(false); - let lsn = format!("{:X}/{:>08X}", (start_pos>>32) as u32, start_pos as u32); - let tli = timeline.to_string(); - let sysid = self.acceptor.get_info().server.system_id.to_string(); - let lsn_bytes = lsn.as_bytes(); - let tli_bytes = tli.as_bytes(); - let sysid_bytes = sysid.as_bytes(); + // + // Handle IDENTIFY_SYSTEM replication command + // + async fn handle_identify_system(&mut self) -> Result { + let (start_pos, timeline) = self.find_end_of_wal(false); + let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32); + let tli = timeline.to_string(); + let sysid = self.acceptor.get_info().server.system_id.to_string(); + let lsn_bytes = lsn.as_bytes(); + let tli_bytes = tli.as_bytes(); + let sysid_bytes = sysid.as_bytes(); - BeMessage::write(&mut self.outbuf, - &BeMessage::RowDescription(&[RowDescriptor{name: b"systemid\0", - typoid: 25, - typlen: -1}, - RowDescriptor{name: b"timeline\0", - typoid: 23, - typlen: 4}, - RowDescriptor{name: b"xlogpos\0", - typoid: 25, - typlen: -1}, - RowDescriptor{name: b"dbname\0", - typoid: 25, - typlen: -1}])); - BeMessage::write(&mut self.outbuf, &BeMessage::DataRow(&[Some(lsn_bytes),Some(tli_bytes),Some(sysid_bytes),None])); - BeMessage::write(&mut self.outbuf, &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM")); + BeMessage::write( + &mut self.outbuf, + &BeMessage::RowDescription(&[ + RowDescriptor { + name: b"systemid\0", + typoid: 25, + typlen: -1, + }, + RowDescriptor { + name: b"timeline\0", + typoid: 23, + typlen: 4, + }, + RowDescriptor { + name: b"xlogpos\0", + typoid: 25, + typlen: -1, + }, + RowDescriptor { + name: b"dbname\0", + typoid: 25, + typlen: -1, + }, + ]), + ); + BeMessage::write( + &mut self.outbuf, + &BeMessage::DataRow(&[Some(lsn_bytes), Some(tli_bytes), Some(sysid_bytes), None]), + ); + BeMessage::write( + &mut self.outbuf, + &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"), + ); BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); - self.send().await?; - Ok(true) - } + self.send().await?; + Ok(true) + } - // - // Handle START_REPLICATION replication command - // - async fn handle_start_replication(&mut self, cmd: &Bytes) -> Result { - let re = Regex::new(r"([[:xdigit:]]*)/([[:xdigit:]]*)").unwrap(); - let mut caps = re.captures_iter(str::from_utf8(&cmd[..]).unwrap()); - let cap = caps.next().unwrap(); - let mut start_pos : XLogRecPtr = (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?; - let stop_pos : XLogRecPtr = if let Some(cap) = caps.next() { - (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])? - } else { - 0 - }; - let wal_seg_size = self.acceptor.get_info().server.wal_seg_size as usize; - if wal_seg_size == 0 { - io_error!("Can not start replication before connecting to wal_proposer"); - } - let (wal_end,timeline) = self.find_end_of_wal(false); - if start_pos == 0 { - start_pos = wal_end; - } - info!("Start replication from {:X}/{:>08X} till {:X}/{:>08X}", - (start_pos>>32) as u32, start_pos as u32, - (stop_pos>>32) as u32, stop_pos as u32); - BeMessage::write(&mut self.outbuf, &BeMessage::Copy); - self.send().await?; + // + // Handle START_REPLICATION replication command + // + async fn handle_start_replication(&mut self, cmd: &Bytes) -> Result { + let re = Regex::new(r"([[:xdigit:]]*)/([[:xdigit:]]*)").unwrap(); + let mut caps = re.captures_iter(str::from_utf8(&cmd[..]).unwrap()); + let cap = caps.next().unwrap(); + let mut start_pos: XLogRecPtr = (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?; + let stop_pos: XLogRecPtr = if let Some(cap) = caps.next() { + (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])? + } else { + 0 + }; + let wal_seg_size = self.acceptor.get_info().server.wal_seg_size as usize; + if wal_seg_size == 0 { + io_error!("Can not start replication before connecting to wal_proposer"); + } + let (wal_end, timeline) = self.find_end_of_wal(false); + if start_pos == 0 { + start_pos = wal_end; + } + info!( + "Start replication from {:X}/{:>08X} till {:X}/{:>08X}", + (start_pos >> 32) as u32, + start_pos as u32, + (stop_pos >> 32) as u32, + stop_pos as u32 + ); + BeMessage::write(&mut self.outbuf, &BeMessage::Copy); + self.send().await?; - /* - * Always start streaming at the beginning of a segment - */ - start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64; + /* + * Always start streaming at the beginning of a segment + */ + start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64; - let mut end_pos : XLogRecPtr; - let mut commit_lsn : XLogRecPtr; - let mut wal_file : Option = None; - self.outbuf.resize(LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + MAX_SEND_SIZE, 0u8); - loop { - /* Wait until we have some data to stream */ - if stop_pos != 0 { - /* recovery mode: stream up to the specified LSN (VCL) */ - if start_pos >= stop_pos { - /* recovery finished */ - break; - } - end_pos = stop_pos; - } else { - /* normal mode */ - loop { - // Rust doesn't allow to grab async result from mutex scope - let notified = self.acceptor.cond.notified(); - { - let shared_state = self.acceptor.mutex.lock().unwrap(); - commit_lsn = shared_state.commit_lsn; - if start_pos < commit_lsn { - end_pos = commit_lsn; - break; - } - } - notified.await; - } - } - if end_pos == END_REPLICATION_MARKER { - break; - } - // Try to fetch replica's feedback - match self.stream.try_read_buf(&mut self.inbuf) { - Ok(0) => break, - Ok(_) => { - match self.parse_message()? { - Some(FeMessage::CopyData(m)) => - self.acceptor.add_hs_feedback(HotStandbyFeedback::parse(&m.body)), - _ => {} - } - }, - Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { - }, - Err(e) => { - return Err(e.into()); - } - } + let mut end_pos: XLogRecPtr; + let mut commit_lsn: XLogRecPtr; + let mut wal_file: Option = None; + self.outbuf + .resize(LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + MAX_SEND_SIZE, 0u8); + loop { + /* Wait until we have some data to stream */ + if stop_pos != 0 { + /* recovery mode: stream up to the specified LSN (VCL) */ + if start_pos >= stop_pos { + /* recovery finished */ + break; + } + end_pos = stop_pos; + } else { + /* normal mode */ + loop { + // Rust doesn't allow to grab async result from mutex scope + let notified = self.acceptor.cond.notified(); + { + let shared_state = self.acceptor.mutex.lock().unwrap(); + commit_lsn = shared_state.commit_lsn; + if start_pos < commit_lsn { + end_pos = commit_lsn; + break; + } + } + notified.await; + } + } + if end_pos == END_REPLICATION_MARKER { + break; + } + // Try to fetch replica's feedback + match self.stream.try_read_buf(&mut self.inbuf) { + Ok(0) => break, + Ok(_) => match self.parse_message()? { + Some(FeMessage::CopyData(m)) => self + .acceptor + .add_hs_feedback(HotStandbyFeedback::parse(&m.body)), + _ => {} + }, + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {} + Err(e) => { + return Err(e.into()); + } + } - /* Open file if not opened yet */ - let curr_file = wal_file.take(); - let mut file : File; - if let Some(opened_file) = curr_file { - file = opened_file; - } else { - let segno = XLByteToSeg(start_pos, wal_seg_size); - let wal_file_name = XLogFileName(timeline, segno, wal_seg_size); - let wal_file_path = self.conf.data_dir.join(wal_file_name.clone() + ".partial"); - if let Ok(opened_file) = File::open(&wal_file_path) { - file = opened_file; - } else { - let wal_file_path = self.conf.data_dir.join(wal_file_name); - match File::open(&wal_file_path) { - Ok(opened_file) => file = opened_file, - Err(e) => { - error!("Failed to open log file {:?}: {}", &wal_file_path, e); - return Err(e.into()); - } - } - } - } - let send_size = min((end_pos - start_pos) as usize, MAX_SEND_SIZE); - let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size; - let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE; - let data_end = data_start + send_size; - file.read_exact(&mut self.outbuf[data_start..data_end])?; - self.outbuf[0] = b'd'; - BigEndian::write_u32(&mut self.outbuf[1..5], (msg_size - LIBPQ_MSG_SIZE_OFFS) as u32); - self.outbuf[5] = b'w'; - BigEndian::write_u64(&mut self.outbuf[6..14], start_pos); - BigEndian::write_u64(&mut self.outbuf[14..22], end_pos); - BigEndian::write_u64(&mut self.outbuf[22..30], get_current_timestamp()); + /* Open file if not opened yet */ + let curr_file = wal_file.take(); + let mut file: File; + if let Some(opened_file) = curr_file { + file = opened_file; + } else { + let segno = XLByteToSeg(start_pos, wal_seg_size); + let wal_file_name = XLogFileName(timeline, segno, wal_seg_size); + let wal_file_path = self.conf.data_dir.join(wal_file_name.clone() + ".partial"); + if let Ok(opened_file) = File::open(&wal_file_path) { + file = opened_file; + } else { + let wal_file_path = self.conf.data_dir.join(wal_file_name); + match File::open(&wal_file_path) { + Ok(opened_file) => file = opened_file, + Err(e) => { + error!("Failed to open log file {:?}: {}", &wal_file_path, e); + return Err(e.into()); + } + } + } + } + let send_size = min((end_pos - start_pos) as usize, MAX_SEND_SIZE); + let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size; + let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE; + let data_end = data_start + send_size; + file.read_exact(&mut self.outbuf[data_start..data_end])?; + self.outbuf[0] = b'd'; + BigEndian::write_u32( + &mut self.outbuf[1..5], + (msg_size - LIBPQ_MSG_SIZE_OFFS) as u32, + ); + self.outbuf[5] = b'w'; + BigEndian::write_u64(&mut self.outbuf[6..14], start_pos); + BigEndian::write_u64(&mut self.outbuf[14..22], end_pos); + BigEndian::write_u64(&mut self.outbuf[22..30], get_current_timestamp()); - self.stream.write_all(&self.outbuf[0..msg_size]).await?; - start_pos += send_size as u64; + self.stream.write_all(&self.outbuf[0..msg_size]).await?; + start_pos += send_size as u64; - if XLogSegmentOffset(start_pos, wal_seg_size) != 0 { - wal_file = Some(file); - } - } + if XLogSegmentOffset(start_pos, wal_seg_size) != 0 { + wal_file = Some(file); + } + } Ok(false) - } + } - async fn process_query(&mut self, q : &FeQueryMessage) -> Result { + async fn process_query(&mut self, q: &FeQueryMessage) -> Result { trace!("got query {:?}", q.body); if q.body.starts_with(b"IDENTIFY_SYSTEM") { - self.handle_identify_system().await + self.handle_identify_system().await } else if q.body.starts_with(b"START_REPLICATION") { - self.handle_start_replication(&q.body).await - } else { - io_error!("Unexpected command {:?}", q.body); - } - } + self.handle_start_replication(&q.body).await + } else { + io_error!("Unexpected command {:?}", q.body); + } + } - fn write_wal_file(&self, startpos : XLogRecPtr, timeline : TimeLineID, wal_seg_size : usize, buf: &[u8]) ->Result<()> { - let mut bytes_left : usize = buf.len(); - let mut bytes_written : usize = 0; - let mut partial; - let mut start_pos = startpos; - const ZERO_BLOCK : &'static[u8] = &[0u8; XLOG_BLCKSZ]; + fn write_wal_file( + &self, + startpos: XLogRecPtr, + timeline: TimeLineID, + wal_seg_size: usize, + buf: &[u8], + ) -> Result<()> { + let mut bytes_left: usize = buf.len(); + let mut bytes_written: usize = 0; + let mut partial; + let mut start_pos = startpos; + const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ]; - /* Extract WAL location for this block */ - let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize; + /* Extract WAL location for this block */ + let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize; - while bytes_left != 0 { - let bytes_to_write; + while bytes_left != 0 { + let bytes_to_write; - /* - * If crossing a WAL boundary, only write up until we reach wal - * segment size. - */ - if xlogoff + bytes_left > wal_seg_size { - bytes_to_write = wal_seg_size - xlogoff; - } else { - bytes_to_write = bytes_left; - } + /* + * If crossing a WAL boundary, only write up until we reach wal + * segment size. + */ + if xlogoff + bytes_left > wal_seg_size { + bytes_to_write = wal_seg_size - xlogoff; + } else { + bytes_to_write = bytes_left; + } - /* Open file */ - let segno = XLByteToSeg(start_pos, wal_seg_size); - let wal_file_name = XLogFileName(timeline, segno, wal_seg_size); - let wal_file_path = self.conf.data_dir.join(wal_file_name.clone()); - let wal_file_partial_path = self.conf.data_dir.join(wal_file_name.clone() + ".partial"); + /* Open file */ + let segno = XLByteToSeg(start_pos, wal_seg_size); + let wal_file_name = XLogFileName(timeline, segno, wal_seg_size); + let wal_file_path = self.conf.data_dir.join(wal_file_name.clone()); + let wal_file_partial_path = self.conf.data_dir.join(wal_file_name.clone() + ".partial"); - { - let mut wal_file : File; - /* Try to open already completed segment */ - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { - wal_file = file; - partial = false; - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { - /* Try to open existed partial file */ - wal_file = file; - partial = true; - } else { - /* Create and fill new partial file */ - partial = true; - match OpenOptions::new().create(true).write(true).open(&wal_file_partial_path) { - Ok(mut file) => { - for _ in 0..(wal_seg_size/XLOG_BLCKSZ) { - file.write_all(&ZERO_BLOCK)?; - } - wal_file = file; - }, - Err(e) => { - error!("Failed to open log file {:?}: {}", &wal_file_path, e); - return Err(e.into()); - } - } - } - wal_file.seek(SeekFrom::Start(xlogoff as u64))?; - wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; + { + let mut wal_file: File; + /* Try to open already completed segment */ + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + wal_file = file; + partial = false; + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) + { + /* Try to open existed partial file */ + wal_file = file; + partial = true; + } else { + /* Create and fill new partial file */ + partial = true; + match OpenOptions::new() + .create(true) + .write(true) + .open(&wal_file_partial_path) + { + Ok(mut file) => { + for _ in 0..(wal_seg_size / XLOG_BLCKSZ) { + file.write_all(&ZERO_BLOCK)?; + } + wal_file = file; + } + Err(e) => { + error!("Failed to open log file {:?}: {}", &wal_file_path, e); + return Err(e.into()); + } + } + } + wal_file.seek(SeekFrom::Start(xlogoff as u64))?; + wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; - // Flush file is not prohibited - if !self.conf.no_sync { - wal_file.sync_all()?; - } - } - /* Write was successful, advance our position */ - bytes_written += bytes_to_write; - bytes_left -= bytes_to_write; - start_pos += bytes_to_write as u64; - xlogoff += bytes_to_write; + // Flush file is not prohibited + if !self.conf.no_sync { + wal_file.sync_all()?; + } + } + /* Write was successful, advance our position */ + bytes_written += bytes_to_write; + bytes_left -= bytes_to_write; + start_pos += bytes_to_write as u64; + xlogoff += bytes_to_write; - /* Did we reach the end of a WAL segment? */ - if XLogSegmentOffset(start_pos, wal_seg_size) == 0 { - xlogoff = 0; - if partial { - fs::rename(&wal_file_partial_path, &wal_file_path)?; - } - } - } - Ok(()) - } + /* Did we reach the end of a WAL segment? */ + if XLogSegmentOffset(start_pos, wal_seg_size) == 0 { + xlogoff = 0; + if partial { + fs::rename(&wal_file_partial_path, &wal_file_path)?; + } + } + } + Ok(()) + } - // Find last WAL record. If "precise" is false then just locatelast partial segment - fn find_end_of_wal(&self, precise : bool) -> (XLogRecPtr, TimeLineID) { - find_end_of_wal(&self.conf.data_dir, self.acceptor.get_info().server.wal_seg_size as usize, precise) - } + // Find last WAL record. If "precise" is false then just locatelast partial segment + fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) { + find_end_of_wal( + &self.conf.data_dir, + self.acceptor.get_info().server.wal_seg_size as usize, + precise, + ) + } } diff --git a/walkeeper/src/xlog_utils.rs b/walkeeper/src/xlog_utils.rs index 57a8fa3c88..51db9681a6 100644 --- a/walkeeper/src/xlog_utils.rs +++ b/walkeeper/src/xlog_utils.rs @@ -1,229 +1,254 @@ -use std::fs::{self,File}; -use std::time::SystemTime; -use std::path::PathBuf; -use std::cmp::min; -use std::io::prelude::*; -use byteorder::{LittleEndian, ByteOrder}; +use byteorder::{ByteOrder, LittleEndian}; use crc32c::*; use log::*; +use std::cmp::min; +use std::fs::{self, File}; +use std::io::prelude::*; +use std::path::PathBuf; +use std::time::SystemTime; -pub const XLOG_FNAME_LEN : usize = 24; -pub const XLOG_BLCKSZ : usize = 8192; -pub const XLP_FIRST_IS_CONTRECORD : u16 = 0x0001; -pub const XLOG_PAGE_MAGIC : u16 = 0xD109; -pub const XLP_REM_LEN_OFFS : usize = 2+2+4+8; -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD : usize = XLP_REM_LEN_OFFS + 4 + 4; -pub const XLOG_SIZE_OF_XLOG_LONG_PHD : usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4; -pub const XLOG_RECORD_CRC_OFFS : usize = 4+4+8+1+1+2; -pub const XLOG_SIZE_OF_XLOG_RECORD : usize = XLOG_RECORD_CRC_OFFS+4; +pub const XLOG_FNAME_LEN: usize = 24; +pub const XLOG_BLCKSZ: usize = 8192; +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLOG_PAGE_MAGIC: u16 = 0xD109; +pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4; +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4; +pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4; pub type XLogRecPtr = u64; pub type TimeLineID = u32; pub type TimestampTz = u64; pub type XLogSegNo = u64; #[allow(non_snake_case)] -pub fn XLogSegmentOffset(xlogptr : XLogRecPtr, wal_segsz_bytes : usize) -> u32 { - return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); +pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 { + return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); } #[allow(non_snake_case)] -pub fn XLogSegmentsPerXLogId(wal_segsz_bytes : usize) -> XLogSegNo { - return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; } #[allow(non_snake_case)] -pub fn XLByteToSeg(xlogptr : XLogRecPtr, wal_segsz_bytes : usize) -> XLogSegNo { - return xlogptr / wal_segsz_bytes as u64; +pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { + return xlogptr / wal_segsz_bytes as u64; } #[allow(non_snake_case)] -pub fn XLogSegNoOffsetToRecPtr(segno: XLogSegNo, offset:u32, wal_segsz_bytes: usize) -> XLogRecPtr { - return segno * (wal_segsz_bytes as u64) + (offset as u64); +pub fn XLogSegNoOffsetToRecPtr( + segno: XLogSegNo, + offset: u32, + wal_segsz_bytes: usize, +) -> XLogRecPtr { + return segno * (wal_segsz_bytes as u64) + (offset as u64); } #[allow(non_snake_case)] -pub fn XLogFileName(tli : TimeLineID, logSegNo : XLogSegNo, wal_segsz_bytes : usize) -> String { - return format!("{:>08X}{:>08X}{:>08X}", - tli, - logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), - logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)); +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + return format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ); } #[allow(non_snake_case)] -pub fn XLogFromFileName(fname:&str, wal_seg_size: usize) -> (XLogSegNo,TimeLineID) { - let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); - let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; - let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; - return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); } #[allow(non_snake_case)] -pub fn IsXLogFileName(fname:&str) -> bool { - return fname.len() == XLOG_FNAME_LEN - && fname.chars().all(|c| c.is_ascii_hexdigit()); +pub fn IsXLogFileName(fname: &str) -> bool { + return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } #[allow(non_snake_case)] -pub fn IsPartialXLogFileName(fname:&str) -> bool { - return fname.ends_with(".partial") - && IsXLogFileName(&fname[0..fname.len()-8]); +pub fn IsPartialXLogFileName(fname: &str) -> bool { + return fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]); } -pub fn get_current_timestamp() -> TimestampTz -{ - const UNIX_EPOCH_JDATE : u64 = 2440588; /* == date2j(1970, 1, 1) */ - const POSTGRES_EPOCH_JDATE : u64 = 2451545; /* == date2j(2000, 1, 1) */ - const SECS_PER_DAY : u64 = 86400; - const USECS_PER_SEC : u64 = 1000000; - match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { - Ok(n) => (n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) * USECS_PER_SEC + n.subsec_micros() as u64, - Err(_) => panic!("SystemTime before UNIX EPOCH!"), - } +pub fn get_current_timestamp() -> TimestampTz { + const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ + const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ + const SECS_PER_DAY: u64 = 86400; + const USECS_PER_SEC: u64 = 1000000; + match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { + Ok(n) => { + (n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) + * USECS_PER_SEC + + n.subsec_micros() as u64 + } + Err(_) => panic!("SystemTime before UNIX EPOCH!"), + } } -fn find_end_of_wal_segment(data_dir: &PathBuf, segno: XLogSegNo, tli: TimeLineID, wal_seg_size: usize) -> u32 { - let mut offs : usize = 0; - let mut contlen : usize = 0; - let mut wal_crc : u32 = 0; - let mut crc : u32 = 0; - let mut rec_offs : usize = 0; - let mut buf = [0u8;XLOG_BLCKSZ]; - let file_name = XLogFileName(tli, segno, wal_seg_size); - let mut last_valid_rec_pos : usize = 0; - let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); - let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; +fn find_end_of_wal_segment( + data_dir: &PathBuf, + segno: XLogSegNo, + tli: TimeLineID, + wal_seg_size: usize, +) -> u32 { + let mut offs: usize = 0; + let mut contlen: usize = 0; + let mut wal_crc: u32 = 0; + let mut crc: u32 = 0; + let mut rec_offs: usize = 0; + let mut buf = [0u8; XLOG_BLCKSZ]; + let file_name = XLogFileName(tli, segno, wal_seg_size); + let mut last_valid_rec_pos: usize = 0; + let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); + let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; - while offs < wal_seg_size { - if offs % XLOG_BLCKSZ == 0 { - if let Ok(bytes_read) = file.read(&mut buf) { - if bytes_read != buf.len() { - break; - } - } else { - break; - } - let xlp_magic = LittleEndian::read_u16(&buf[0..2]); - let xlp_info = LittleEndian::read_u16(&buf[2..4]); - let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS+4]); - if xlp_magic != XLOG_PAGE_MAGIC { - info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic); - break; - } - if offs == 0 { - offs = XLOG_SIZE_OF_XLOG_LONG_PHD; - if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { - offs += ((xlp_rem_len + 7) & !7) as usize; - } - } else { - offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; - } + while offs < wal_seg_size { + if offs % XLOG_BLCKSZ == 0 { + if let Ok(bytes_read) = file.read(&mut buf) { + if bytes_read != buf.len() { + break; + } + } else { + break; + } + let xlp_magic = LittleEndian::read_u16(&buf[0..2]); + let xlp_info = LittleEndian::read_u16(&buf[2..4]); + let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); + if xlp_magic != XLOG_PAGE_MAGIC { + info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic); + break; + } + if offs == 0 { + offs = XLOG_SIZE_OF_XLOG_LONG_PHD; + if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { + offs += ((xlp_rem_len + 7) & !7) as usize; + } + } else { + offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; + } } else if contlen == 0 { - let page_offs = offs % XLOG_BLCKSZ; - let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs+4]) as usize; - if xl_tot_len == 0 { - break; - } - last_valid_rec_pos = offs; - offs += 4; - rec_offs = 4; + let page_offs = offs % XLOG_BLCKSZ; + let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; + if xl_tot_len == 0 { + break; + } + last_valid_rec_pos = offs; + offs += 4; + rec_offs = 4; contlen = xl_tot_len - 4; - rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs+4]); + rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); } else { - let page_offs = offs % XLOG_BLCKSZ; + let page_offs = offs % XLOG_BLCKSZ; // we're continuing a record, possibly from previous page. let pageleft = XLOG_BLCKSZ - page_offs; // read the rest of the record, or as much as fits on this page. let n = min(contlen, pageleft); - if rec_offs < XLOG_RECORD_CRC_OFFS { - let len = min(XLOG_RECORD_CRC_OFFS-rec_offs, n); - rec_hdr[rec_offs..rec_offs+len].copy_from_slice(&buf[page_offs..page_offs+len]); - } - if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { - let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; - wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs+4]); - crc = crc32c_append(0, &buf[crc_offs+4..page_offs+n]); - crc = !crc; - } else { - crc ^= 0xFFFFFFFFu32; - crc = crc32c_append(crc, &buf[page_offs..page_offs+n]); - crc = !crc; - } - rec_offs += n; + if rec_offs < XLOG_RECORD_CRC_OFFS { + let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); + rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); + } + if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { + let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; + wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); + crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); + crc = !crc; + } else { + crc ^= 0xFFFFFFFFu32; + crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); + crc = !crc; + } + rec_offs += n; offs += n; contlen -= n; if contlen == 0 { - crc = !crc; - crc = crc32c_append(crc, &rec_hdr); - offs = (offs + 7) & !7; // pad on 8 bytes boundary */ - if crc == wal_crc { - last_valid_rec_pos = offs; - } else { - info!("CRC mismatch {} vs {} at {}", crc, wal_crc, last_valid_rec_pos); - break; - } + crc = !crc; + crc = crc32c_append(crc, &rec_hdr); + offs = (offs + 7) & !7; // pad on 8 bytes boundary */ + if crc == wal_crc { + last_valid_rec_pos = offs; + } else { + info!( + "CRC mismatch {} vs {} at {}", + crc, wal_crc, last_valid_rec_pos + ); + break; + } } } - } - return last_valid_rec_pos as u32; + } + return last_valid_rec_pos as u32; } -pub fn find_end_of_wal(data_dir: &PathBuf, wal_seg_size:usize, precise:bool) -> (XLogRecPtr,TimeLineID) { - let mut high_segno : XLogSegNo = 0; - let mut high_tli : TimeLineID = 0; - let mut high_ispartial = false; +pub fn find_end_of_wal( + data_dir: &PathBuf, + wal_seg_size: usize, + precise: bool, +) -> (XLogRecPtr, TimeLineID) { + let mut high_segno: XLogSegNo = 0; + let mut high_tli: TimeLineID = 0; + let mut high_ispartial = false; - for entry in fs::read_dir(data_dir).unwrap() { - if let Ok(entry) = entry { - let ispartial : bool; - let entry_name = entry.file_name(); - let fname = entry_name.to_str().unwrap(); - /* - * Check if the filename looks like an xlog file, or a .partial file. - */ - if IsXLogFileName(fname) { - ispartial = false; - } else if IsPartialXLogFileName(fname) { - ispartial = true; - } else { - continue; - } - let (segno,tli) = XLogFromFileName(fname, wal_seg_size); - if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 { - continue; - } - if segno > high_segno || - (segno == high_segno && tli > high_tli) || - (segno == high_segno && tli == high_tli && high_ispartial && !ispartial) - { - high_segno = segno; - high_tli = tli; - high_ispartial = ispartial; - } - } - } - if high_segno > 0 { - let mut high_offs = 0; - /* - * Move the starting pointer to the start of the next segment, if the - * highest one we saw was completed. - */ - if !high_ispartial { - high_segno += 1; - } else if precise { /* otherwise locate last record in last partial segment */ - high_offs = find_end_of_wal_segment(data_dir, high_segno, high_tli, wal_seg_size); - } - let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size); - return (high_ptr,high_tli); - } - return (0,0); + for entry in fs::read_dir(data_dir).unwrap() { + if let Ok(entry) = entry { + let ispartial: bool; + let entry_name = entry.file_name(); + let fname = entry_name.to_str().unwrap(); + /* + * Check if the filename looks like an xlog file, or a .partial file. + */ + if IsXLogFileName(fname) { + ispartial = false; + } else if IsPartialXLogFileName(fname) { + ispartial = true; + } else { + continue; + } + let (segno, tli) = XLogFromFileName(fname, wal_seg_size); + if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 { + continue; + } + if segno > high_segno + || (segno == high_segno && tli > high_tli) + || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial) + { + high_segno = segno; + high_tli = tli; + high_ispartial = ispartial; + } + } + } + if high_segno > 0 { + let mut high_offs = 0; + /* + * Move the starting pointer to the start of the next segment, if the + * highest one we saw was completed. + */ + if !high_ispartial { + high_segno += 1; + } else if precise { + /* otherwise locate last record in last partial segment */ + high_offs = find_end_of_wal_segment(data_dir, high_segno, high_tli, wal_seg_size); + } + let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size); + return (high_ptr, high_tli); + } + return (0, 0); } pub fn main() { - let mut data_dir = PathBuf::new(); - data_dir.push("."); - let wal_seg_size = 16*1024*1024; - let (wal_end,tli) = find_end_of_wal(&data_dir, wal_seg_size, true); - println!("wal_end={:>08X}{:>08X}, tli={}", (wal_end >> 32) as u32, wal_end as u32, tli); + let mut data_dir = PathBuf::new(); + data_dir.push("."); + let wal_seg_size = 16 * 1024 * 1024; + let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true); + println!( + "wal_end={:>08X}{:>08X}, tli={}", + (wal_end >> 32) as u32, + wal_end as u32, + tli + ); }