From 533087fd5dbc8f23742b08d3dabd568ae034974a Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Mon, 19 Apr 2021 23:26:13 -0700 Subject: [PATCH 01/15] cargo fmt --- control_plane/src/compute.rs | 11 ++-- integration_tests/tests/test_wal_acceptor.rs | 62 ++++++++++---------- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b39d901be7..5c3ec5e816 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -113,7 +113,7 @@ impl ComputeControlPlane { pub fn new_test_master_node(&mut self) -> Arc { let node = self.new_vanilla_node(true).unwrap(); - println!("Create vanilla node at {:?}", node.address); + println!("Create vanilla node at {:?}", node.address); node.append_conf( "postgresql.conf", "synchronous_standby_names = 'safekeeper_proxy'\n", @@ -405,9 +405,12 @@ impl PostgresNode { .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") - .stderr(OpenOptions::new() - .append(true) - .open(self.env.data_dir.join("safepkeeper_proxy.log")).unwrap()) + .stderr( + OpenOptions::new() + .append(true) + .open(self.env.data_dir.join("safepkeeper_proxy.log")) + .unwrap(), + ) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index f4f7675b07..408f991bb2 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -51,41 +51,41 @@ fn test_multitenancy() { let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let mut nodes = Vec::new(); - let mut proxies = Vec::new(); - for _ in 0..N_NODES { - let node = compute_cplane.new_test_master_node(); - nodes.push(node); - nodes.last().unwrap().start().unwrap(); - proxies.push(nodes.last().unwrap().start_proxy(wal_acceptors.clone())); - } + let mut nodes = Vec::new(); + let mut proxies = Vec::new(); + for _ in 0..N_NODES { + let node = compute_cplane.new_test_master_node(); + nodes.push(node); + nodes.last().unwrap().start().unwrap(); + proxies.push(nodes.last().unwrap().start_proxy(wal_acceptors.clone())); + } // create schema - for node in &nodes { - node.safe_psql( - "postgres", - "CREATE TABLE t(key int primary key, value text)", - ); - } + for node in &nodes { + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + } - // Populate data - for node in &nodes { - node.safe_psql( - "postgres", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ); - } + // Populate data + for node in &nodes { + node.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ); + } - // Check data - for node in &nodes { - let count: i64 = node - .safe_psql("postgres", "SELECT sum(key) FROM t") - .first() - .unwrap() - .get(0); - println!("sum = {}", count); - assert_eq!(count, 5000050000); - } + // Check data + for node in &nodes { + let count: i64 = node + .safe_psql("postgres", "SELECT sum(key) FROM t") + .first() + .unwrap() + .get(0); + println!("sum = {}", count); + assert_eq!(count, 5000050000); + } } // Majority is always alive From b451ede199594c5a8ec5ef0ec34652b3427ad3a3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Apr 2021 14:40:03 +0300 Subject: [PATCH 02/15] Use rust bindgen for reading/writing the PostgreSQL control file. 
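
A minimal usage sketch of the new helpers (an editor's illustration, not part of this patch): it assumes decode_pg_control/encode_pg_control have the signatures introduced below, that the bindgen-generated ControlFileData exposes the usual system_identifier field, and the pg_control path shown is only a placeholder.

    // Read a cluster's pg_control, decode it into the bindgen-generated struct,
    // then re-encode it (which recomputes the CRC and pads to PG_CONTROL_FILE_SIZE).
    use bytes::Bytes;
    use postgres_ffi::{decode_pg_control, encode_pg_control};

    fn main() -> anyhow::Result<()> {
        // Placeholder path to a PostgreSQL data directory's control file.
        let buf = Bytes::from(std::fs::read("pgdata/global/pg_control")?);
        let controlfile = decode_pg_control(buf)?;
        println!("system identifier: {}", controlfile.system_identifier);
        let reencoded = encode_pg_control(controlfile);
        std::fs::write("pgdata/global/pg_control", &reencoded[..])?;
        Ok(())
    }
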
--- postgres_ffi/Cargo.toml | 19 +++++++++ postgres_ffi/build.rs | 38 ++++++++++++++++++ postgres_ffi/pg_control_ffi.h | 4 ++ postgres_ffi/src/lib.rs | 72 +++++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+) create mode 100644 postgres_ffi/Cargo.toml create mode 100644 postgres_ffi/build.rs create mode 100644 postgres_ffi/pg_control_ffi.h create mode 100644 postgres_ffi/src/lib.rs diff --git a/postgres_ffi/Cargo.toml b/postgres_ffi/Cargo.toml new file mode 100644 index 0000000000..77cc5cf028 --- /dev/null +++ b/postgres_ffi/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "postgres_ffi" +version = "0.1.0" +authors = ["Heikki Linnakangas "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4.19" +rand = "0.8.3" +bytes = "1.0.1" +byteorder = "1.4.3" +anyhow = "1.0" +crc32c = "0.6.0" +hex = "0.4.3" + +[build-dependencies] +bindgen = "0.53.1" diff --git a/postgres_ffi/build.rs b/postgres_ffi/build.rs new file mode 100644 index 0000000000..95903bf051 --- /dev/null +++ b/postgres_ffi/build.rs @@ -0,0 +1,38 @@ +extern crate bindgen; + +use std::env; +use std::path::PathBuf; + +fn main() { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed=pg_control_ffi.h"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header("pg_control_ffi.h") + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + + .whitelist_type("ControlFileData") + .whitelist_var("PG_CONTROL_FILE_SIZE") + .whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") + .whitelist_type("DBState") + + .clang_arg("-I../vendor/postgres/src/include") + + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. 
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/postgres_ffi/pg_control_ffi.h b/postgres_ffi/pg_control_ffi.h new file mode 100644 index 0000000000..169e66977b --- /dev/null +++ b/postgres_ffi/pg_control_ffi.h @@ -0,0 +1,4 @@ +#include "c.h" +#include "catalog/pg_control.h" + +const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc); diff --git a/postgres_ffi/src/lib.rs b/postgres_ffi/src/lib.rs new file mode 100644 index 0000000000..b62114ea7d --- /dev/null +++ b/postgres_ffi/src/lib.rs @@ -0,0 +1,72 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +use bytes::{Buf, Bytes, BytesMut}; + +// sizeof(ControlFileData) +const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize; + +impl ControlFileData { + + // Initialize an all-zeros ControlFileData struct + pub fn new() -> ControlFileData { + let controlfile: ControlFileData; + + let b = [0u8; SIZEOF_CONTROLDATA]; + controlfile = unsafe { + std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) + }; + + return controlfile; + } +} + +pub fn decode_pg_control(buf: Bytes) -> Result { + + let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA]; + buf.clone().copy_to_slice(&mut b); + + let controlfile: ControlFileData; + + // TODO: verify CRC + let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC]; + data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]); + let expectedcrc = crc32c::crc32c(&data_without_crc); + + controlfile = unsafe { + std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) + }; + + if expectedcrc != controlfile.crc { + anyhow::bail!("invalid CRC in control file: expected {:08X}, was {:08X}", + expectedcrc, controlfile.crc); + } + + Ok(controlfile) +} + +pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes { + + let b: [u8; SIZEOF_CONTROLDATA]; + + b = unsafe { + std::mem::transmute::(controlfile) + }; + + // Recompute the CRC + let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC]; + data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]); + let newcrc = crc32c::crc32c(&data_without_crc); + + let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize); + + buf.extend_from_slice(&b[0..OFFSETOF_CRC]); + buf.extend_from_slice(&newcrc.to_ne_bytes()); + // Fill the rest of the control file with zeros. + buf.resize(PG_CONTROL_FILE_SIZE as usize, 0); + + return buf.into(); +} From c5d56ffe22a4b8e9f209e7ba72407dbda66ec7fc Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 13 Apr 2021 22:11:49 +0300 Subject: [PATCH 03/15] Fix build: configure postgres in vendor/postgres directory for postgres_ffi --- pgbuild.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pgbuild.sh b/pgbuild.sh index 9d4c0baa65..8ba1e2cbf9 100755 --- a/pgbuild.sh +++ b/pgbuild.sh @@ -31,3 +31,9 @@ export DESTDIR=$REPO_ROOT/tmp_install echo "Installing postgres to $DESTDIR" make install -s + +#Configure postgres in src directory. 
We need it for postgres_ffi build +echo "Configuring postgres build in place" +cd ../../vendor/postgres/ +./configure CFLAGS='-O0' --enable-debug --enable-cassert \ + --enable-depend --with-libxml --prefix=/ > configure.log \ No newline at end of file From 583f64768f0312f3113780b326068a04cfafba3f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 Apr 2021 19:53:11 +0300 Subject: [PATCH 04/15] Fix wal safekeeper's reply to IDENTIFY_SYSTEM command. The PostgreSQL FE/BE RowDescription message was built incorrectly, the colums were sent in wrong order, and the command tag was missing NULL-terminator. With these fixes, 'psql' understands the reply and shows it correctly. --- walkeeper/src/pq_protocol.rs | 24 ++++++++++++------------ walkeeper/src/wal_service.rs | 9 +++++---- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/walkeeper/src/pq_protocol.rs b/walkeeper/src/pq_protocol.rs index 286d563b73..299b830d5e 100644 --- a/walkeeper/src/pq_protocol.rs +++ b/walkeeper/src/pq_protocol.rs @@ -146,20 +146,20 @@ impl<'a> BeMessage<'a> { BeMessage::RowDescription(rows) => { buf.put_u8(b'T'); - let total_len: u32 = rows - .iter() - .fold(0, |acc, row| acc + row.name.len() as u32 + 3 * (4 + 2)); - buf.put_u32(4 + 2 + total_len); + + let mut body = BytesMut::new(); + body.put_i16(rows.len() as i16); // # of fields for row in rows.iter() { - buf.put_i16(row.name.len() as i16); - buf.put_slice(row.name); - buf.put_i32(0); /* table oid */ - buf.put_i16(0); /* attnum */ - buf.put_u32(row.typoid); - buf.put_i16(row.typlen); - buf.put_i32(-1); /* typmod */ - buf.put_i16(0); /* format code */ + body.put_slice(row.name); + body.put_i32(0); /* table oid */ + body.put_i16(0); /* attnum */ + body.put_u32(row.typoid); + body.put_i16(row.typlen); + body.put_i32(-1); /* typmod */ + body.put_i16(0); /* format code */ } + buf.put_i32((4 + body.len()) as i32); // # of bytes, including len field itself + buf.put(body); } BeMessage::DataRow(vals) => { diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 5570781123..6e17f41f06 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -366,7 +366,7 @@ pub fn thread_main(conf: WalAcceptorConf) { info!("Starting wal acceptor on {}", conf.listen_addr); runtime.block_on(async { - let _unused = main_loop(&conf).await; + main_loop(&conf).await.unwrap(); }); } @@ -443,7 +443,7 @@ impl System { return shared_state.hs_feedback; } - // Load and lock control file (prevent running more than one instance of safekeeper + // Load and lock control file (prevent running more than one instance of safekeeper) fn load_control_file(&self, conf: &WalAcceptorConf) { let control_file_path = conf .data_dir @@ -678,6 +678,7 @@ impl Connection { // Add far as replication in postgres is initiated by receiver, we should use callme mechanism if let Err(e) = self.request_callback().await { // Do not treate it as fatal error and continue work + // FIXME: we should retry after a while... 
error!("Failed to send callme request to pageserver: {}", e); } @@ -893,11 +894,11 @@ impl Connection { ); BeMessage::write( &mut self.outbuf, - &BeMessage::DataRow(&[Some(lsn_bytes), Some(tli_bytes), Some(sysid_bytes), None]), + &BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]), ); BeMessage::write( &mut self.outbuf, - &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"), + &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"), ); BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); self.send().await?; From fa5d31056b00b3046a15e29b8aed6fcd34498702 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 Apr 2021 11:11:13 +0300 Subject: [PATCH 05/15] Remove unimplemented "snapshot" subcommand from --help --- vendor/postgres | 2 +- zenith/src/main.rs | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/vendor/postgres b/vendor/postgres index 167196910d..d143241a16 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 167196910d6f41466c82793bcf14bfe442468776 +Subproject commit d143241a1653d3825d94d645801c62c7755b1015 diff --git a/zenith/src/main.rs b/zenith/src/main.rs index f6690dd8d7..5e34c655b9 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -30,14 +30,6 @@ fn main() { .subcommand(SubCommand::with_name("stop").arg(name_arg.clone())) .subcommand(SubCommand::with_name("destroy").arg(name_arg.clone())), ) - .subcommand( - SubCommand::with_name("snapshot") - .about("Manage database snapshots") - .subcommand(SubCommand::with_name("create")) - .subcommand(SubCommand::with_name("start")) - .subcommand(SubCommand::with_name("stop")) - .subcommand(SubCommand::with_name("destroy")), - ) .get_matches(); // handle init separately and exit From 2c5fb6d6c8a42c84ef845dcc8c1e6ffac224c7b6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Apr 2021 19:10:37 +0300 Subject: [PATCH 06/15] Change 'relsize_inc' signature to be a bit nicer. Don't add 1 to the argument in the function, the callers must do it now. And don't accept None argument, pass 0 instead for an empty relation. --- pageserver/src/page_cache.rs | 18 ++++++++++-------- pageserver/src/page_service.rs | 4 ++-- pageserver/src/restore_datadir.rs | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 1c05ea7e8f..db0a33b55b 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -597,17 +597,19 @@ impl PageCache { } } - // FIXME: Shouldn't relation size also be tracked with an LSN? - // If a replica is lagging behind, it needs to get the size as it was on - // the replica's current replay LSN. - pub fn relsize_inc(&self, rel: &RelTag, to: Option) { + /// Remember a relation's size in blocks. + /// + /// If 'to' is larger than the previously remembered size, the remembered size is increased to 'to'. + /// But if it's smaller, there is no change. + pub fn relsize_inc(&self, rel: &RelTag, to: u32) { + // FIXME: Shouldn't relation size also be tracked with an LSN? + // If a replica is lagging behind, it needs to get the size as it was on + // the replica's current replay LSN. 
let mut shared = self.shared.lock().unwrap(); let entry = shared.relsize_cache.entry(*rel).or_insert(0); - if let Some(to) = to { - if to >= *entry { - *entry = to + 1; - } + if to >= *entry { + *entry = to; } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f704990f5e..9ff0b2cf46 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -600,7 +600,7 @@ impl Connection { forknum: req.forknum, }; - pcache.relsize_inc(&tag, None); + pcache.relsize_inc(&tag, 0); self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, @@ -616,7 +616,7 @@ impl Connection { forknum: req.forknum, }; - pcache.relsize_inc(&tag, Some(req.blkno)); + pcache.relsize_inc(&tag, req.blkno + 1); self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: true, diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs index 985f5e3905..3b4f303bbc 100644 --- a/pageserver/src/restore_datadir.rs +++ b/pageserver/src/restore_datadir.rs @@ -333,7 +333,7 @@ async fn slurp_base_file( pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192)); - pcache.relsize_inc(&reltag, Some(blknum)); + pcache.relsize_inc(&reltag, blknum + 1); blknum += 1; } } From 3600b33f1cbe4114a020bb17389c814d4b3b7c24 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 Apr 2021 13:48:13 +0300 Subject: [PATCH 07/15] Implement "timelines" in page server This replaces the page server's "datadir" concept. The Page Server now always works with a "Zenith Repository". When you initialize a new repository with "zenith init", it runs initdb and loads an initial basebackup of the freshly-created cluster into the repository, on "main" branch. Repository can hold multiple "timelines", which can be given human-friendly names, making them "branches". One page server simultaneously serves all timelines stored in the repository, and you can have multiple Postgres compute nodes connected to the page server, as long they all operate on a different timeline. There is a new command "zenith branch", which can be used to fork off new branches from existing branches. The repository uses the directory layout desribed as Repository format v1 in https://github.com/zenithdb/rfcs/pull/5. It it *highly* inefficient: - we never create new snapshots. So in practice, it's really just a base backup of the initial empty cluster, and everything else is reconstructed by redoing all WAL - when you create a new timeline, the base snapshot and *all* WAL is copied from the new timeline to the new one. There is no smarts about referencing the old snapshots/wal from the ancestor timeline. To support all this, this commit includes a bunch of other changes: - Implement "basebackup" funtionality in page server. When you initialize a new compute node with "zenith pg create", it connects to the page server, and requests a base backup of the Postgres data directory on that timeline. (the base backup excludes user tables, so it's not as bad as it sounds). - Have page server's WAL receiver write the WAL into timeline dir. This allows running a Page Server and Compute Nodes without a WAL safekeeper, until we get around to integrate that properly into the system. (Even after we integrate WAL safekeeper, this is perhaps how this will operate when you want to run the system on your laptop.) - restore_datadir.rs was renamed to restore_local_repo.rs, and heavily modified to use the new format. It now also restores all WAL. 
- Page server no longer scans and restores everything into memory at startup. Instead, when the first request is made for a timeline, the timeline is slurped into memory at that point. - The responsibility for telling page server to "callmemaybe" was moved into Postgres libpqpagestore code. Also, WAL producer connstring cannot be specified in the pageserver's command line anymore. - Having multiple "system identifiers" in the same page server is no longer supported. I repurposed much of that code to support multiple timelines, instead. - Implemented very basic, incomplete, support for PostgreSQL's Extended Query Protocol in page_service.rs. Turns out that rust-postgres' copy_out() function always uses the extended query protocol to send out the command, and I'm using that to stream the base backup from the page server. TODO: I haven't fixed the WAL safekeeper for this scheme, so all the integration tests involving safekeepers are failing. My plan is to modify the safekeeper to know about Zenith timelines, too, and modify it to work with the same Zenith repository format. It only needs to care about the '.zenith/timelines//wal' directories. --- Cargo.lock | 315 +++++++++--- Cargo.toml | 1 + cli-v2-story.md | 188 +++++++ control_plane/Cargo.toml | 10 +- control_plane/src/compute.rs | 199 ++++---- control_plane/src/local_env.rs | 321 ++++++++---- control_plane/src/storage.rs | 141 +++--- integration_tests/Cargo.toml | 2 + integration_tests/tests/test_compute.rs | 4 + integration_tests/tests/test_control_plane.rs | 3 + integration_tests/tests/test_pageserver.rs | 88 ++-- integration_tests/tests/test_wal_acceptor.rs | 40 +- pageserver/Cargo.toml | 5 + pageserver/src/basebackup.rs | 197 ++++++++ pageserver/src/bin/cli/main.rs | 43 -- pageserver/src/bin/cli/pg.rs | 105 ---- pageserver/src/bin/cli/snapshot.rs | 27 - pageserver/src/bin/cli/storage.rs | 25 - pageserver/src/bin/cli/subcommand.rs | 29 -- pageserver/src/bin/pageserver.rs | 110 ++--- pageserver/src/lib.rs | 37 +- pageserver/src/page_cache.rs | 78 ++- pageserver/src/page_service.rs | 463 ++++++++++++++++-- pageserver/src/restore_datadir.rs | 339 ------------- pageserver/src/restore_local_repo.rs | 460 +++++++++++++++++ pageserver/src/waldecoder.rs | 97 ++-- pageserver/src/walreceiver.rs | 225 ++++++++- pageserver/src/walredo.rs | 26 +- vendor/postgres | 2 +- walkeeper/Cargo.toml | 3 + walkeeper/src/bin/wal_acceptor.rs | 15 +- walkeeper/src/lib.rs | 3 + walkeeper/src/wal_service.rs | 3 +- walkeeper/src/xlog_utils.rs | 6 +- zenith/Cargo.toml | 6 + zenith/src/main.rs | 261 ++++++++-- 36 files changed, 2697 insertions(+), 1180 deletions(-) create mode 100644 cli-v2-story.md create mode 100644 pageserver/src/basebackup.rs delete mode 100644 pageserver/src/bin/cli/main.rs delete mode 100644 pageserver/src/bin/cli/pg.rs delete mode 100644 pageserver/src/bin/cli/snapshot.rs delete mode 100644 pageserver/src/bin/cli/storage.rs delete mode 100644 pageserver/src/bin/cli/subcommand.rs delete mode 100644 pageserver/src/restore_datadir.rs create mode 100644 pageserver/src/restore_local_repo.rs diff --git a/Cargo.lock b/Cargo.lock index 85c299c7a8..ad2a518f22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0" [[package]] name = "async-trait" -version = "0.1.48" +version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea56748e10732c49404c153638a15ec3d6211ec5ff35d9bb20e13b93576adf" +checksum = 
"589652ce7ccb335d1e7ecb3be145425702b290dbcb7029bbeaae263fc1d87b48" dependencies = [ "proc-macro2", "quote", @@ -241,6 +241,30 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "bindgen" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5" +dependencies = [ + "bitflags", + "cexpr", + "cfg-if 0.1.10", + "clang-sys", + "clap", + "env_logger", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "which", +] + [[package]] name = "bitflags" version = "1.2.1" @@ -323,6 +347,15 @@ version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -348,6 +381,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "clang-sys" +version = "0.29.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.33.3" @@ -382,15 +426,22 @@ checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" name = "control_plane" version = "0.1.0" dependencies = [ - "home", + "anyhow", + "bytes", + "fs_extra", + "hex", "lazy_static", + "pageserver", "postgres", + "postgres_ffi", "rand 0.8.3", "regex", "serde", "serde_derive", + "tar", "tokio-postgres", "toml", + "walkeeper", ] [[package]] @@ -426,9 +477,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -543,6 +594,19 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "env_logger" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + [[package]] name = "event-listener" version = "2.5.1" @@ -564,6 +628,18 @@ dependencies = [ "instant", ] +[[package]] +name = "filetime" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall 0.2.6", + "winapi", +] + [[package]] name = "fnv" version = "1.0.7" @@ -606,10 +682,16 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.13" +name = "fs_extra" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.14" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" dependencies = [ "futures-channel", "futures-core", @@ -622,9 +704,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" +checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" dependencies = [ "futures-core", "futures-sink", @@ -632,15 +714,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" +checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" [[package]] name = "futures-executor" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" +checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" dependencies = [ "futures-core", "futures-task", @@ -649,9 +731,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" +checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" [[package]] name = "futures-lite" @@ -670,9 +752,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" +checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -682,21 +764,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" +checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" [[package]] name = "futures-task" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" +checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" [[package]] name = "futures-util" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" +checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" dependencies = [ "futures-channel", "futures-core", @@ -744,6 +826,12 @@ dependencies = [ "wasi 0.10.0+wasi-snapshot-preview1", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "gloo-timers" version = "0.2.1" @@ -810,20 +898,11 @@ dependencies = [ "digest", ] -[[package]] -name = "home" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2456aef2e6b6a9784192ae780c0f15bc57df0e918585282325e8c8ac27737654" -dependencies = [ - "winapi", -] - [[package]] name = "http" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" dependencies = [ "bytes", "fnv", @@ -843,9 +922,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" +checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589" [[package]] name = "httpdate" @@ -853,6 +932,15 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + [[package]] name = "hyper" version = "0.14.5" @@ -926,9 +1014,11 @@ version = "0.1.0" dependencies = [ "control_plane", "lazy_static", + "pageserver", "postgres", "rand 0.8.3", "tokio-postgres", + "walkeeper", ] [[package]] @@ -968,10 +1058,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "libc" -version = "0.2.92" +name = "lazycell" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56d855069fafbb9b344c0f962150cd2c1187975cb1c22c1522c240d8c4986714" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" + +[[package]] +name = "libloading" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" +dependencies = [ + "cc", + "winapi", +] [[package]] name = "lock_api" @@ -1088,6 +1194,16 @@ dependencies = [ "socket2", ] +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + [[package]] name = "ntapi" version = "0.3.6" @@ -1200,12 +1316,15 @@ dependencies = [ "crossbeam-channel", "daemonize", "fs2", + "fs_extra", "futures", + "hex", "lazy_static", "log", "postgres", "postgres-protocol", "postgres-types", + "postgres_ffi", "rand 0.8.3", "regex", "rust-s3", @@ -1214,6 +1333,7 @@ dependencies = [ "slog-scope", "slog-stdlog", "slog-term", + "tar", "termion", "thiserror", "tokio", @@ -1249,11 +1369,17 @@ dependencies = [ "cfg-if 1.0.0", "instant", "libc", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "smallvec", "winapi", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1369,6 +1495,20 @@ dependencies = [ "postgres-protocol", ] 
+[[package]] +name = "postgres_ffi" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "byteorder", + "bytes", + "chrono", + "crc32c", + "hex", + "rand 0.8.3", +] + [[package]] name = "ppv-lite86" version = "0.2.10" @@ -1396,6 +1536,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.9" @@ -1494,9 +1640,9 @@ checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" dependencies = [ "bitflags", ] @@ -1507,7 +1653,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8440d8acb4fd3d277125b4bd01a6f38aee8d814b3b5fc09b3f2b825d37d3fe8f" dependencies = [ - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1528,7 +1674,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ "getrandom 0.2.2", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1559,9 +1705,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" dependencies = [ "base64", "bytes", @@ -1645,6 +1791,12 @@ dependencies = [ "url", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.2.3" @@ -1794,6 +1946,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + [[package]] name = "signal-hook-registry" version = "1.3.0" @@ -1914,9 +2072,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ce15dd3ed8aa2f8eeac4716d6ef5ab58b6b9256db41d7e1a0224c2788e8fd87" +checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb" dependencies = [ "proc-macro2", "quote", @@ -1929,6 +2087,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" +[[package]] +name = "tar" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0bcfbd6a598361fda270d82469fff3d65089dc33e175c9a131f7b4cd395f228" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.2.0" @@ -1938,7 +2107,7 @@ dependencies = [ "cfg-if 1.0.0", "libc", "rand 0.8.3", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "remove_dir_all", "winapi", ] @@ -1954,6 
+2123,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + [[package]] name = "termion" version = "1.5.6" @@ -1962,7 +2140,7 @@ checksum = "077185e2eac69c3f8379a4298e1e07cd36beb962290d4a51199acf0fdc10607e" dependencies = [ "libc", "numtoa", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "redox_termios", ] @@ -2032,9 +2210,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134af885d758d645f0f0505c9a8b3f9bf8a348fd822e112ab5248138348f1722" +checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" dependencies = [ "autocfg", "bytes", @@ -2106,9 +2284,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f" +checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" dependencies = [ "bytes", "futures-core", @@ -2180,9 +2358,9 @@ checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" [[package]] name = "unicode-bidi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" dependencies = [ "matches", ] @@ -2292,6 +2470,7 @@ dependencies = [ "futures", "lazy_static", "log", + "pageserver", "postgres", "postgres-protocol", "rand 0.8.3", @@ -2418,6 +2597,15 @@ dependencies = [ "cc", ] +[[package]] +name = "which" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +dependencies = [ + "libc", +] + [[package]] name = "wildmatch" version = "1.1.0" @@ -2464,6 +2652,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "xattr" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +dependencies = [ + "libc", +] + [[package]] name = "xml-rs" version = "0.8.3" @@ -2474,6 +2671,10 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" name = "zenith" version = "0.1.0" dependencies = [ + "anyhow", "clap", "control_plane", + "pageserver", + "postgres_ffi", + "walkeeper", ] diff --git a/Cargo.toml b/Cargo.toml index f4d6314283..3e9c59ce3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,5 @@ members = [ "walkeeper", "zenith", "control_plane", + "postgres_ffi", ] diff --git a/cli-v2-story.md b/cli-v2-story.md new file mode 100644 index 0000000000..1f213c903b --- /dev/null +++ b/cli-v2-story.md @@ -0,0 +1,188 @@ +Create a new Zenith repository in the current directory: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init + The files belonging to this database system will be owned by user "heikki". + This user must also own the server process. + + The database cluster will be initialized with locale "en_GB.UTF-8". + The default database encoding has accordingly been set to "UTF8". 
+ The default text search configuration will be set to "english". + + Data page checksums are disabled. + + creating directory tmp ... ok + creating subdirectories ... ok + selecting dynamic shared memory implementation ... posix + selecting default max_connections ... 100 + selecting default shared_buffers ... 128MB + selecting default time zone ... Europe/Helsinki + creating configuration files ... ok + running bootstrap script ... ok + performing post-bootstrap initialization ... ok + syncing data to disk ... ok + + initdb: warning: enabling "trust" authentication for local connections + You can change this by editing pg_hba.conf or using the option -A, or + --auth-local and --auth-host, the next time you run initdb. + new zenith repository was created in .zenith + +Initially, there is only one branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + main + +Start a local Postgres instance on the branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432 + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432 + 2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432" + 2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status" + 2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0 + 2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required + 2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections + done + server started + +Run some commands against it: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);" + CREATE TABLE + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + +Create a new branch called 'experimental'. We create it from the +current end of the 'main' branch, but you could specify a different +LSN as the start point instead. + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main + branching at end of WAL: 0/161F478 + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + experimental + main + +Start another Postgres instance off the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... 
+ waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0 + 2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections + done + server started + +Insert some a row on the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + inserted on experimental + (2 rows) + +See that the other Postgres instance is still running on 'main' branch on port 5432: + + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + + + +Everything is stored in the .zenith directory: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/ + total 12 + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines + +The 'datadirs' directory contains the datadirs of the running instances: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/ + total 8 + drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e + drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76 + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/ + total 124 + drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem + -rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf + -rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase + 
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION + lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact + -rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf + -rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf + -rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts + -rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid + +Note how 'pg_wal' is just a symlink to the 'timelines' directory. The +datadir is ephemeral, you can delete it at any time, and it can be reconstructed +from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull +the repository, the 'datadirs' are not included. (They are like git working trees) + + ~/git-sandbox/zenith (cli-v2)$ killall -9 postgres + ~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/* + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0 + 2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections + done + server started + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + inserted on experimental + (2 rows) + diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7281595c18..0d49488bd7 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,12 +8,20 @@ edition = "2018" [dependencies] rand = "0.8.3" +tar = "0.4.33" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } serde = "" serde_derive = "" toml = "" -home = "0.5.3" lazy_static = "" regex = "1" +anyhow = "1.0" +hex = "0.4.3" +bytes = "1.0.1" +fs_extra = "1.2.0" + +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } +postgres_ffi = { path = "../postgres_ffi" } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 5c3ec5e816..8157c62a8b 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,21 +1,24 @@ -use std::error; use std::fs::File; use std::fs::{self, OpenOptions}; +use std::os::unix::fs::PermissionsExt; use std::net::TcpStream; -use std::process::{Command, Stdio}; +use std::process::Command; use std::sync::Arc; use std::time::Duration; use 
std::{collections::BTreeMap, path::PathBuf}; -use std::{io::Write, net::SocketAddr}; +use std::io::{Read, Write}; +use std::net::SocketAddr; -use lazy_static::lazy_static; -use postgres::{Client, NoTls}; use regex::Regex; +use lazy_static::lazy_static; +use tar; +use anyhow::{Context, Result}; -use crate::local_env::{self, LocalEnv}; +use postgres::{Client, NoTls}; + +use crate::local_env::LocalEnv; use crate::storage::{PageServerNode, WalProposerNode}; - -type Result = std::result::Result>; +use pageserver::ZTimelineId; // // ComputeControlPlane @@ -34,14 +37,9 @@ impl ComputeControlPlane { // it is running on default port. Change that when pageserver will have config. let pageserver = Arc::new(PageServerNode::from_env(&env)); - let nodes: Result> = fs::read_dir(env.compute_dir()) - .map_err(|e| { - format!( - "failed to list {}: {}", - env.compute_dir().to_str().unwrap(), - e - ) - })? + let pgdatadirspath = env.repo_path.join("pgdatadirs"); + let nodes: Result> = fs::read_dir(&pgdatadirspath) + .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? .into_iter() .map(|f| { PostgresNode::from_dir_entry(f?, &env, &pageserver) @@ -67,43 +65,46 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - pub fn local(pageserver: &Arc) -> ComputeControlPlane { - let env = local_env::test_env(); + pub fn local(local_env: &LocalEnv, pageserver: &Arc) -> ComputeControlPlane { ComputeControlPlane { base_port: 65431, pageserver: Arc::clone(pageserver), nodes: BTreeMap::new(), - env, + env: local_env.clone(), } } - fn new_vanilla_node(&mut self, is_test: bool) -> Result> { - // allocate new node entry with generated port + // Connect to a page server, get base backup, and untar it to initialize a + // new data directory + pub fn new_from_page_server(&mut self, is_test: bool, timelineid: ZTimelineId) -> Result> { let node_id = self.nodes.len() as u32 + 1; + let node = Arc::new(PostgresNode { name: format!("pg{}", node_id), address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test, + timelineid }); - node.init_vanilla()?; + + node.init_from_page_server()?; self.nodes.insert(node.name.clone(), Arc::clone(&node)); Ok(node) } - pub fn new_test_node(&mut self) -> Arc { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(true).unwrap(); + pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid); + assert!(node.is_ok()); + let node = node.unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -111,9 +112,9 @@ impl ComputeControlPlane { node } - pub fn new_test_master_node(&mut self) -> Arc { - let node = self.new_vanilla_node(true).unwrap(); - println!("Create vanilla node at {:?}", node.address); + pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid).unwrap(); + node.append_conf( "postgresql.conf", "synchronous_standby_names = 'safekeeper_proxy'\n", @@ -122,17 +123,15 @@ impl ComputeControlPlane { node } - pub fn new_node(&mut self) -> Result> { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(false)?; + pub fn 
new_node(&mut self, timelineid: ZTimelineId) -> Result> { + let node = self.new_from_page_server(false, timelineid).unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -149,6 +148,7 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, + timelineid: ZTimelineId, } impl PostgresNode { @@ -158,11 +158,8 @@ impl PostgresNode { pageserver: &Arc, ) -> Result { if !entry.file_type()?.is_dir() { - let err_msg = format!( - "PostgresNode::from_dir_entry failed: '{}' is not a directory", - entry.path().to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("PostgresNode::from_dir_entry failed: '{}' is not a directory", + entry.path().display()); } lazy_static! { @@ -175,13 +172,9 @@ impl PostgresNode { // find out tcp port in config file let cfg_path = entry.path().join("postgresql.conf"); - let config = fs::read_to_string(cfg_path.clone()).map_err(|e| { - format!( - "failed to read config file in {}: {}", - cfg_path.to_str().unwrap(), - e - ) - })?; + let config = fs::read_to_string(cfg_path.clone()) + .with_context(|| format!("failed to read config file in {}", + cfg_path.to_str().unwrap()))?; let err_msg = format!( "failed to find port definition in config file {}", @@ -189,14 +182,20 @@ impl PostgresNode { ); let port: u16 = CONF_PORT_RE .captures(config.as_str()) - .ok_or(err_msg.clone() + " 1")? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))? .iter() .last() - .ok_or(err_msg.clone() + " 3")? - .ok_or(err_msg.clone() + " 3")? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))? .as_str() .parse() - .map_err(|e| format!("{}: {}", err_msg, e))?; + .with_context(|| err_msg)?; + + // FIXME: What timeline is this server on? Would have to parse the postgresql.conf + // file for that, too. 
It's currently not needed for anything, but it would be + // nice to list the timeline in "zenith pg list" + let timelineid_buf = [0u8; 16]; + let timelineid = ZTimelineId::from(timelineid_buf); // ok now Ok(PostgresNode { @@ -205,38 +204,48 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, + timelineid }) } - fn init_vanilla(&self) -> Result<()> { + // Connect to a page server, get base backup, and untar it to initialize a + // new data directory + pub fn init_from_page_server(&self) -> Result<()> { + + let pgdata = self.pgdata(); + println!( - "Creating new postgres: path={} port={}", - self.pgdata().to_str().unwrap(), + "Extracting base backup to create postgres instance: path={} port={}", + pgdata.to_str().unwrap(), self.address.port() ); // initialize data directory - if self.is_test { - fs::remove_dir_all(self.pgdata().to_str().unwrap()).ok(); + fs::remove_dir_all(&pgdata).ok(); } - fs::create_dir_all(self.pgdata().to_str().unwrap())?; + let sql = format!("basebackup {}", self.timelineid); + let mut client = self.pageserver.page_server_psql_client()?; + println!("connected to page server"); - let initdb_path = self.env.pg_bin_dir().join("initdb"); - let initdb = Command::new(initdb_path) - .args(&["-D", self.pgdata().to_str().unwrap()]) - .arg("-N") - .arg("-A trust") - .arg("--no-instructions") - .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) - .status()?; + fs::create_dir_all(&pgdata)?; + fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).unwrap(); - if !initdb.success() { - return Err("initdb failed".into()); - } + // Also create pg_wal directory, it's not included in the tarball + // FIXME: actually, it is currently. + //fs::create_dir_all(pgdata.join("pg_wal"))?; + + let mut copyreader = client.copy_out(sql.as_str())?; + + // FIXME: Currently, we slurp the whole tarball into memory, and then extract it, + // but we really should do this: + //let mut ar = tar::Archive::new(copyreader); + let mut buf = vec![]; + copyreader.read_to_end(&mut buf)?; + println!("got tarball of size {}", buf.len()); + let mut ar = tar::Archive::new(buf.as_slice()); + ar.unpack(&pgdata)?; // listen for selected port self.append_conf( @@ -256,12 +265,33 @@ impl PostgresNode { .as_str(), ); - println!("Database initialized"); + // Never clean up old WAL. TODO: We should use a replication + // slot or something proper, to prevent the compute node + // from removing WAL that hasn't been streamed to the safekeepr or + // page server yet. But this will do for now. + self.append_conf("postgresql.conf", + format!("wal_keep_size='10TB'\n") + .as_str(), + ); + + // Connect it to the page server. 
+ + // Configure that node to take pages from pageserver + self.append_conf("postgresql.conf", + format!("page_server_connstring = 'host={} port={}'\n\ + zenith_timeline='{}'\n", + self.pageserver.address().ip(), + self.pageserver.address().port(), + self.timelineid, + ) + .as_str(), + ); + Ok(()) } - pub fn pgdata(&self) -> PathBuf { - self.env.compute_dir().join(self.name.clone()) + fn pgdata(&self) -> PathBuf { + self.env.repo_path.join("pgdatadirs").join(&self.name) } pub fn status(&self) -> &str { @@ -306,16 +336,13 @@ impl PostgresNode { .status()?; if !pg_ctl.success() { - Err("pg_ctl failed".into()) + anyhow::bail!("pg_ctl failed"); } else { Ok(()) } } pub fn start(&self) -> Result<()> { - let _res = self - .pageserver - .page_server_psql(format!("callmemaybe {}", self.connstr()).as_str()); println!("Starting postgres node at '{}'", self.connstr()); self.pg_ctl(&["start"]) } @@ -405,12 +432,10 @@ impl PostgresNode { .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") - .stderr( - OpenOptions::new() - .append(true) - .open(self.env.data_dir.join("safepkeeper_proxy.log")) - .unwrap(), - ) + .stderr(OpenOptions::new() + .append(true) + .open(self.env.repo_path.join("safepkeeper_proxy.log")) + .unwrap()) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 241fba2f62..ebbcba7f26 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,13 +5,18 @@ // script which will use local paths. // use std::env; -use std::error; use std::fs; use std::path::{Path, PathBuf}; +use std::process::Command; +use bytes::Bytes; +use rand::Rng; +use hex; use serde_derive::{Deserialize, Serialize}; +use anyhow::Result; -type Result = std::result::Result>; +use walkeeper::xlog_utils; +use pageserver::ZTimelineId; // // This data structure represents deserialized zenith config, which should be @@ -21,11 +26,11 @@ type Result = std::result::Result>; // #[derive(Serialize, Deserialize, Clone)] pub struct LocalEnv { - // Here page server and compute nodes will create and store their data. - pub data_dir: PathBuf, + // Path to the Repository. Here page server and compute nodes will create and store their data. + pub repo_path: PathBuf, - // Path to postgres distribution. It expected that "bin", "include", - // "lib", "share" from postgres distribution will be there. If at some point + // Path to postgres distribution. It's expected that "bin", "include", + // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. 
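    // (For orientation, not taken from the patch: in the current workflow this is
    // expected to be the "tmp_install" directory produced by ./pgbuild.sh, so
    // pg_bin_dir() resolves to tmp_install/bin and pg_lib_dir() to tmp_install/lib,
    // matching the checks performed in init() below.)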
pub pg_distrib_dir: PathBuf, @@ -42,53 +47,33 @@ impl LocalEnv { pub fn pg_lib_dir(&self) -> PathBuf { self.pg_distrib_dir.join("lib") } +} - // pageserver - pub fn pageserver_data_dir(&self) -> PathBuf { - self.data_dir.join("pageserver") - } - pub fn pageserver_log(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.log") - } - pub fn pageserver_pidfile(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.pid") - } - - // compute nodes - pub fn compute_dir(&self) -> PathBuf { - self.data_dir.join("compute") +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), } } // -// Issues in rust-lang repo has several discussions about proper library to check -// home directory in a cross-platform way. Seems that current consensus is around -// home crate and cargo uses it. +// Initialize a new Zenith repository // -fn get_home() -> Result { - home::home_dir().ok_or("can not determine home directory path".into()) -} - pub fn init() -> Result<()> { - let home_dir = get_home()?; - // check if config already exists - let cfg_path = home_dir.join(".zenith"); - if cfg_path.exists() { - let err_msg = format!( - "{} already exists. Perhaps already initialized?", - cfg_path.to_str().unwrap() - ); - return Err(err_msg.into()); + let repo_path = PathBuf::from(zenith_repo_dir()); + if repo_path.exists() { + anyhow::bail!("{} already exists. Perhaps already initialized?", + repo_path.to_str().unwrap()); } // Now we can run init only from crate directory, so check that current dir is our crate. // Use 'pageserver/Cargo.toml' existence as evidendce. let cargo_path = env::current_dir()?; if !cargo_path.join("pageserver/Cargo.toml").exists() { - let err_msg = "Current dirrectory does not look like a zenith repo. \ - Please, run 'init' from zenith repo root."; - return Err(err_msg.into()); + anyhow::bail!("Current dirrectory does not look like a zenith repo. \ + Please, run 'init' from zenith repo root."); } // ok, now check that expected binaries are present @@ -97,81 +82,145 @@ pub fn init() -> Result<()> { let pg_distrib_dir = cargo_path.join("tmp_install"); let pg_path = pg_distrib_dir.join("bin/postgres"); if !pg_path.exists() { - let err_msg = format!( - "Can't find postres binary at {}. \ - Perhaps './pgbuild.sh' is needed to build it first.", - pg_path.to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("Can't find postres binary at {}. \ + Perhaps './pgbuild.sh' is needed to build it first.", + pg_path.to_str().unwrap()); } // check pageserver let zenith_distrib_dir = cargo_path.join("target/debug/"); let pageserver_path = zenith_distrib_dir.join("pageserver"); if !pageserver_path.exists() { - let err_msg = format!( - "Can't find pageserver binary at {}. Please build it.", - pageserver_path.to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("Can't find pageserver binary at {}. 
Please build it.", + pageserver_path.to_str().unwrap()); } // ok, we are good to go - - // create dirs - let data_dir = cargo_path.join("tmp_check_cli"); - - for &dir in &["compute", "pageserver"] { - fs::create_dir_all(data_dir.join(dir)).map_err(|e| { - format!( - "Failed to create directory in '{}': {}", - data_dir.to_str().unwrap(), - e - ) - })?; - } - - // write config let conf = LocalEnv { - data_dir, + repo_path: repo_path.clone(), pg_distrib_dir, zenith_distrib_dir, }; + init_repo(&conf)?; + + // write config let toml = toml::to_string(&conf)?; - fs::write(cfg_path, toml)?; + fs::write(repo_path.join("config"), toml)?; + + Ok(()) +} + +pub fn init_repo(local_env: &LocalEnv) -> Result<()> +{ + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + fs::create_dir(&repopath)?; + fs::create_dir(repopath.clone() + "/pgdatadirs")?; + fs::create_dir(repopath.clone() + "/timelines")?; + fs::create_dir(repopath.clone() + "/refs")?; + fs::create_dir(repopath.clone() + "/refs/branches")?; + fs::create_dir(repopath.clone() + "/refs/tags")?; + + // Create empty config file + let configpath = repopath.clone() + "/config"; + fs::write(&configpath, r##" +# Example config file. Nothing here yet. +"##) + .expect(&format!("Unable to write file {}", &configpath)); + + // Create initial timeline + let tli = create_timeline(&local_env, None)?; + let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + + // Run initdb + // + // FIXME: we create it temporarily in "tmp" directory, and move it into + // the repository. Use "tempdir()" or something? Or just create it directly + // in the repo? + let initdb_path = local_env.pg_bin_dir().join("initdb"); + let _initdb = + Command::new(initdb_path) + .args(&["-D", "tmp", "--no-instructions"]) + .status() + .expect("failed to execute initdb"); + + // Read control file to extract the LSN + let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; + + let lsn = controlfile.checkPoint; + let lsnstr = format!("{:016X}", lsn); + + // Move the initial WAL file + fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.clone() + "/wal/000000010000000000000001.partial")?; + + // Remove pg_wal + fs::remove_dir_all("tmp/pg_wal")?; + + force_crash_recovery(&PathBuf::from("tmp"))?; + + let target = timelinedir.clone() + "/snapshots/" + &lsnstr; + fs::rename("tmp", target)?; + + // Create 'main' branch to refer to the initial timeline + let data = hex::encode(tli); + fs::write(repopath.clone() + "/refs/branches/main", data)?; + + println!("new zenith repository was created in {}", &repopath); + Ok(()) +} + + +// If control file says the cluster was shut down cleanly, modify it, to mark +// it as crashed. That forces crash recovery when you start the cluster. +// +// FIXME: +// We currently do this to the initial snapshot in "zenith init". It would +// be more natural to do this when the snapshot is restored instead, but we +// currently don't have any code to create new snapshots, so it doesn't matter +// Or better yet, use a less hacky way of putting the cluster into recovery. +// Perhaps create a backup label file in the data directory when it's restored. 
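// For illustration: a cleanly stopped cluster has pg_control state DB_SHUTDOWNED
// (presumably DBState_DB_SHUTDOWNED in the bindgen-generated constants, by analogy
// with DBState_DB_IN_PRODUCTION used below); rewriting the state to DB_IN_PRODUCTION
// makes the next postgres start run crash recovery and replay WAL from the last
// checkpoint instead of trusting the snapshot as-is.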
+fn force_crash_recovery(datadir: &Path) -> Result<()> { + + // Read in the control file + let mut controlfilepath = datadir.to_path_buf(); + controlfilepath.push("global"); + controlfilepath.push("pg_control"); + let mut controlfile = postgres_ffi::decode_pg_control( + Bytes::from(fs::read(controlfilepath.as_path())?))?; + + controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION; + + fs::write(controlfilepath.as_path(), + postgres_ffi::encode_pg_control(controlfile))?; Ok(()) } // check that config file is present -pub fn load_config() -> Result { - // home - let home_dir = get_home()?; - - // check file exists - let cfg_path = home_dir.join(".zenith"); - if !cfg_path.exists() { - let err_msg = format!( - "Zenith config is not found in {}. You need to run 'zenith init' first", - cfg_path.to_str().unwrap() - ); - return Err(err_msg.into()); +pub fn load_config(repopath: &Path) -> Result { + if !repopath.exists() { + anyhow::bail!("Zenith config is not found in {}. You need to run 'zenith init' first", + repopath.to_str().unwrap()); } // load and parse file - let config = fs::read_to_string(cfg_path)?; + let config = fs::read_to_string(repopath.join("config"))?; toml::from_str(config.as_str()).map_err(|e| e.into()) } // local env for tests -pub fn test_env() -> LocalEnv { - let data_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check"); - fs::create_dir_all(data_dir.clone()).unwrap(); - LocalEnv { - data_dir, +pub fn test_env(testname: &str) -> LocalEnv { + let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check/").join(testname); + + // Remove remnants of old test repo + let _ = fs::remove_dir_all(&repo_path); + + let local_env = LocalEnv { + repo_path, pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"), zenith_distrib_dir: cargo_bin_dir(), - } + }; + init_repo(&local_env).unwrap(); + return local_env; } // Find the directory where the binaries were put (i.e. 
target/debug/) @@ -185,3 +234,103 @@ pub fn cargo_bin_dir() -> PathBuf { return pathbuf; } + +#[derive(Debug, Clone, Copy)] +pub struct PointInTime { + pub timelineid: ZTimelineId, + pub lsn: u64 +} + +fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<[u8; 16]> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + // Create initial timeline + let mut tli = [0u8; 16]; + rand::thread_rng().fill(&mut tli); + + let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + + fs::create_dir(timelinedir.clone())?; + fs::create_dir(timelinedir.clone() + "/snapshots")?; + fs::create_dir(timelinedir.clone() + "/wal")?; + + if let Some(ancestor) = ancestor { + let data = format!("{}@{:X}/{:X}", + hex::encode(ancestor.timelineid.to_str()), + ancestor.lsn >> 32, + ancestor.lsn & 0xffffffff); + fs::write(timelinedir + "/ancestor", data)?; + } + + Ok(tli) +} + +// Parse an LSN in the format used in filenames +// +// For example: 00000000015D3DD8 +// +fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> { + u64::from_str_radix(s, 16) +} + +// Create a new branch in the repository (for the "zenith branch" subcommand) +pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointInTime) -> Result<()> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + // create a new timeline for it + let newtli = create_timeline(local_env, Some(startpoint))?; + let newtimelinedir = format!("{}/timelines/{}", repopath, &hex::encode(newtli)); + + let data = hex::encode(newtli); + fs::write(format!("{}/refs/branches/{}", repopath, branchname), data)?; + + // Copy the latest snapshot (TODO: before the startpoint) and all WAL + // TODO: be smarter and avoid the copying... + let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?; + let copy_opts = fs_extra::dir::CopyOptions::new(); + fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.clone() + "/snapshots", &copy_opts)?; + + let oldtimelinedir = format!("{}/timelines/{}", &repopath, startpoint.timelineid.to_str()); + let mut copy_opts = fs_extra::dir::CopyOptions::new(); + copy_opts.content_only = true; + fs_extra::dir::copy(oldtimelinedir + "/wal/", + newtimelinedir.clone() + "/wal", + &copy_opts)?; + + Ok(()) +} + +// Find the end of valid WAL in a wal directory +pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + let waldir = PathBuf::from(format!("{}/timelines/{}/wal", repopath, timeline.to_str())); + + let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true); + + return Ok(lsn); +} + +// Find the latest snapshot for a timeline +fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + let timelinedir = repopath + "/timelines/" + &timeline.to_str(); + let snapshotsdir = timelinedir.clone() + "/snapshots"; + let paths = fs::read_dir(&snapshotsdir).unwrap(); + let mut maxsnapshot: u64 = 0; + let mut snapshotdir: Option<PathBuf> = None; + for path in paths { + let path = path.unwrap(); + let filename = path.file_name().to_str().unwrap().to_owned(); + if let Ok(lsn) = parse_lsn(&filename) { + maxsnapshot = std::cmp::max(lsn, maxsnapshot); + snapshotdir = Some(path.path()); + } + } + if maxsnapshot == 0 { + // TODO: check ancestor timeline + anyhow::bail!("no snapshot found in {}", snapshotsdir); + } + + Ok((maxsnapshot,
snapshotdir.unwrap())) +} diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index eba2966849..dd935cb4fb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,4 +1,3 @@ -use std::error; use std::fs; use std::io; use std::net::SocketAddr; @@ -9,13 +8,13 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; +use anyhow::Result; use postgres::{Client, NoTls}; +use crate::local_env::LocalEnv; use crate::compute::PostgresNode; -use crate::local_env::{self, LocalEnv}; - -type Result = std::result::Result>; +use pageserver::ZTimelineId; // // Collection of several example deployments useful for tests. @@ -27,63 +26,72 @@ pub struct TestStorageControlPlane { pub wal_acceptors: Vec, pub pageserver: Arc, pub test_done: AtomicBool, + pub repopath: PathBuf, } impl TestStorageControlPlane { + + // Peek into the repository, to grab the timeline ID of given branch + pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId { + + let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname); + + ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() + } + // postgres <-> page_server - pub fn one_page_server(pgdata_base_path: String) -> TestStorageControlPlane { - let env = local_env::test_env(); + // + // Initialize a new repository and configure a page server to run in it + // + pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); - - if pgdata_base_path.is_empty() { - pserver.start().unwrap(); - } else { - pserver.start_fromdatadir(pgdata_base_path).unwrap(); - } + pserver.start().unwrap(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } - pub fn one_page_server_no_start() -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env, + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } // postgres <-> {wal_acceptor1, wal_acceptor2, ...} - pub fn fault_tolerant(redundancy: usize) -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); + let mut cplane = TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }), test_done: AtomicBool::new(false), + repopath: repopath, }; - cplane.pageserver.init(); cplane.pageserver.start().unwrap(); const WAL_ACCEPTOR_PORT: usize = 54321; @@ -93,8 +101,8 @@ impl TestStorageControlPlane { listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i) .parse() .unwrap(), - data_dir: env.data_dir.join(format!("wal_acceptor_{}", i)), - env: env.clone(), + data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)), + env: local_env.clone(), }; wal_acceptor.init(); wal_acceptor.start(); @@ -153,58 +161,46 @@ impl 
PageServerNode { } } - pub fn init(&self) { - fs::create_dir_all(self.env.pageserver_data_dir()).unwrap(); + pub fn repo_path(&self) -> PathBuf { + self.env.repo_path.clone() + } + + pub fn pid_file(&self) -> PathBuf { + self.env.repo_path.join("pageserver.pid") } pub fn start(&self) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); + println!("Starting pageserver at '{}' in {}", self.address(), self.repo_path().display()); - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) + let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver")); + cmd .args(&["-l", self.address().to_string().as_str()]) .arg("-d") .env_clear() + .env("ZENITH_REPO_DIR", self.repo_path()) .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status()?; + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); - if !status.success() { - return Err(Box::::from(format!( - "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + if !cmd.status()?.success() { + anyhow::bail!("Pageserver failed to start. See '{}' for details.", + self.repo_path().join("pageserver.log").display()); } - } - pub fn start_fromdatadir(&self, pgdata_base_path: String) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); - - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) - .arg("-d") - .args(&["--restore-from", "local"]) - .env_clear() - .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("PGDATA_BASE_PATH", pgdata_base_path) - .status()?; - - if !status.success() { - return Err(Box::::from(format!( - "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + // It takes a while for the page server to start up. Wait until it is + // open for business. 
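        // (Descriptive note: the loop below simply retries page_server_psql_client()
        // roughly once per second for up to ~15 seconds; if the page server still
        // is not reachable it falls through and start() returns Ok anyway, so a
        // startup failure only surfaces on the caller's first real query.)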
+ for retries in 1..15 { + let client = self.page_server_psql_client(); + if client.is_ok() { + break; + } else { + println!("page server not responding yet, retrying ({})...", retries); + thread::sleep(Duration::from_secs(1)); + } } + Ok(()) } pub fn stop(&self) -> Result<()> { - let pidfile = self.env.pageserver_pidfile(); + let pidfile = self.pid_file(); let pid = read_pidfile(&pidfile)?; let status = Command::new("kill") @@ -214,10 +210,7 @@ impl PageServerNode { .expect("failed to execute kill"); if !status.success() { - return Err(Box::::from(format!( - "Failed to kill pageserver with pid {}", - pid - ))); + anyhow::bail!("Failed to kill pageserver with pid {}", pid); } // await for pageserver stop @@ -232,10 +225,7 @@ impl PageServerNode { // ok, we failed to stop pageserver, let's panic if !status.success() { - return Err(Box::::from(format!( - "Failed to stop pageserver with pid {}", - pid - ))); + anyhow::bail!("Failed to stop pageserver with pid {}", pid); } else { return Ok(()); } @@ -254,6 +244,17 @@ impl PageServerNode { println!("Pageserver query: '{}'", sql); client.simple_query(sql).unwrap() } + + pub fn page_server_psql_client(&self) -> std::result::Result { + let connstring = format!( + "host={} port={} dbname={} user={}", + self.address().ip(), + self.address().port(), + "no_db", + "no_user", + ); + Client::connect(connstring.as_str(), NoTls) + } } impl Drop for PageServerNode { diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml index b201b1849e..51f9d0c773 100644 --- a/integration_tests/Cargo.toml +++ b/integration_tests/Cargo.toml @@ -12,4 +12,6 @@ rand = "0.8.3" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } control_plane = { path = "../control_plane" } diff --git a/integration_tests/tests/test_compute.rs b/integration_tests/tests/test_compute.rs index 955b7ffa5e..f4cf38432e 100644 --- a/integration_tests/tests/test_compute.rs +++ b/integration_tests/tests/test_compute.rs @@ -1,7 +1,11 @@ // test node resettlement to an empty datadir + +// TODO +/* #[test] fn test_resettlement() {} // test seq scan of everythin after restart #[test] fn test_cold_seqscan() {} +*/ diff --git a/integration_tests/tests/test_control_plane.rs b/integration_tests/tests/test_control_plane.rs index 481cd3d8b3..8724d5fda1 100644 --- a/integration_tests/tests/test_control_plane.rs +++ b/integration_tests/tests/test_control_plane.rs @@ -1,5 +1,8 @@ +// TODO +/* #[test] fn test_actions() {} #[test] fn test_regress() {} +*/ diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index c0959ebdbb..14c328be0e 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -1,23 +1,24 @@ // mod control_plane; use control_plane::compute::ComputeControlPlane; use control_plane::storage::TestStorageControlPlane; - -use std::thread::sleep; -use std::time::Duration; +use control_plane::local_env; +use control_plane::local_env::PointInTime; // XXX: force all redo at the end // -- restart + seqscan won't read deleted stuff // -- pageserver api endpoint to check all rels - -// Handcrafted cases with wal records that are (were) problematic for redo. 
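// Illustrative sketch (not verbatim from the patch): the bootstrap sequence the
// rewritten tests below now share, shown with a hypothetical test name:
//
//     let local_env = local_env::test_env("some_test");
//     let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
//     let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
//     let maintli = storage_cplane.get_branch_timeline("main");
//     let node = compute_cplane.new_test_node(maintli);
//     node.start().unwrap();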
+/* #[test] fn test_redo_cases() { + let local_env = local_env::test_env("test_redo_cases"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); // check basic work with table @@ -47,32 +48,46 @@ fn test_redo_cases() { println!("sum = {}", count); assert_eq!(count, 5000050000); } - +*/ // Runs pg_regress on a compute node #[test] #[ignore] fn test_regress() { + let local_env = local_env::test_env("test_regress"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); control_plane::storage::regress_check(&node); } -// Run two postgres instances on one pageserver +// Run two postgres instances on one pageserver, on different timelines #[test] -fn test_pageserver_multitenancy() { - // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); +fn test_pageserver_two_timelines() { + let local_env = local_env::test_env("test_pageserver_two_timelines"); - // Allocate postgres instance, but don't start - let node1 = compute_cplane.new_test_node(); - let node2 = compute_cplane.new_test_node(); + // Start pageserver that reads WAL directly from that postgres + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); + + let maintli = storage_cplane.get_branch_timeline("main"); + + // Create new branch at the end of 'main' + let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); + local_env::create_branch(&local_env, "experimental", + PointInTime { timelineid: maintli, + lsn: startpoint }).unwrap(); + let experimentaltli = storage_cplane.get_branch_timeline("experimental"); + + // Launch postgres instances on both branches + let node1 = compute_cplane.new_test_node(maintli); + let node2 = compute_cplane.new_test_node(experimentaltli); node1.start().unwrap(); node2.start().unwrap(); @@ -110,36 +125,3 @@ fn test_pageserver_multitenancy() { println!("sum = {}", count); assert_eq!(count, 15000150000); } - -#[test] -fn test_upload_pageserver_local() { - // Init pageserver that reads WAL directly from that postgres - // Don't start yet - - let storage_cplane = TestStorageControlPlane::one_page_server_no_start(); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); - - // init postgres node - let node = 
compute_cplane.new_test_node(); - - //upload data to pageserver & start it - &storage_cplane - .pageserver - .start_fromdatadir(node.pgdata().to_str().unwrap().to_string()) - .unwrap(); - - sleep(Duration::from_secs(10)); - - // start postgres node - node.start().unwrap(); - - // check basic work with table - node.safe_psql( - "postgres", - "CREATE TABLE t(key int primary key, value text)", - ); - node.safe_psql( - "postgres", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ); -} diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index 408f991bb2..316a098afe 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -1,6 +1,7 @@ // Restart acceptors one by one while compute is under the load. use control_plane::compute::ComputeControlPlane; use control_plane::storage::TestStorageControlPlane; +use control_plane::local_env; use rand::Rng; use std::sync::Arc; @@ -9,14 +10,16 @@ use std::{thread, time}; #[test] fn test_acceptors_normal_work() { - // Start pageserver that reads WAL directly from that postgres + let local_env = local_env::test_env("test_acceptors_normal_work"); + const REDUNDANCY: usize = 3; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy @@ -91,17 +94,20 @@ fn test_multitenancy() { // Majority is always alive #[test] fn test_acceptors_restarts() { + let local_env = local_env::test_env("test_acceptors_restarts"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; const FAULT_PROBABILITY: f32 = 0.01; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); let mut rng = rand::thread_rng(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy @@ -150,16 +156,19 @@ fn start_acceptor(cplane: &Arc, no: usize) { // them again and check that nothing was losed. Repeat. 
// N_CRASHES env var #[test] -fn test_acceptors_unavalability() { +fn test_acceptors_unavailability() { + let local_env = local_env::test_env("test_acceptors_unavailability"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 2; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy @@ -226,15 +235,18 @@ fn simulate_failures(cplane: Arc) { // Race condition test #[test] fn test_race_conditions() { + let local_env = local_env::test_env("test_race_conditions"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; - let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(REDUNDANCY)); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY)); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f7f3be7f47..69f6ce61ab 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -14,6 +14,7 @@ regex = "1.4.5" bytes = "1.0.1" byteorder = "1.4.3" fs2 = "0.4.3" +fs_extra = "1.2.0" futures = "0.3.13" lazy_static = "1.4.0" slog-stdlog = "4.1.0" @@ -37,3 +38,7 @@ anyhow = "1.0" crc32c = "0.6.0" walkdir = "2" thiserror = "1.0" +hex = "0.4.3" +tar = "0.4.33" + +postgres_ffi = { path = "../postgres_ffi" } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs new file mode 100644 index 0000000000..76ca3c3377 --- /dev/null +++ b/pageserver/src/basebackup.rs @@ -0,0 +1,197 @@ +use log::*; +use tar::{Builder}; +use std::fmt; +use std::io::Write; +use walkdir::WalkDir; +use regex::Regex; + +use crate::ZTimelineId; + + +pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, snapshotlsn: u64) -> Result<(), std::io::Error> { + let mut ar = Builder::new(write); + + let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn); + let walpath = format!("timelines/{}/wal", timelineid); + + debug!("sending tarball of snapshot in {}", snappath); + //ar.append_dir_all("", &snappath)?; + + for entry in WalkDir::new(&snappath) { + let entry = entry?; + let fullpath = entry.path(); + let relpath = entry.path().strip_prefix(&snappath).unwrap(); + + if relpath.to_str().unwrap() == "" { + continue; + } + + if entry.file_type().is_dir() { + trace!("sending dir {} as {}", fullpath.display(), relpath.display()); + ar.append_dir(relpath, fullpath)?; + } else if entry.file_type().is_symlink() { + error!("ignoring symlink in snapshot dir"); + } else if entry.file_type().is_file() { + + // 
Shared catalogs are exempt + if relpath.starts_with("global/") { + trace!("sending shared catalog {}", relpath.display()); + ar.append_path_with_name(fullpath, relpath)?; + } else if !is_rel_file_path(relpath.to_str().unwrap()) { + trace!("sending {}", relpath.display()); + ar.append_path_with_name(fullpath, relpath)?; + } else { + trace!("not sending {}", relpath.display()); + // FIXME: send all files for now + ar.append_path_with_name(fullpath, relpath)?; + } + } else { + error!("unknown file type: {}", fullpath.display()); + } + } + + // FIXME: also send all the WAL + for entry in std::fs::read_dir(&walpath)? { + let entry = entry?; + let fullpath = &entry.path(); + let relpath = fullpath.strip_prefix(&walpath).unwrap(); + + if !entry.path().is_file() { + continue; + } + + let archive_fname = relpath.to_str().unwrap().clone(); + let archive_fname = archive_fname.strip_suffix(".partial").unwrap_or(&archive_fname); + let archive_path = "pg_wal/".to_owned() + archive_fname; + ar.append_path_with_name(fullpath, archive_path)?; + } + + ar.finish()?; + debug!("all tarred up!"); + Ok(()) +} + + +// formats: +// +// <relnode>_<forkname> +// <relnode>.<segno> +// <relnode>_<forkname>.<segno> + + +#[derive(Debug)] +struct FilePathError { + msg: String, +} + +impl FilePathError { + fn new(msg: &str) -> FilePathError { + FilePathError { + msg: msg.to_string(), + } + } +} + +impl From<core::num::ParseIntError> for FilePathError { + fn from(e: core::num::ParseIntError) -> Self { + return FilePathError { + msg: format!("invalid filename: {}", e), + }; + } +} + +impl fmt::Display for FilePathError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "invalid filename") + } +} + +fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> { + match forkname { + // "main" is not in filenames, it's implicit if the fork name is not present + None => Ok(0), + Some("fsm") => Ok(1), + Some("vm") => Ok(2), + Some("init") => Ok(3), + Some(_) => Err(FilePathError::new("invalid forkname")), + } +} + +fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { + let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap(); + + let caps = re + .captures(fname) + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + + let relnode_str = caps.name("relnode").unwrap().as_str(); + let relnode = u32::from_str_radix(relnode_str, 10)?; + + let forkname_match = caps.name("forkname"); + let forkname = if forkname_match.is_none() { + None + } else { + Some(forkname_match.unwrap().as_str()) + }; + let forknum = forkname_to_forknum(forkname)?; + + let segno_match = caps.name("segno"); + let segno = if segno_match.is_none() { + 0 + } else { + u32::from_str_radix(segno_match.unwrap().as_str(), 10)? + }; + + return Ok((relnode, forknum, segno)); +} + + +fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> { + /* + * Relation data files can be in one of the following directories: + * + * global/ + * shared relations + * + * base/<dboid>/ + * regular relations, default tablespace + * + * pg_tblspc/<tablespace oid>/<version dir>/ + * within a non-default tablespace (the name of the directory + * depends on version) + * + * And the relation data files themselves have a filename like: + * + * <relnode>.<segno>
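 * For example (illustrative values): "global/1262" is a shared catalog,
 * "base/13008/16384" is a relation's main fork in database 13008,
 * "base/13008/16384_fsm" is its free space map fork, and
 * "base/13008/16384.1" is the second 1 GB segment of the main fork.
 *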
+ */ + if let Some(fname) = path.strip_prefix("global/") { + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(dbpath) = path.strip_prefix("base/") { + let mut s = dbpath.split("/"); + let dbnode_str = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + let _dbnode = u32::from_str_radix(dbnode_str, 10)?; + let fname = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + if s.next().is_some() { + return Err(FilePathError::new("invalid relation data file name")); + }; + + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(_) = path.strip_prefix("pg_tblspc/") { + // TODO + return Err(FilePathError::new("tablespaces not supported")); + } else { + return Err(FilePathError::new("invalid relation data file name")); + } +} + +fn is_rel_file_path(path: &str) -> bool { + return parse_rel_file_path(path).is_ok(); +} diff --git a/pageserver/src/bin/cli/main.rs b/pageserver/src/bin/cli/main.rs deleted file mode 100644 index 4aa3269c09..0000000000 --- a/pageserver/src/bin/cli/main.rs +++ /dev/null @@ -1,43 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -pub mod pg; -pub mod snapshot; -pub mod storage; -mod subcommand; - -fn main() -> Result<()> { - let cli_commands = subcommand::ClapCommands { - commands: vec![ - Box::new(pg::PgCmd { - clap_cmd: clap::SubCommand::with_name("pg"), - }), - Box::new(storage::StorageCmd { - clap_cmd: clap::SubCommand::with_name("storage"), - }), - Box::new(snapshot::SnapshotCmd { - clap_cmd: clap::SubCommand::with_name("snapshot"), - }), - ], - }; - - let matches = App::new("zenith") - .about("Zenith CLI") - .version("1.0") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommands(cli_commands.generate()) - .get_matches(); - - if let Some(subcommand) = matches.subcommand_name() { - println!("'git {}' was used", subcommand); - } - - match matches.subcommand() { - ("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?, - ("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?, - ("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?, - ("", None) => println!("No subcommand"), - _ => unreachable!(), - } - Ok(()) -} diff --git a/pageserver/src/bin/cli/pg.rs b/pageserver/src/bin/cli/pg.rs deleted file mode 100644 index 7fe2f86d6c..0000000000 --- a/pageserver/src/bin/cli/pg.rs +++ /dev/null @@ -1,105 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct PgCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for PgCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith compute nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list").about("List existing compute nodes")) - .subcommand( - App::new("create") - .about( - "Create (init) new data directory using given storage and start postgres", - ) - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("storage") - .short("s") - .long("storage") - .takes_value(true) - .help("Name of the storage node to use"), - ) - //TODO should it be just name of uploaded snapshot or some path? 
- .arg( - Arg::with_name("snapshot") - .long("snapshot") - .takes_value(true) - .help("Name of the snapshot to use"), - ) - .arg( - Arg::with_name("nostart") - .long("no-start") - .takes_value(false) - .help("Don't start postgres on the created node"), - ), - ) - .subcommand( - App::new("destroy") - .about("Stop postgres and destroy node's data directory") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("start") - .about("Start postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("replica") - .long("replica") - .takes_value(false) - .help("Start the compute node as replica"), - ), - ) - .subcommand( - App::new("stop") - .about("Stop postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("show") - .about("Show info about the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run PgCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/snapshot.rs b/pageserver/src/bin/cli/snapshot.rs deleted file mode 100644 index 47e608b8e2..0000000000 --- a/pageserver/src/bin/cli/snapshot.rs +++ /dev/null @@ -1,27 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct SnapshotCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for SnapshotCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith snapshots") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true))) - .subcommand(App::new("destroy")) - .subcommand(App::new("start")) - .subcommand(App::new("stop")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run SnapshotCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/storage.rs b/pageserver/src/bin/cli/storage.rs deleted file mode 100644 index 71ca61e905..0000000000 --- a/pageserver/src/bin/cli/storage.rs +++ /dev/null @@ -1,25 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -use crate::subcommand; - -pub struct StorageCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for StorageCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith storage nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - .subcommand(App::new("attach")) - .subcommand(App::new("detach")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run StorageCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/subcommand.rs b/pageserver/src/bin/cli/subcommand.rs deleted file mode 100644 index 6a9e7363b9..0000000000 --- a/pageserver/src/bin/cli/subcommand.rs +++ /dev/null @@ -1,29 +0,0 @@ -use anyhow::Result; - -/// All subcommands need to implement this interface. 
-pub trait SubCommand { - /// Generates the cli-config that Clap requires for the subcommand. - fn gen_clap_command(&self) -> clap::App; - - /// Runs the body of the subcommand. - fn run(&self, args: clap::ArgMatches) -> Result<()>; -} - -/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must -/// implement the `SubCommand` trait, but other than that, can be of any type. -pub struct ClapCommands { - pub commands: Vec>, -} - -impl ClapCommands { - /// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in - /// order to generate the full CLI. - pub fn generate(&self) -> Vec { - let mut v: Vec = Vec::new(); - - for command in self.commands.iter() { - v.push(command.gen_clap_command()); - } - v - } -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0ef258ad6c..b98cca4ca1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,10 +5,9 @@ use log::*; use std::fs; use std::io; -use std::path::PathBuf; use std::process::exit; use std::thread; -use std::{fs::File, fs::OpenOptions}; +use std::fs::{File, OpenOptions}; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -17,25 +16,21 @@ use daemonize::Daemonize; use slog::Drain; use pageserver::page_service; -use pageserver::restore_datadir; -use pageserver::restore_s3; use pageserver::tui; -use pageserver::walreceiver; +//use pageserver::walreceiver; use pageserver::PageServerConf; +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} + fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") - .arg(Arg::with_name("datadir") - .short("D") - .long("dir") - .takes_value(true) - .help("Path to the page server data directory")) - .arg(Arg::with_name("wal_producer") - .short("w") - .long("wal-producer") - .takes_value(true) - .help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')")) .arg(Arg::with_name("listen") .short("l") .long("listen") @@ -51,25 +46,14 @@ fn main() -> Result<()> { .long("daemonize") .takes_value(false) .help("Run in the background")) - .arg(Arg::with_name("restore_from") - .long("restore-from") - .takes_value(true) - .help("Upload data from s3 or datadir")) .get_matches(); let mut conf = PageServerConf { - data_dir: PathBuf::from("./"), daemonize: false, interactive: false, - wal_producer_connstr: None, - listen_addr: "127.0.0.1:5430".parse().unwrap(), - restore_from: String::new(), + listen_addr: "127.0.0.1:5430".parse().unwrap() }; - if let Some(dir) = arg_matches.value_of("datadir") { - conf.data_dir = PathBuf::from(dir); - } - if arg_matches.is_present("daemonize") { conf.daemonize = true; } @@ -83,14 +67,6 @@ fn main() -> Result<()> { exit(1); } - if let Some(restore_from) = arg_matches.value_of("restore_from") { - conf.restore_from = String::from(restore_from); - } - - if let Some(addr) = arg_matches.value_of("wal_producer") { - conf.wal_producer_connstr = Some(String::from(addr)); - } - if let Some(addr) = arg_matches.value_of("listen") { conf.listen_addr = addr.parse()?; } @@ -125,19 +101,25 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { if conf.daemonize { info!("daemonizing..."); - // There shouldn't be any logging to stdin/stdout. 
Redirect it to the main log so + let repodir = zenith_repo_dir(); + + // There should'n be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fprintf's or backtraces. - let log_filename = conf.data_dir.join("pageserver.log"); + let log_filename = repodir.clone() + "pageserver.log"; let stdout = OpenOptions::new() .create(true) .append(true) .open(&log_filename) - .with_context(|| format!("failed to open {:?}", log_filename))?; - let stderr = stdout.try_clone()?; + .with_context(|| format!("failed to open {:?}", &log_filename))?; + let stderr = OpenOptions::new() + .create(true) + .append(true) + .open(&log_filename) + .with_context(|| format!("failed to open {:?}", &log_filename))?; let daemonize = Daemonize::new() - .pid_file(conf.data_dir.join("pageserver.pid")) - .working_directory(conf.data_dir.clone()) + .pid_file(repodir.clone() + "/pageserver.pid") + .working_directory(repodir) .stdout(stdout) .stderr(stderr); @@ -146,24 +128,21 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { Err(e) => error!("Error, {}", e), } } + else + { + // change into the repository directory. In daemon mode, Daemonize + // does this for us. + let repodir = zenith_repo_dir(); + std::env::set_current_dir(&repodir)?; + info!("Changed current directory to repository in {}", &repodir); + } let mut threads = Vec::new(); - info!("starting... {}", conf.restore_from); - - // Before opening up for connections, restore the latest base backup from S3. - // (We don't persist anything to local disk at the moment, so we need to do - // this at every startup) - if conf.restore_from.eq("s3") { - info!("restore-from s3..."); - restore_s3::restore_main(&conf); - } else if conf.restore_from.eq("local") { - info!("restore-from local..."); - restore_datadir::restore_main(&conf); - } + // TODO: Check that it looks like a valid repository before going further // Create directory for wal-redo datadirs - match fs::create_dir(conf.data_dir.join("wal-redo")) { + match fs::create_dir("wal-redo") { Ok(_) => {} Err(e) => match e.kind() { io::ErrorKind::AlreadyExists => {} @@ -173,25 +152,6 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { }, } - // Launch the WAL receiver thread if pageserver was started with --wal-producer - // option. It will try to connect to the WAL safekeeper, and stream the WAL. If - // the connection is lost, it will reconnect on its own. We just fire and forget - // it here. - // - // All other wal receivers are started on demand by "callmemaybe" command - // sent to pageserver. - if let Some(wal_producer) = &conf.wal_producer_connstr { - let conf_copy = conf.clone(); - let wal_producer = wal_producer.clone(); - let walreceiver_thread = thread::Builder::new() - .name("static WAL receiver thread".into()) - .spawn(move || { - walreceiver::thread_main(&conf_copy, &wal_producer); - }) - .unwrap(); - threads.push(walreceiver_thread); - } - // GetPage@LSN requests are served by another thread. (It uses async I/O, // but the code in page_service sets up it own thread pool for that) let conf_copy = conf.clone(); @@ -220,7 +180,7 @@ fn init_logging(conf: &PageServerConf) -> Result Result, pub listen_addr: SocketAddr, - pub restore_from: String, +} + +// Zenith Timeline ID is a 32-byte random ID. 
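// (The 32 refers to the hex form: the id is 16 random bytes, rendered as a
// 32-character hex string. For example, the illustrative id
// "11223344556677889900aabbccddeeff" round-trips through from_str() and
// to_str() below.)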
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct ZTimelineId([u8; 16]); + +impl ZTimelineId { + + pub fn from_str(s: &str) -> Result { + let timelineid = hex::decode(s)?; + + let mut buf: [u8; 16] = [0u8; 16]; + buf.copy_from_slice(timelineid.as_slice()); + Ok(ZTimelineId(buf)) + } + + pub fn from(b: [u8; 16]) -> ZTimelineId { + ZTimelineId(b) + } + + pub fn to_str(self: &ZTimelineId) -> String { + hex::encode(self.0) + } +} + +impl std::fmt::Display for ZTimelineId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.to_str()) + } } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index db0a33b55b..20b3460d8c 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -7,6 +7,8 @@ // use crate::{walredo, PageServerConf}; +use crate::restore_local_repo::restore_timeline; +use crate::ZTimelineId; use anyhow::bail; use bytes::Bytes; use core::ops::Bound::Included; @@ -107,30 +109,49 @@ struct PageCacheShared { } lazy_static! { - pub static ref PAGECACHES: Mutex>> = Mutex::new(HashMap::new()); + pub static ref PAGECACHES: Mutex>> = Mutex::new(HashMap::new()); } -pub fn get_pagecache(conf: &PageServerConf, sys_id: u64) -> Arc { +// Get Page Cache for given timeline. It is assumed to already exist. +pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option> { + let pcaches = PAGECACHES.lock().unwrap(); + + match pcaches.get(&timelineid) { + Some(pcache) => Some(pcache.clone()), + None => None + } +} + +pub fn get_or_restore_pagecache(conf: &PageServerConf, timelineid: ZTimelineId) -> anyhow::Result> { let mut pcaches = PAGECACHES.lock().unwrap(); - if !pcaches.contains_key(&sys_id) { - pcaches.insert(sys_id, Arc::new(init_page_cache())); + match pcaches.get(&timelineid) { + Some(pcache) => Ok(pcache.clone()), + None => { + let pcache = init_page_cache(); - // Initialize the WAL redo thread - // - // Now join_handle is not saved any where and we won'try restart tharead - // if it is dead. We may later stop that treads after some inactivity period - // and restart them on demand. - let conf = conf.clone(); - let _walredo_thread = thread::Builder::new() - .name("WAL redo thread".into()) - .spawn(move || { - walredo::wal_redo_main(&conf, sys_id); - }) - .unwrap(); + restore_timeline(conf, &pcache, timelineid)?; + + let result = Arc::new(pcache); + + pcaches.insert(timelineid, result.clone()); + + // Initialize the WAL redo thread + // + // Now join_handle is not saved any where and we won'try restart tharead + // if it is dead. We may later stop that treads after some inactivity period + // and restart them on demand. 
+ let conf_copy = conf.clone(); + let _walredo_thread = thread::Builder::new() + .name("WAL redo thread".into()) + .spawn(move || { + walredo::wal_redo_main(&conf_copy, timelineid); + }) + .unwrap(); + + return Ok(result); + } } - - pcaches.get(&sys_id).unwrap().clone() } fn init_page_cache() -> PageCache { @@ -429,7 +450,8 @@ impl PageCache { // Adds a WAL record to the page cache // pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) { - let key = CacheKey { tag, lsn: rec.lsn }; + let lsn = rec.lsn; + let key = CacheKey { tag, lsn }; let entry = CacheEntry::new(key.clone()); entry.content.lock().unwrap().wal_record = Some(rec); @@ -447,13 +469,14 @@ impl PageCache { *rel_entry = tag.blknum + 1; } - trace!("put_wal_record lsn: {}", key.lsn); + //trace!("put_wal_record lsn: {}", lsn); let oldentry = shared.pagecache.insert(key, Arc::new(entry)); self.num_entries.fetch_add(1, Ordering::Relaxed); if !oldentry.is_none() { - error!("overwriting WAL record in page cache"); + error!("overwriting WAL record with LSN {:X}/{:X} in page cache", + lsn >> 32, lsn & 0xffffffff); } self.num_wal_records.fetch_add(1, Ordering::Relaxed); @@ -486,12 +509,17 @@ impl PageCache { let mut shared = self.shared.lock().unwrap(); // Can't move backwards. - assert!(lsn >= shared.last_valid_lsn); + let oldlsn = shared.last_valid_lsn; + if lsn >= oldlsn { - shared.last_valid_lsn = lsn; - self.valid_lsn_condvar.notify_all(); + shared.last_valid_lsn = lsn; + self.valid_lsn_condvar.notify_all(); - self.last_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + } else { + warn!("attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})", + oldlsn >> 32, oldlsn & 0xffffffff, lsn >> 32, lsn & 0xffffffff); + } } // diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9ff0b2cf46..cc972f713e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,29 +7,43 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. 
-// *callmemaybe $url* -- ask pageserver to start walreceiver on $url +// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use byteorder::{BigEndian, ByteOrder}; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; use log::*; use std::io; use std::thread; +use std::sync::Arc; +use regex::Regex; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; use tokio::net::{TcpListener, TcpStream}; use tokio::runtime; +use tokio::runtime::Runtime; use tokio::task; +use tokio::sync::mpsc; use crate::page_cache; +use crate::restore_local_repo; +use crate::basebackup; use crate::walreceiver; use crate::PageServerConf; +use crate::ZTimelineId; + type Result = std::result::Result; #[derive(Debug)] enum FeMessage { StartupMessage(FeStartupMessage), - Query(FeQueryMessage), + Query(FeQueryMessage), // Simple query + Parse(FeParseMessage), // Extended query protocol + Describe(FeDescribeMessage), + Bind(FeBindMessage), + Execute(FeExecuteMessage), + Close(FeCloseMessage), + Sync, Terminate, // @@ -49,6 +63,11 @@ enum BeMessage { AuthenticationOk, ReadyForQuery, RowDescription, + ParseComplete, + ParameterDescription, + NoData, + BindComplete, + CloseComplete, DataRow, CommandComplete, ControlFile, @@ -145,6 +164,180 @@ struct FeQueryMessage { body: Bytes, } +// We only support the simple case of Parse on unnamed prepared statement and +// no params +#[derive(Debug)] +struct FeParseMessage { + query_string: Bytes, +} + +fn read_null_terminated(buf: &mut Bytes) -> Result +{ + let mut result = BytesMut::new(); + + loop { + if !buf.has_remaining() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "no null-terminator in string", + )); + } + + let byte = buf.get_u8(); + + if byte == 0 { + break; + } + result.put_u8(byte); + } + return Ok(result.freeze()); +} + +impl FeParseMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let _pstmt_name = read_null_terminated(&mut buf)?; + let query_string = read_null_terminated(&mut buf)?; + let nparams = buf.get_i16(); + + // FIXME: the rust-postgres driver uses a named prepared statement + // for copy_out(). We're not prepared to handle that correctly. For + // now, just ignore the statement name, assuming that the client never + // uses more than one prepared statement at a time. + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented in Parse", + )); + } + */ + + if nparams != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "query params not implemented", + )); + } + + + Ok(FeMessage::Parse(FeParseMessage {query_string})) + } +} + +#[derive(Debug)] +struct FeDescribeMessage { + kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal. 
+ // we only support unnamed prepared stmt or portal +} + +impl FeDescribeMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let kind = buf.get_u8(); + let _pstmt_name = read_null_terminated(&mut buf)?; + + // FIXME: see FeParseMessage::parse + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented in Describe", + )); + } + */ + + if kind != 0x53 { // 'S' + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "only prepared statmement Describe is implemented", + )); + } + + + Ok(FeMessage::Describe(FeDescribeMessage {kind})) + } +} + +// we only support unnamed prepared stmt or portal +#[derive(Debug)] +struct FeExecuteMessage { + maxrows: i32// max # of rows +} + +impl FeExecuteMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let portal_name = read_null_terminated(&mut buf)?; + let maxrows = buf.get_i32(); + + if portal_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named portals not implemented", + )); + } + + if maxrows != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "row limit in Execute message not supported", + )); + } + + Ok(FeMessage::Execute(FeExecuteMessage {maxrows})) + } +} + +// we only support unnamed prepared stmt and portal +#[derive(Debug)] +struct FeBindMessage { +} + +impl FeBindMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let portal_name = read_null_terminated(&mut buf)?; + let _pstmt_name = read_null_terminated(&mut buf)?; + + if portal_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named portals not implemented", + )); + } + + // FIXME: see FeParseMessage::parse + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented", + )); + } + */ + + Ok(FeMessage::Bind(FeBindMessage {})) + } +} + +// we only support unnamed prepared stmt and portal +#[derive(Debug)] +struct FeCloseMessage { +} + +impl FeCloseMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let _kind = buf.get_u8(); + let _pstmt_or_portal_name = read_null_terminated(&mut buf)?; + + // FIXME: we do nothing with Close + + Ok(FeMessage::Close(FeCloseMessage {})) + } +} + impl FeMessage { pub fn parse(buf: &mut BytesMut) -> Result> { if buf.len() < 5 { @@ -173,10 +366,18 @@ impl FeMessage { let mut body = buf.split_to(total_len); body.advance(5); + let mut body = body.freeze(); + match tag { b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { - body: body.freeze(), + body: body, }))), + b'P' => Ok(Some(FeParseMessage::parse(body)?)), + b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), + b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), + b'B' => Ok(Some(FeBindMessage::parse(body)?)), + b'C' => Ok(Some(FeCloseMessage::parse(body)?)), + b'S' => Ok(Some(FeMessage::Sync)), b'X' => Ok(Some(FeMessage::Terminate)), b'd' => { let smgr_tag = body.get_u8(); @@ -228,13 +429,15 @@ pub fn thread_main(conf: &PageServerConf) { info!("Starting page server on {}", conf.listen_addr); - runtime.block_on(async { + let runtime_ref = Arc::new(runtime); + + runtime_ref.clone().block_on(async { let listener = TcpListener::bind(conf.listen_addr).await.unwrap(); loop { let (socket, peer_addr) = listener.accept().await.unwrap(); debug!("accepted connection from {}", peer_addr); - let mut conn_handler = Connection::new(conf.clone(), socket); + let mut 
conn_handler = Connection::new(conf.clone(), socket, &runtime_ref); task::spawn(async move { if let Err(err) = conn_handler.run().await { @@ -251,15 +454,17 @@ struct Connection { buffer: BytesMut, init_done: bool, conf: PageServerConf, + runtime: Arc, } impl Connection { - pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection { + pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc) -> Connection { Connection { stream: BufWriter::new(socket), buffer: BytesMut::with_capacity(10 * 1024), init_done: false, conf, + runtime: runtime.clone(), } } @@ -307,6 +512,33 @@ impl Connection { self.stream.write_u8(b'I').await?; } + BeMessage::ParseComplete => { + self.stream.write_u8(b'1').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::BindComplete => { + self.stream.write_u8(b'2').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::CloseComplete => { + self.stream.write_u8(b'3').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::NoData => { + self.stream.write_u8(b'n').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::ParameterDescription => { + self.stream.write_u8(b't').await?; + self.stream.write_i32(6).await?; + // we don't support params, so always 0 + self.stream.write_i16(0).await?; + } + BeMessage::RowDescription => { // XXX let mut b = Bytes::from("data\0"); @@ -396,8 +628,12 @@ impl Connection { } async fn run(&mut self) -> Result<()> { + + let mut unnamed_query_string = Bytes::new(); loop { - match self.read_message().await? { + let msg = self.read_message().await?; + info!("got message {:?}", msg); + match msg { Some(FeMessage::StartupMessage(m)) => { trace!("got message {:?}", m); @@ -417,7 +653,27 @@ impl Connection { } } Some(FeMessage::Query(m)) => { - self.process_query(&m).await?; + self.process_query(m.body).await?; + } + Some(FeMessage::Parse(m)) => { + unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete).await?; + } + Some(FeMessage::Describe(_)) => { + self.write_message_noflush(&BeMessage::ParameterDescription).await?; + self.write_message(&BeMessage::NoData).await?; + } + Some(FeMessage::Bind(_)) => { + self.write_message(&BeMessage::BindComplete).await?; + } + Some(FeMessage::Close(_)) => { + self.write_message(&BeMessage::CloseComplete).await?; + } + Some(FeMessage::Execute(_)) => { + self.process_query(unnamed_query_string.clone()).await?; + } + Some(FeMessage::Sync) => { + self.write_message(&BeMessage::ReadyForQuery).await?; } Some(FeMessage::Terminate) => { break; @@ -426,7 +682,8 @@ impl Connection { info!("connection closed"); break; } - _ => { + x => { + error!("unexpected message type : {:?}", x); return Err(io::Error::new(io::ErrorKind::Other, "unexpected message")); } } @@ -435,41 +692,59 @@ impl Connection { Ok(()) } - async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> { - trace!("got query {:?}", q.body); + async fn process_query(&mut self, query_string: Bytes) -> Result<()> { + debug!("process query {:?}", query_string); - if q.body.starts_with(b"controlfile") { + // remove null terminator, if any + let mut query_string = query_string.clone(); + if query_string.last() == Some(&0) { + query_string.truncate(query_string.len() - 1); + } + + if query_string.starts_with(b"controlfile") { self.handle_controlfile().await - } else if q.body.starts_with(b"pagestream ") { - let (_l, r) = q.body.split_at("pagestream ".len()); - let mut r = r.to_vec(); - r.pop(); - let sysid = String::from_utf8(r).unwrap().trim().to_string(); - let sysid: u64 = 
sysid.parse().unwrap(); // XXX + } else if query_string.starts_with(b"pagestream ") { + let (_l, r) = query_string.split_at("pagestream ".len()); + let timelineid_str = String::from_utf8(r.to_vec()).unwrap(); + let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap(); - self.handle_pagerequests(sysid).await - } else if q.body.starts_with(b"callmemaybe ") { - let (_l, r) = q.body.split_at("callmemaybe ".len()); - let mut r = r.to_vec(); - r.pop(); - let connstr = String::from_utf8(r).unwrap().trim().to_string(); + self.handle_pagerequests(timelineid).await + } else if query_string.starts_with(b"basebackup ") { + let (_l, r) = query_string.split_at("basebackup ".len()); + let r = r.to_vec(); + let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end()); + info!("got basebackup command: \"{}\"", timelineid_str); + let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap(); - let conf_copy = self.conf.clone(); - let _walreceiver_thread = thread::Builder::new() - .name("WAL receiver thread".into()) - .spawn(move || { - walreceiver::thread_main(&conf_copy, &connstr); - }) - .unwrap(); + // Check that the timeline exists + self.handle_basebackup_request(timelineid).await?; + self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message(&BeMessage::ReadyForQuery).await + } else if query_string.starts_with(b"callmemaybe ") { + let query_str = String::from_utf8(query_string.to_vec()).unwrap().to_string(); + + // callmemaybe + let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap(); + let caps = re.captures(&query_str); + let caps = caps.unwrap(); + + let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str().clone()).unwrap(); + let connstr: String = String::from(caps.get(2).unwrap().as_str()); + + // Check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid))); + } + + walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr); - // generic ack: - self.write_message_noflush(&BeMessage::RowDescription) - .await?; - self.write_message_noflush(&BeMessage::DataRow).await?; self.write_message_noflush(&BeMessage::CommandComplete) .await?; self.write_message(&BeMessage::ReadyForQuery).await - } else if q.body.starts_with(b"status") { + } else if query_string.starts_with(b"status") { self.write_message_noflush(&BeMessage::RowDescription) .await?; self.write_message_noflush(&BeMessage::DataRow).await?; @@ -495,7 +770,17 @@ impl Connection { self.write_message(&BeMessage::ReadyForQuery).await } - async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> { + async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> { + + // Check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested pagestream on timeline {} which does not exist in page server", timelineid))); + } + let pcache = pcache.unwrap(); + /* switch client to COPYBOTH */ self.stream.write_u8(b'W').await?; self.stream.write_i32(4 + 1 + 2).await?; @@ -503,13 +788,11 @@ impl Connection { self.stream.write_i16(0).await?; /* numAttributes */ self.stream.flush().await?; - let pcache = page_cache::get_pagecache(&self.conf, sysid); - loop { let message = 
self.read_message().await?; if let Some(m) = &message { - info!("query({}): {:?}", sysid, m); + info!("query({:?}): {:?}", timelineid, m); }; if message.is_none() { @@ -628,4 +911,102 @@ impl Connection { } } } + + async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> { + // check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested basebackup on timeline {} which does not exist in page server", timelineid))); + } + + /* switch client to COPYOUT */ + let stream = &mut self.stream; + stream.write_u8(b'H').await?; + stream.write_i32(4 + 1 + 2).await?; + stream.write_u8(0).await?; /* copy_is_binary */ + stream.write_i16(0).await?; /* numAttributes */ + stream.flush().await?; + info!("sent CopyOut"); + + /* Send a tarball of the latest snapshot on the timeline */ + + // find latest snapshot + let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap(); + + // Stream it + let (s, mut r) = mpsc::channel(5); + + let f_tar = task::spawn_blocking(move || { + basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?; + Ok(()) + }); + let f_tar2 = async { + let joinres = f_tar.await; + + if joinres.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + joinres.unwrap_err() + )); + } + return joinres.unwrap(); + }; + + let f_pump = async move { + loop { + let buf = r.recv().await; + if buf.is_none() { + break; + } + let mut buf = buf.unwrap(); + + // CopyData + stream.write_u8(b'd').await?; + stream.write_u32((4 + buf.len()) as u32).await?; + stream.write_all(&mut buf).await?; + trace!("CopyData sent for {} bytes!", buf.len()); + + // FIXME: flush isn't really required, but makes it easier + // to view in wireshark + stream.flush().await?; + } + Ok(()) + }; + + tokio::try_join!(f_tar2, f_pump)?; + + // CopyDone + self.stream.write_u8(b'c').await?; + self.stream.write_u32(4).await?; + self.stream.flush().await?; + debug!("CopyDone sent!"); + + // FIXME: I'm getting an error from the tokio copyout driver without this. + // I think it happens when the CommandComplete, CloseComplete and ReadyForQuery + // are sent in the same TCP packet as the CopyDone. I don't understand why. + thread::sleep(std::time::Duration::from_secs(1)); + + Ok(()) + } +} + +struct CopyDataSink(mpsc::Sender); + +impl std::io::Write for CopyDataSink { + fn write(&mut self, data: &[u8]) -> std::result::Result { + + let buf = Bytes::copy_from_slice(data); + + if let Err(e) = self.0.blocking_send(buf) { + return Err(io::Error::new(io::ErrorKind::Other, e)); + } + + Ok(data.len()) + } + fn flush(&mut self) -> std::result::Result<(), std::io::Error> { + // no-op + Ok(()) + } } diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs deleted file mode 100644 index 3b4f303bbc..0000000000 --- a/pageserver/src/restore_datadir.rs +++ /dev/null @@ -1,339 +0,0 @@ -// -// Restore chunks from S3 -// -// This runs once at Page Server startup. It loads all the "base images" from -// S3 into the in-memory page cache. It also initializes the "last valid LSN" -// in the page cache to the LSN of the base image, so that when the WAL receiver -// is started, it starts streaming from that LSN. 
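As a client-side illustration of the basebackup command handled above: a minimal sketch (not part of this patch) that pulls the snapshot tarball over COPY OUT with the blocking `postgres` crate, which the extended-protocol messages added in this commit are meant to accommodate. The connection parameters and the timeline id below are made-up placeholders.

use postgres::{Client, NoTls};
use std::io::Read;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder address; point this at the page server's listen_addr.
    let mut client = Client::connect("host=127.0.0.1 port=64000 user=zenith", NoTls)?;

    // The page service interprets the query text itself as a command:
    // "basebackup <timelineid>" (the timeline id here is a placeholder).
    let mut reader = client.copy_out("basebackup 11223344556677881122334455667788")?;

    // The server streams a tarball of the latest snapshot as CopyData messages.
    let mut tarball = Vec::new();
    reader.read_to_end(&mut tarball)?;
    println!("received {} bytes of base backup tar data", tarball.len());
    Ok(())
}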
-// - -use bytes::{Buf, BytesMut}; -use log::*; -use regex::Regex; -use std::env; -use std::fmt; - -use tokio::runtime; - -use futures::future; - -use crate::{page_cache, pg_constants, PageServerConf}; -use std::fs; -use walkdir::WalkDir; - -pub fn restore_main(conf: &PageServerConf) { - // Create a new thread pool - let runtime = runtime::Runtime::new().unwrap(); - - runtime.block_on(async { - let result = restore_chunk(conf).await; - - match result { - Ok(_) => { - return; - } - Err(err) => { - error!("error: {}", err); - return; - } - } - }); -} - -async fn restore_chunk(conf: &PageServerConf) -> Result<(), FilePathError> { - let pgdata_base_path = env::var("PGDATA_BASE_PATH").unwrap(); - info!("Restoring from local dir..."); - - let sys_id: u64 = 42; - let control_lsn = 0; //TODO get it from sysid - let mut slurp_futures: Vec<_> = Vec::new(); - - for e in WalkDir::new(pgdata_base_path.clone()) { - let entry = e.unwrap(); - - if !entry.path().is_dir() { - let path = entry.path().to_str().unwrap(); - - let relpath = path - .strip_prefix(&format!("{}/", pgdata_base_path)) - .unwrap(); - info!( - "Restoring file {} relpath {}", - entry.path().display(), - relpath - ); - - let parsed = parse_rel_file_path(&relpath); - - match parsed { - Ok(mut p) => { - p.lsn = control_lsn; - - let f = slurp_base_file(conf, sys_id, path.to_string(), p); - - slurp_futures.push(f); - } - Err(e) => { - warn!("unrecognized file: {} ({})", relpath, e); - } - }; - } - } - - let pcache = page_cache::get_pagecache(conf, sys_id); - pcache.init_valid_lsn(control_lsn); - - info!("{} files to restore...", slurp_futures.len()); - - future::join_all(slurp_futures).await; - info!("restored!"); - Ok(()) -} - -#[derive(Debug)] -struct FilePathError { - msg: String, -} - -impl FilePathError { - fn new(msg: &str) -> FilePathError { - FilePathError { - msg: msg.to_string(), - } - } -} - -impl From for FilePathError { - fn from(e: core::num::ParseIntError) -> Self { - return FilePathError { - msg: format!("invalid filename: {}", e), - }; - } -} - -impl fmt::Display for FilePathError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "invalid filename") - } -} - -fn forkname_to_forknum(forkname: Option<&str>) -> Result { - match forkname { - // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(0), - Some("fsm") => Ok(1), - Some("vm") => Ok(2), - Some("init") => Ok(3), - Some(_) => Err(FilePathError::new("invalid forkname")), - } -} - -#[derive(Debug)] -struct ParsedBaseImageFileName { - pub spcnode: u32, - pub dbnode: u32, - pub relnode: u32, - pub forknum: u32, - pub segno: u32, - - pub lsn: u64, -} - -// formats: -// -// _ -// . -// _. -fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> { - let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); - - let caps = re - .captures(fname) - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - - let relnode_str = caps.name("relnode").unwrap().as_str(); - let relnode = u32::from_str_radix(relnode_str, 10)?; - - let forkname_match = caps.name("forkname"); - let forkname = if forkname_match.is_none() { - None - } else { - Some(forkname_match.unwrap().as_str()) - }; - let forknum = forkname_to_forknum(forkname)?; - - let segno_match = caps.name("segno"); - let segno = if segno_match.is_none() { - 0 - } else { - u32::from_str_radix(segno_match.unwrap().as_str(), 10)? 
- }; - return Ok((relnode, forknum, segno, 0)); -} - -fn parse_rel_file_path(path: &str) -> Result { - /* - * Relation data files can be in one of the following directories: - * - * global/ - * shared relations - * - * base// - * regular relations, default tablespace - * - * pg_tblspc/// - * within a non-default tablespace (the name of the directory - * depends on version) - * - * And the relation data files themselves have a filename like: - * - * . - */ - if let Some(fname) = path.strip_prefix("global/") { - if fname.contains("pg_control") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_CONTROLFILE_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - if fname.contains("pg_filenode") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_FILENODEMAP_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - let (relnode, forknum, segno, lsn) = parse_filename(fname)?; - - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode, - forknum, - segno, - lsn, - }); - } else if let Some(dbpath) = path.strip_prefix("base/") { - let mut s = dbpath.split("/"); - let dbnode_str = s - .next() - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - let dbnode = u32::from_str_radix(dbnode_str, 10)?; - let fname = s - .next() - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - if s.next().is_some() { - return Err(FilePathError::new("invalid relation data file name")); - }; - - if fname.contains("pg_filenode") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode: dbnode, - relnode: 0, - forknum: pg_constants::PG_FILENODEMAP_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - let (relnode, forknum, segno, lsn) = parse_filename(fname)?; - - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode, - relnode, - forknum, - segno, - lsn, - }); - } else if let Some(fname) = path.strip_prefix("pg_xact/") { - return Ok(ParsedBaseImageFileName { - spcnode: 0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_XACT_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(fname) = path.strip_prefix("pg_multixact/members/") { - return Ok(ParsedBaseImageFileName { - spcnode: 0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(fname) = path.strip_prefix("pg_multixact/offsets/") { - return Ok(ParsedBaseImageFileName { - spcnode: 0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(_) = path.strip_prefix("pg_tblspc/") { - // TODO - return Err(FilePathError::new("tablespaces not supported")); - } else { - return Err(FilePathError::new("invalid relation data file name")); - } -} - -async fn slurp_base_file( - conf: &PageServerConf, - sys_id: u64, - file_path: String, - parsed: ParsedBaseImageFileName, -) { - info!("slurp_base_file local path {}", file_path); - - let mut data = fs::read(file_path).unwrap(); - - // pg_filenode.map has non-standard size - 512 bytes - // enlarge it to treat as a regular page - if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM { - data.resize(8192, 0); - } - - let data_bytes: &[u8] = &data; - let 
mut bytes = BytesMut::from(data_bytes).freeze(); - - // FIXME: use constants (BLCKSZ) - let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192); - - let pcache = page_cache::get_pagecache(conf, sys_id); - - let reltag = page_cache::RelTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, - }; - - while bytes.remaining() >= 8192 { - let tag = page_cache::BufferTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, - blknum: blknum, - }; - - pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192)); - - pcache.relsize_inc(&reltag, blknum + 1); - blknum += 1; - } -} diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs new file mode 100644 index 0000000000..c53c04ef92 --- /dev/null +++ b/pageserver/src/restore_local_repo.rs @@ -0,0 +1,460 @@ +// +// Restore chunks from local Zenith repository +// +// This runs once at Page Server startup. It loads all the "snapshots" and all +// WAL from all timelines from the local zenith repository into the in-memory page +// cache. +// +// This also initializes the "last valid LSN" in the page cache to the last LSN +// seen in the WAL, so that when the WAL receiver is started, it starts +// streaming from that LSN. +// + +use log::*; +use regex::Regex; +use std::fmt; + +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::Read; +use std::io::Seek; +use std::io::SeekFrom; +use std::path::{Path, PathBuf}; +use std::cmp::max; + +use anyhow::Result; +use bytes::Bytes; + +use crate::page_cache; +use crate::page_cache::PageCache; +use crate:: PageServerConf; +use crate::page_cache::BufferTag; +use crate::waldecoder::WalStreamDecoder; +use crate::ZTimelineId; + + +// From pg_tablespace_d.h +// +// FIXME: we'll probably need these elsewhere too, move to some common location +const DEFAULTTABLESPACE_OID: u32 = 1663; +const GLOBALTABLESPACE_OID: u32 = 1664; + +// +// Load it all into the page cache. +// +pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId) -> Result<()> { + + let timelinepath = PathBuf::from("timelines").join(&timeline.to_str()); + + if !timelinepath.exists() { + anyhow::bail!("timeline {} does not exist in the page server's repository"); + } + + // Scan .zenith/timelines//snapshots + let snapshotspath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots"; + + let mut last_snapshot_lsn: u64 = 0; + + for direntry in fs::read_dir(&snapshotspath).unwrap() { + let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned(); + + let lsn = u64::from_str_radix(&filename, 16)?; + last_snapshot_lsn = max(lsn, last_snapshot_lsn); + + restore_snapshot(conf, pcache, timeline, &filename)?; + info!("restored snapshot at {}", filename); + } + + if last_snapshot_lsn == 0 { + error!("could not find valid snapshot in {}", &snapshotspath); + // TODO return error? 
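The snapshot lookup above boils down to scanning timelines/<timelineid>/snapshots for entries whose names are hexadecimal LSNs and keeping the largest; a standalone sketch of that scan (directory layout assumed from this diff, the helper name is illustrative):

use std::fs;
use std::path::Path;

// Returns the highest LSN found among hex-named snapshot directories,
// or None if nothing parseable is present.
fn latest_snapshot_lsn(snapshots_dir: &Path) -> std::io::Result<Option<u64>> {
    let mut last: Option<u64> = None;
    for entry in fs::read_dir(snapshots_dir)? {
        let name = entry?.file_name();
        if let Some(name) = name.to_str() {
            if let Ok(lsn) = u64::from_str_radix(name, 16) {
                last = Some(last.map_or(lsn, |prev| prev.max(lsn)));
            }
        }
    }
    Ok(last)
}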
+ } + pcache.init_valid_lsn(last_snapshot_lsn); + + restore_wal(conf, pcache, timeline, last_snapshot_lsn)?; + + Ok(()) +} + +pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result { + + let snapshotspath = format!("timelines/{}/snapshots", timeline); + + let mut last_snapshot_lsn = 0; + for direntry in fs::read_dir(&snapshotspath).unwrap() { + let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned(); + + let lsn = u64::from_str_radix(&filename, 16)?; + last_snapshot_lsn = max(lsn, last_snapshot_lsn); + } + + if last_snapshot_lsn == 0 { + error!("could not find valid snapshot in {}", &snapshotspath); + // TODO return error? + } + Ok(last_snapshot_lsn) +} + +fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, snapshot: &str) -> Result<()> { + + let snapshotpath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots/" + snapshot; + + // Scan 'global' + let paths = fs::read_dir(snapshotpath.clone() + "/global").unwrap(); + + for direntry in paths { + let path = direntry.unwrap().path(); + let filename = path.file_name(); + if filename.is_none() { + continue; + } + let filename = filename.unwrap().to_str(); + + if filename == Some("pg_control") { + continue; + } + if filename == Some("pg_filenode.map") { + continue; + } + + restore_relfile(conf, pcache, timeline, snapshot, GLOBALTABLESPACE_OID, 0, &path)?; + } + + // Scan 'base' + let paths = fs::read_dir(snapshotpath.clone() + "/base").unwrap(); + for path in paths { + let path = path.unwrap(); + let filename = path.file_name().to_str().unwrap().to_owned(); + + // Scan database dirs + let dboid = u32::from_str_radix(&filename, 10)?; + + let paths = fs::read_dir(path.path()).unwrap(); + for direntry in paths { + let path = direntry.unwrap().path(); + let filename = path.file_name(); + if filename.is_none() { + continue; + } + let filename = filename.unwrap().to_str(); + if filename == Some("PG_VERSION") { + continue; + } + if filename == Some("pg_filenode.map") { + continue; + } + + restore_relfile(conf, pcache, timeline, snapshot, DEFAULTTABLESPACE_OID, dboid, &path)?; + } + } + + // TODO: Scan pg_tblspc + + Ok(()) +} + +fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimelineId, snapshot: &str, spcoid: u32, dboid: u32, path: &Path) -> Result<()> { + + let lsn = u64::from_str_radix(snapshot, 16)?; + + // Does it look like a relation file? + + let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap()); + if p.is_err() { + let e = p.unwrap_err(); + warn!("unrecognized file in snapshot: {:?} ({})", path, e); + return Err(e)?; + } + let (relnode, forknum, segno) = p.unwrap(); + + let mut file = File::open(path)?; + let mut buf: [u8; 8192] = [0u8; 8192]; + + // FIXME: use constants (BLCKSZ) + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192); + loop { + let r = file.read_exact(&mut buf); + match r { + Ok(_) => { + let tag = page_cache::BufferTag { + spcnode: spcoid, + dbnode: dboid, + relnode: relnode, + forknum: forknum as u8, + blknum: blknum, + }; + pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf)); + /* + if oldest_lsn == 0 || p.lsn < oldest_lsn { + oldest_lsn = p.lsn; + } + */ + } + + // TODO: UnexpectedEof is expected + Err(e) => match e.kind() { + std::io::ErrorKind::UnexpectedEof => { + // reached EOF. That's expected. + // FIXME: maybe check that we read the full length of the file? 
+ break; + }, + _ => { + error!("error reading file: {:?} ({})", path, e); + break; + } + } + }; + blknum += 1; + } + + let tag = page_cache::RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode: relnode, + forknum: forknum as u8, + }; + pcache.relsize_inc(&tag, Some(blknum)); + + Ok(()) +} + +// Scan WAL on a timeline, starting from gien LSN, and load all the records +// into the page cache. +fn restore_wal(_conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, startpoint: u64) -> Result<()> { + let walpath = format!("timelines/{}/wal", timeline); + + let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint)); + + let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024); + let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024); + let mut last_lsn = 0; + loop { + // FIXME: assume postgresql tli 1 for now + let filename = XLogFileName(1, segno, 16 * 1024 * 1024); + let mut path = walpath.clone() + "/" + &filename; + + // It could be as .partial + if !PathBuf::from(&path).exists() { + path = path + ".partial"; + } + + // Slurp the WAL file + let open_result = File::open(&path); + if let Err(e) = open_result { + if e.kind() == std::io::ErrorKind::NotFound { + break; + } + return Err(e)?; + } + let mut file = open_result.unwrap(); + + if offset > 0 { + file.seek(SeekFrom::Start(offset as u64))?; + } + + let mut buf = Vec::new(); + let nread = file.read_to_end(&mut buf)?; + if nread != 16 * 1024 * 1024 - offset as usize { + // Maybe allow this for .partial files? + error!("read only {} bytes from WAL file", nread); + } + waldecoder.feed_bytes(&buf); + + let mut nrecords = 0; + loop { + let rec = waldecoder.poll_decode(); + if rec.is_err() { + // Assume that an error means we've reached the end of + // a partial WAL record. So that's ok. + break; + } + if let Some((lsn, recdata)) = rec.unwrap() { + let decoded = + crate::waldecoder::decode_wal_record(recdata.clone()); + + // Put the WAL record to the page cache. We make a separate copy of + // it for every block it modifies. 
(The actual WAL record is kept in + // a Bytes, which uses a reference counter for the underlying buffer, + // so having multiple copies of it doesn't cost that much) + for blk in decoded.blocks.iter() { + let tag = BufferTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum as u8, + blknum: blk.blkno, + }; + + let rec = page_cache::WALRecord { + lsn: lsn, + will_init: blk.will_init || blk.apply_image, + rec: recdata.clone(), + }; + + pcache.put_wal_record(tag, rec); + } + + // Now that this record has been handled, let the page cache know that + // it is up-to-date to this LSN + pcache.advance_last_valid_lsn(lsn); + last_lsn = lsn; + } else { + break; + } + nrecords += 1; + } + + info!("restored {} records from WAL file {}", nrecords, filename); + + segno += 1; + offset = 0; + } + info!("reached end of WAL at {:X}/{:X}", last_lsn >> 32, last_lsn & 0xffffffff); + + Ok(()) +} + +// FIXME: copied from xlog_utils.rs +pub const XLOG_FNAME_LEN: usize = 24; +pub type XLogRecPtr = u64; +pub type XLogSegNo = u64; +pub type TimeLineID = u32; + +#[allow(non_snake_case)] +pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 { + return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); +} + +#[allow(non_snake_case)] +pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { + return xlogptr / wal_segsz_bytes as u64; +} + + +#[allow(non_snake_case)] +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + return format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ); +} + +#[allow(non_snake_case)] +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; +} + +#[allow(non_snake_case)] +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); +} + +#[allow(non_snake_case)] +pub fn IsXLogFileName(fname: &str) -> bool { + return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); +} + +#[allow(non_snake_case)] +pub fn IsPartialXLogFileName(fname: &str) -> bool { + if let Some(basefname) = fname.strip_suffix(".partial") { + IsXLogFileName(basefname) + } else { + false + } +} + + +#[derive(Debug, Clone)] +struct FilePathError { + msg: String, +} + +impl Error for FilePathError { + fn description(&self) -> &str { + &self.msg + } +} +impl FilePathError { + fn new(msg: &str) -> FilePathError { + FilePathError { + msg: msg.to_string(), + } + } +} + +impl From for FilePathError { + fn from(e: core::num::ParseIntError) -> Self { + return FilePathError { + msg: format!("invalid filename: {}", e), + }; + } +} + +impl fmt::Display for FilePathError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "invalid filename") + } +} + +fn forkname_to_forknum(forkname: Option<&str>) -> Result { + match forkname { + // "main" is not in filenames, it's implicit if the fork name is not present + None => Ok(0), + Some("fsm") => Ok(1), + Some("vm") => Ok(2), + Some("init") => Ok(3), + Some(_) => Err(FilePathError::new("invalid forkname")), + } +} + +#[derive(Debug)] 
+struct ParsedBaseImageFileName { + pub spcnode: u32, + pub dbnode: u32, + pub relnode: u32, + pub forknum: u32, + pub segno: u32, + + pub lsn: u64, +} + +// formats: +// +// _ +// . +// _. + +fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { + let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); + + let caps = re + .captures(fname) + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + + let relnode_str = caps.name("relnode").unwrap().as_str(); + let relnode = u32::from_str_radix(relnode_str, 10)?; + + let forkname_match = caps.name("forkname"); + let forkname = if forkname_match.is_none() { + None + } else { + Some(forkname_match.unwrap().as_str()) + }; + let forknum = forkname_to_forknum(forkname)?; + + let segno_match = caps.name("segno"); + let segno = if segno_match.is_none() { + 0 + } else { + u32::from_str_radix(segno_match.unwrap().as_str(), 10)? + }; + + return Ok((relnode, forknum, segno)); +} + diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index 957a103f4d..d8ec810f36 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -1,12 +1,8 @@ -//#![allow(non_upper_case_globals)] -//#![allow(non_camel_case_types)] -//#![allow(non_snake_case)] -//#![allow(dead_code)] -//include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - use bytes::{Buf, BufMut, Bytes, BytesMut}; use std::cmp::min; +use std::error::Error; +use std::fmt; use log::*; @@ -19,7 +15,7 @@ const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; #[repr(C)] #[derive(Debug)] -struct XLogPageHeaderData { +pub struct XLogPageHeaderData { xlp_magic: u16, /* magic value for correctness checks */ xlp_info: u16, /* flag bits, see below */ xlp_tli: u32, /* TimeLineID of first record on page */ @@ -33,7 +29,7 @@ const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4; #[repr(C)] #[derive(Debug)] -struct XLogLongPageHeaderData { +pub struct XLogLongPageHeaderData { std: XLogPageHeaderData, /* standard header fields */ xlp_sysid: u64, /* system identifier from pg_control */ xlp_seg_size: u32, /* just as a cross-check */ @@ -57,6 +53,31 @@ pub struct WalStreamDecoder { recordbuf: BytesMut, } + +#[derive(Debug, Clone)] +pub struct WalDecodeError { + msg: String, +} + +impl Error for WalDecodeError { + fn description(&self) -> &str { + &self.msg + } +} +impl WalDecodeError { + fn new(msg: &str) -> WalDecodeError { + WalDecodeError { + msg: msg.to_string(), + } + } +} + +impl fmt::Display for WalDecodeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "WAL decoding error: {}", self.msg) + } +} + // // WalRecordStream is a Stream that returns a stream of WAL records // FIXME: This isn't a proper rust stream @@ -81,38 +102,46 @@ impl WalStreamDecoder { // Returns a tuple: // (end LSN, record) - pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> { + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { loop { // parse and verify page boundaries as we go if self.lsn % WAL_SEGMENT_SIZE == 0 { // parse long header if self.inputbuf.remaining() < SizeOfXLogLongPHD { - return None; + return Ok(None); } - self.decode_XLogLongPageHeaderData(); + let hdr = self.decode_XLogLongPageHeaderData(); + if hdr.std.xlp_pageaddr != self.lsn { + return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}", + self.lsn >> 32, + self.lsn & 0xffffffff))); + } + // TODO: verify the remaining fields in the header + self.lsn += SizeOfXLogLongPHD as u64; - - // TODO: verify the fields in the header - continue; } 
else if self.lsn % (XLOG_BLCKSZ as u64) == 0 { // parse page header if self.inputbuf.remaining() < SizeOfXLogShortPHD { - return None; + return Ok(None); } - self.decode_XLogPageHeaderData(); + let hdr = self.decode_XLogPageHeaderData(); + if hdr.xlp_pageaddr != self.lsn { + return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}", + self.lsn >> 32, + self.lsn & 0xffffffff))); + } + // TODO: verify the remaining fields in the header + self.lsn += SizeOfXLogShortPHD as u64; - - // TODO: verify the fields in the header - continue; } else if self.padlen > 0 { if self.inputbuf.remaining() < self.padlen as usize { - return None; + return Ok(None); } // skip padding @@ -123,20 +152,17 @@ impl WalStreamDecoder { // need to have at least the xl_tot_len field if self.inputbuf.remaining() < 4 { - return None; + return Ok(None); } // read xl_tot_len FIXME: assumes little-endian self.startlsn = self.lsn; let xl_tot_len = self.inputbuf.get_u32_le(); if xl_tot_len < SizeOfXLogRecord { - error!( - "invalid xl_tot_len {} at {:X}/{:X}", - xl_tot_len, - self.lsn >> 32, - self.lsn & 0xffffffff - ); - panic!(); + return Err(WalDecodeError::new(&format!("invalid xl_tot_len {} at {:X}/{:X}", + xl_tot_len, + self.lsn >> 32, + self.lsn & 0xffffffff))); } self.lsn += 4; @@ -154,7 +180,7 @@ impl WalStreamDecoder { let n = min(self.contlen, pageleft) as usize; if self.inputbuf.remaining() < n { - return None; + return Ok(None); } self.recordbuf.put(self.inputbuf.split_to(n)); @@ -182,7 +208,7 @@ impl WalStreamDecoder { } let result = (self.lsn, recordbuf); - return Some(result); + return Ok(Some(result)); } continue; } @@ -289,7 +315,6 @@ pub struct DecodedBkpBlock { const SizeOfXLogRecord: u32 = 24; pub struct DecodedWALRecord { - pub lsn: u64, // LSN at the *end* of the record pub record: Bytes, // raw XLogRecord pub blocks: Vec, @@ -321,14 +346,7 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool { // // Routines to decode a WAL record and figure out which blocks are modified // -pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { - trace!( - "decoding record with LSN {:08X}/{:08X} ({} bytes)", - lsn >> 32, - lsn & 0xffff_ffff, - rec.remaining() - ); - +pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord { let mut buf = rec.clone(); // FIXME: assume little-endian here @@ -584,7 +602,6 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { // Since we don't care about the data payloads here, we're done. return DecodedWALRecord { - lsn, record: rec, blocks, }; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 5ca5ffa199..3f8fcd8722 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -10,22 +10,72 @@ use crate::page_cache; use crate::page_cache::BufferTag; use crate::waldecoder::{decode_wal_record, WalStreamDecoder}; use crate::PageServerConf; +use crate::ZTimelineId; use anyhow::Error; +use lazy_static::lazy_static; use log::*; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; +use std::collections::HashMap; +use std::fs; +use std::fs::{File, OpenOptions}; +use std::io::{Write, Seek, SeekFrom}; +use std::path::PathBuf; use std::str::FromStr; +use std::sync::Mutex; +use std::thread; use tokio::runtime; use tokio::time::{sleep, Duration}; use tokio_postgres::replication::{PgTimestamp, ReplicationStream}; use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow}; use tokio_stream::StreamExt; +// +// We keep one WAL Receiver active per timeline. 
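With poll_decode reworked above to return Result<Option<(u64, Bytes)>, WalDecodeError> rather than a bare Option, callers have three cases to handle; a sketch of the consuming loop, written as a fragment inside the pageserver crate and assuming the decoder API exactly as shown in this diff:

use crate::waldecoder::WalStreamDecoder;

// Drain every complete record currently buffered in the decoder.
// Ok(None) means "feed more input bytes"; Err means a corrupt stream or a
// trailing partial record (restore_wal above treats that as end of WAL).
fn drain_records(decoder: &mut WalStreamDecoder) {
    loop {
        match decoder.poll_decode() {
            Ok(Some((lsn, _recdata))) => {
                println!("decoded record ending at {:X}/{:X}", lsn >> 32, lsn & 0xffffffff);
            }
            Ok(None) => break,
            Err(e) => {
                eprintln!("WAL decode error: {}", e);
                break;
            }
        }
    }
}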
+// +struct WalReceiverEntry { + wal_producer_connstr: String, +} + +lazy_static! { + static ref WAL_RECEIVERS: Mutex> = Mutex::new(HashMap::new()); +} + +// Launch a new WAL receiver, or tell one that's running about change in connection string +pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) { + let mut receivers = WAL_RECEIVERS.lock().unwrap(); + + match receivers.get_mut(&timelineid) { + Some(receiver) => { + receiver.wal_producer_connstr = wal_producer_connstr.into(); + } + None => { + let receiver = WalReceiverEntry { wal_producer_connstr: wal_producer_connstr.into() }; + receivers.insert(timelineid, receiver); + + // Also launch a new thread to handle this connection + let conf_copy = conf.clone(); + let _walreceiver_thread = thread::Builder::new() + .name("WAL receiver thread".into()) + .spawn(move || { + thread_main(&conf_copy, timelineid); + }).unwrap(); + } + }; +} + +// Look up current WAL producer connection string in the hash table +fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String { + let receivers = WAL_RECEIVERS.lock().unwrap(); + + receivers.get(&timelineid).unwrap().wal_producer_connstr.clone() +} + // // This is the entry point for the WAL receiver thread. // -pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { - info!("WAL receiver thread started: '{}'", wal_producer_connstr); +fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { + info!("WAL receiver thread started for timeline : '{}'", timelineid); let runtime = runtime::Builder::new_current_thread() .enable_all() @@ -34,7 +84,10 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { runtime.block_on(async { loop { - let res = walreceiver_main(conf, wal_producer_connstr).await; + // Look up the current WAL producer address + let wal_producer_connstr = get_wal_producer_connstr(timelineid); + + let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await; if let Err(e) = res { info!( @@ -47,7 +100,7 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { }); } -async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> Result<(), Error> { +async fn walreceiver_main(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) -> Result<(), Error> { // Connect to the database in replication mode. info!("connecting to {:?}", wal_producer_connstr); let connect_cfg = format!("{} replication=true", wal_producer_connstr); @@ -67,7 +120,7 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> let end_of_wal = u64::from(identify.xlogpos); let mut caught_up = false; - let pcache = page_cache::get_pagecache(conf, identify.systemid); + let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap(); // // Start streaming the WAL, from where we left off previously. 
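Both restore_wal above and write_wal_file below lean on the XLByteToSeg / XLogFileName arithmetic; a worked example with the standard 16 MB segment size and PostgreSQL timeline 1 (the LSN value is arbitrary, chosen only for illustration):

const WAL_SEG_SIZE: u64 = 16 * 1024 * 1024;

fn xlog_file_name(tli: u32, segno: u64, seg_size: u64) -> String {
    // 0x100000000 / 16 MB = 256 segments per "xlog id" (the middle field).
    let segs_per_id = 0x1_0000_0000u64 / seg_size;
    format!(
        "{:>08X}{:>08X}{:>08X}",
        tli,
        segno / segs_per_id,
        segno % segs_per_id
    )
}

fn main() {
    let lsn: u64 = 0x1_6B37_4D48; // i.e. 1/6B374D48 in the {:X}/{:X} notation
    let segno = lsn / WAL_SEG_SIZE; // segment number 363
    assert_eq!(xlog_file_name(1, segno, WAL_SEG_SIZE), "00000001000000010000006B");
}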
@@ -95,9 +148,10 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> } } debug!( - "starting replication from {:X}/{:X}, server is at {:X}/{:X}...", + "starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...", (startpoint >> 32), (startpoint & 0xffffffff), + timelineid, (end_of_wal >> 32), (end_of_wal & 0xffffffff) ); @@ -120,6 +174,11 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> let startlsn = xlog_data.wal_start(); let endlsn = startlsn + data.len() as u64; + write_wal_file(startlsn, + timelineid, + 16 * 1024 * 1024, // FIXME + data)?; + trace!( "received XLogData between {:X}/{:X} and {:X}/{:X}", (startlsn >> 32), @@ -131,8 +190,8 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> waldecoder.feed_bytes(data); loop { - if let Some((lsn, recdata)) = waldecoder.poll_decode() { - let decoded = decode_wal_record(startlsn, recdata.clone()); + if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let decoded = decode_wal_record(recdata.clone()); // Put the WAL record to the page cache. We make a separate copy of // it for every block it modifies. (The actual WAL record is kept in @@ -260,3 +319,153 @@ pub async fn identify_system(client: &tokio_postgres::Client) -> Result u32 { + return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); +} + +#[allow(non_snake_case)] +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; +} + +#[allow(non_snake_case)] +pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { + return xlogptr / wal_segsz_bytes as u64; +} + +#[allow(non_snake_case)] +pub fn XLogSegNoOffsetToRecPtr( + segno: XLogSegNo, + offset: u32, + wal_segsz_bytes: usize, +) -> XLogRecPtr { + return segno * (wal_segsz_bytes as u64) + (offset as u64); +} + +#[allow(non_snake_case)] +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + return format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ); +} + +#[allow(non_snake_case)] +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); +} + + +fn write_wal_file( + startpos: XLogRecPtr, + timeline: ZTimelineId, + wal_seg_size: usize, + buf: &[u8], +) -> anyhow::Result<()> { + let mut bytes_left: usize = buf.len(); + let mut bytes_written: usize = 0; + let mut partial; + let mut start_pos = startpos; + const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ]; + + let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline)); + + /* Extract WAL location for this block */ + let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize; + + while bytes_left != 0 { + let bytes_to_write; + + /* + * If crossing a WAL boundary, only write up until we reach wal + * segment size. 
+ */ + if xlogoff + bytes_left > wal_seg_size { + bytes_to_write = wal_seg_size - xlogoff; + } else { + bytes_to_write = bytes_left; + } + + /* Open file */ + let segno = XLByteToSeg(start_pos, wal_seg_size); + let wal_file_name = XLogFileName(1, // FIXME: always use Postgres timeline 1 + segno, wal_seg_size); + let wal_file_path = wal_dir + .join(wal_file_name.clone()); + let wal_file_partial_path = wal_dir + .join(wal_file_name.clone() + ".partial"); + + { + let mut wal_file: File; + /* Try to open already completed segment */ + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + wal_file = file; + partial = false; + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) + { + /* Try to open existed partial file */ + wal_file = file; + partial = true; + } else { + /* Create and fill new partial file */ + partial = true; + match OpenOptions::new() + .create(true) + .write(true) + .open(&wal_file_partial_path) + { + Ok(mut file) => { + for _ in 0..(wal_seg_size / XLOG_BLCKSZ) { + file.write_all(&ZERO_BLOCK)?; + } + wal_file = file; + } + Err(e) => { + error!("Failed to open log file {:?}: {}", &wal_file_path, e); + return Err(e.into()); + } + } + } + wal_file.seek(SeekFrom::Start(xlogoff as u64))?; + wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; + + // FIXME: Flush the file + //wal_file.sync_all()?; + } + /* Write was successful, advance our position */ + bytes_written += bytes_to_write; + bytes_left -= bytes_to_write; + start_pos += bytes_to_write as u64; + xlogoff += bytes_to_write; + + /* Did we reach the end of a WAL segment? */ + if XLogSegmentOffset(start_pos, wal_seg_size) == 0 { + xlogoff = 0; + if partial { + fs::rename(&wal_file_partial_path, &wal_file_path)?; + } + } + } + Ok(()) +} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e3a0510080..9b0010a1be 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,7 +22,7 @@ use std::io::Error; use std::sync::Arc; use std::time::Duration; use std::time::Instant; -use std::{path::PathBuf, process::Stdio}; +use std::process::Stdio; use tokio::io::AsyncBufReadExt; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::process::{Child, ChildStdin, ChildStdout, Command}; @@ -35,14 +35,15 @@ use crate::page_cache; use crate::page_cache::CacheEntry; use crate::page_cache::WALRecord; use crate::{page_cache::BufferTag, PageServerConf}; +use crate::ZTimelineId; static TIMEOUT: Duration = Duration::from_secs(20); // // Main entry point for the WAL applicator thread. // -pub fn wal_redo_main(conf: &PageServerConf, sys_id: u64) { - info!("WAL redo thread started {}", sys_id); +pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) { + info!("WAL redo thread started {}", timelineid); // We block on waiting for requests on the walredo request channel, but // use async I/O to communicate with the child process. Initialize the @@ -52,15 +53,15 @@ pub fn wal_redo_main(conf: &PageServerConf, sys_id: u64) { .build() .unwrap(); - let pcache = page_cache::get_pagecache(conf, sys_id); + let pcache = page_cache::get_pagecache(conf, timelineid).unwrap(); // Loop forever, handling requests as they come. 
let walredo_channel_receiver = &pcache.walredo_receiver; loop { let mut process: WalRedoProcess; - let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id)); + let datadir = format!("wal-redo/{}", timelineid); - info!("launching WAL redo postgres process {}", sys_id); + info!("launching WAL redo postgres process {}", timelineid); { let _guard = runtime.enter(); process = WalRedoProcess::launch(&datadir, &runtime).unwrap(); @@ -147,13 +148,13 @@ impl WalRedoProcess { // Tests who run pageserver binary are setting proper PG_BIN_DIR // and PG_LIB_DIR so that WalRedo would start right postgres. We may later // switch to setting same things in pageserver config file. - fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result { + fn launch(datadir: &str, runtime: &Runtime) -> Result { // Create empty data directory for wal-redo postgres deleting old one. - fs::remove_dir_all(datadir.to_str().unwrap()).ok(); + fs::remove_dir_all(datadir).ok(); let initdb = runtime .block_on( Command::new("initdb") - .args(&["-D", datadir.to_str().unwrap()]) + .args(&["-D", datadir]) .arg("-N") .output(), ) @@ -173,14 +174,11 @@ impl WalRedoProcess { .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) - .env("PGDATA", datadir.to_str().unwrap()) + .env("PGDATA", datadir) .spawn() .expect("postgres --wal-redo command failed to start"); - info!( - "launched WAL redo postgres process on {}", - datadir.to_str().unwrap() - ); + info!("launched WAL redo postgres process on {}", datadir); let stdin = child.stdin.take().expect("failed to open child's stdin"); let stderr = child.stderr.take().expect("failed to open child's stderr"); diff --git a/vendor/postgres b/vendor/postgres index d143241a16..5eaf718d3f 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit d143241a1653d3825d94d645801c62c7755b1015 +Subproject commit 5eaf718d3f2fae700fb4902326a4c1d2cee87b51 diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index 98c63c434f..27498ee293 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -34,3 +34,6 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } anyhow = "1.0" crc32c = "0.6.0" + +# FIXME: 'pageserver' is needed for ZTimelineId. 
Refactor +pageserver = { path = "../pageserver" } diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index d50467ba49..00576f055e 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -13,6 +13,8 @@ use clap::{App, Arg}; use slog::Drain; +use pageserver::ZTimelineId; + use walkeeper::wal_service; use walkeeper::WalAcceptorConf; @@ -26,6 +28,12 @@ fn main() -> Result<(), io::Error> { .takes_value(true) .help("Path to the WAL acceptor data directory"), ) + .arg( + Arg::with_name("timelineid") + .long("timelineid") + .takes_value(true) + .help("zenith timeline id"), + ) .arg( Arg::with_name("listen") .short("l") @@ -58,6 +66,7 @@ fn main() -> Result<(), io::Error> { let mut conf = WalAcceptorConf { data_dir: PathBuf::from("./"), + timelineid: ZTimelineId::from([0u8; 16]), daemonize: false, no_sync: false, pageserver_addr: None, @@ -68,6 +77,10 @@ fn main() -> Result<(), io::Error> { conf.data_dir = PathBuf::from(dir); } + if let Some(timelineid_str) = arg_matches.value_of("timelineid") { + conf.timelineid = ZTimelineId::from_str(timelineid_str).unwrap(); + } + if arg_matches.is_present("no-sync") { conf.no_sync = true; } @@ -98,7 +111,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { info!("daemonizing..."); // There should'n be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fpritf's or backtraces. + // that we will see any accidental manual fprintf's or backtraces. let stdout = OpenOptions::new() .create(true) .append(true) diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 7e890cf98a..5f2f557b49 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -6,9 +6,12 @@ mod pq_protocol; pub mod wal_service; pub mod xlog_utils; +use pageserver::ZTimelineId; + #[derive(Debug, Clone)] pub struct WalAcceptorConf { pub data_dir: PathBuf, + pub timelineid: ZTimelineId, pub daemonize: bool, pub no_sync: bool, pub listen_addr: SocketAddr, diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 6e17f41f06..1a8f764598 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -563,7 +563,8 @@ impl Connection { "no_user", ); let callme = format!( - "callmemaybe host={} port={} replication=1 options='-c system.id={}'", + "callmemaybe {} host={} port={} replication=1 options='-c system.id={}'", + self.conf.timelineid, self.conf.listen_addr.ip(), self.conf.listen_addr.port(), self.system().get_info().server.system_id, diff --git a/walkeeper/src/xlog_utils.rs b/walkeeper/src/xlog_utils.rs index 51db9681a6..7c18131186 100644 --- a/walkeeper/src/xlog_utils.rs +++ b/walkeeper/src/xlog_utils.rs @@ -4,7 +4,7 @@ use log::*; use std::cmp::min; use std::fs::{self, File}; use std::io::prelude::*; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::time::SystemTime; pub const XLOG_FNAME_LEN: usize = 24; @@ -89,7 +89,7 @@ pub fn get_current_timestamp() -> TimestampTz { } fn find_end_of_wal_segment( - data_dir: &PathBuf, + data_dir: &Path, segno: XLogSegNo, tli: TimeLineID, wal_seg_size: usize, @@ -185,7 +185,7 @@ fn find_end_of_wal_segment( } pub fn find_end_of_wal( - data_dir: &PathBuf, + data_dir: &Path, wal_seg_size: usize, precise: bool, ) -> (XLogRecPtr, TimeLineID) { diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 2d1f7c922c..035fcc9d94 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -8,4 +8,10 @@ edition = "2018" [dependencies] clap = "2.33.0" +anyhow = "1.0" + +# 
FIXME: 'pageserver' is needed for ZTimelineId. Refactor +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } control_plane = { path = "../control_plane" } +postgres_ffi = { path = "../postgres_ffi" } diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 5e34c655b9..de29f386a0 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,29 +1,66 @@ -use clap::{App, Arg, ArgMatches, SubCommand}; -use std::error; +use std::fs; +use std::path::{Path, PathBuf}; use std::process::exit; +use clap::{App, Arg, ArgMatches, SubCommand}; +use anyhow::Result; +use anyhow::*; + use control_plane::{compute::ComputeControlPlane, local_env, storage}; +use control_plane::local_env::LocalEnv; +use control_plane::storage::PageServerNode; -type Result = std::result::Result>; +use pageserver::ZTimelineId; -fn main() { +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} + +// Main entry point for the 'zenith' CLI utility +// +// This utility can used to work with a local zenith repository. +// In order to run queries in it, you need to launch the page server, +// and a compute node against the page server +fn main() -> Result<()> { let name_arg = Arg::with_name("NAME") .short("n") .index(1) .help("name of this postgres instance") .required(true); let matches = App::new("zenith") - .subcommand(SubCommand::with_name("init")) - .subcommand(SubCommand::with_name("start")) - .subcommand(SubCommand::with_name("stop")) - .subcommand(SubCommand::with_name("status")) + .about("Zenith CLI") + .subcommand(SubCommand::with_name("init") + .about("Initialize a new Zenith repository in current directory")) + .subcommand(SubCommand::with_name("branch") + .about("Create a new branch") + .arg(Arg::with_name("branchname") + .required(false) + .index(1)) + .arg(Arg::with_name("start-point") + .required(false) + .index(2))) + .subcommand( + SubCommand::with_name("pageserver") + .about("Manage pageserver instance") + .subcommand(SubCommand::with_name("status")) + .subcommand(SubCommand::with_name("start")) + .subcommand(SubCommand::with_name("stop")) + ) .subcommand( SubCommand::with_name("pg") .about("Manage postgres instances") .subcommand( - SubCommand::with_name("create"), // .arg(name_arg.clone() - // .required(false) - // .help("name of this postgres instance (will be pgN if omitted)")) + SubCommand::with_name("create") + // .arg(name_arg.clone() + // .required(false) + // .help("name of this postgres instance (will be pgN if omitted)")) + .arg(Arg::with_name("timeline") + .required(false) + .index(1)) ) .subcommand(SubCommand::with_name("list")) .subcommand(SubCommand::with_name("start").arg(name_arg.clone())) @@ -33,24 +70,24 @@ fn main() { .get_matches(); // handle init separately and exit - if let Some("init") = matches.subcommand_name() { - match local_env::init() { - Ok(_) => { - println!("Initialization complete! 
You may start zenith with 'zenith start' now."); - exit(0); - } - Err(e) => { - eprintln!("Error during init: {}", e); - exit(1); - } - } + if let ("init", Some(sub_args)) = matches.subcommand() { + run_init_cmd(sub_args.clone())?; + exit(0); } // all other commands would need config - let env = match local_env::load_config() { + + let repopath = PathBuf::from(zenith_repo_dir()); + if !repopath.exists() { + bail!("Zenith repository does not exists in {}.\n\ + Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'", + repopath.display()); + } + // TODO: check that it looks like a zenith repository + let env = match local_env::load_config(&repopath) { Ok(conf) => conf, Err(e) => { - eprintln!("Error loading config from ~/.zenith: {}", e); + eprintln!("Error loading config from {}: {}", repopath.display(), e); exit(1); } }; @@ -60,6 +97,9 @@ fn main() { panic!() /* Should not happen. Init was handled before */ } + ("branch", Some(sub_args)) => run_branch_cmd(&env, sub_args.clone())?, + ("pageserver", Some(sub_args)) => run_pageserver_cmd(&env, sub_args.clone())?, + ("start", Some(_sub_m)) => { let pageserver = storage::PageServerNode::from_env(&env); @@ -86,15 +126,53 @@ fn main() { } } _ => {} - } + }; + + Ok(()) +} + +fn run_pageserver_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { + match args.subcommand() { + ("status", Some(_sub_m)) => { + todo!(); + } + ("start", Some(_sub_m)) => { + let psnode = PageServerNode::from_env(local_env); + psnode.start()?; + println!("Page server started"); + } + ("stop", Some(_sub_m)) => { + todo!(); + } + _ => unreachable!(), + }; + + Ok(()) +} + +// Peek into the repository, to grab the timeline ID of given branch +pub fn get_branch_timeline(repopath: &Path, branchname: &str) -> ZTimelineId { + let branchpath = repopath.join("refs/branches/".to_owned() + branchname); + + ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() } fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; match pg_match.subcommand() { - ("create", Some(_sub_m)) => { - cplane.new_node()?; + ("create", Some(sub_m)) => { + // FIXME: cheat and resolve the timeline by peeking into the + // repository. In reality, when you're launching a compute node + // against a possibly-remote page server, we wouldn't know what + // branches exist in the remote repository. Or would we require + // that you "zenith fetch" them into a local repoitory first? 
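            // Illustration only (the path assumes the default ".zenith" repository
            // returned by zenith_repo_dir() above, and the timeline id is just an
            // example value): a branch ref is simply a file whose content is the
            // timeline id in hex, e.g.
            //
            //     .zenith/refs/branches/main  ->  "bc62e7d612d0e6fe8f99a6dd2f281f9d"
            //
            // get_branch_timeline() reads that file and parses it with
            // ZTimelineId::from_str(), as seen below.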
+ let timeline_arg = sub_m.value_of("timeline").unwrap_or("main"); + let timeline = get_branch_timeline(&env.repo_path, timeline_arg); + + println!("Initializing Postgres on timeline {}...", timeline); + + cplane.new_node(timeline)?; } ("list", Some(_sub_m)) => { println!("NODE\tADDRESS\tSTATUS"); @@ -107,7 +185,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes .get(name) - .ok_or(format!("postgres {} is not found", name))?; + .ok_or(anyhow!("postgres {} is not found", name))?; node.start()?; } ("stop", Some(sub_m)) => { @@ -115,7 +193,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes .get(name) - .ok_or(format!("postgres {} is not found", name))?; + .ok_or(anyhow!("postgres {} is not found", name))?; node.stop()?; } @@ -124,3 +202,128 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Ok(()) } + + +// "zenith init" - Initialize a new Zenith repository in current dir +fn run_init_cmd(_args: ArgMatches) -> Result<()> { + local_env::init()?; + Ok(()) +} + +// handle "zenith branch" subcommand +fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { + let repopath = local_env.repo_path.to_str().unwrap(); + + if let Some(branchname) = args.value_of("branchname") { + if PathBuf::from(format!("{}/refs/branches/{}", repopath, branchname)).exists() { + anyhow::bail!("branch {} already exists", branchname); + } + + if let Some(startpoint_str) = args.value_of("start-point") { + + let mut startpoint = parse_point_in_time(startpoint_str)?; + + if startpoint.lsn == 0 { + // Find end of WAL on the old timeline + let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?; + + println!("branching at end of WAL: {:X}/{:X}", end_of_wal >> 32, end_of_wal & 0xffffffff); + + startpoint.lsn = end_of_wal; + } + + return local_env::create_branch(local_env, branchname, startpoint); + + } else { + panic!("Missing start-point"); + } + } else { + // No arguments, list branches + list_branches(); + } + Ok(()) +} + +fn list_branches() { + // list branches + let paths = fs::read_dir(zenith_repo_dir() + "/refs/branches").unwrap(); + + for path in paths { + let filename = path.unwrap().file_name().to_str().unwrap().to_owned(); + println!(" {}", filename); + } +} + +// +// Parse user-given string that represents a point-in-time. 
+// +// We support multiple variants: +// +// Raw timeline id in hex, meaning the end of that timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d +// +// A specific LSN on a timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 +// +// Same, with a human-friendly branch name: +// main +// main@2/15D3DD8 +// +// Human-friendly tag name: +// mytag +// +// +fn parse_point_in_time(s: &str) -> Result { + + let mut strings = s.split("@"); + let name = strings.next().unwrap(); + + let lsn: Option; + if let Some(lsnstr) = strings.next() { + let mut s = lsnstr.split("/"); + let lsn_hi: u64 = s.next().unwrap().parse()?; + let lsn_lo: u64 = s.next().unwrap().parse()?; + lsn = Some(lsn_hi << 32 | lsn_lo); + } + else { + lsn = None + } + + // Check if it's a tag + if lsn.is_none() { + let tagpath:PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/tags/" + name); + if tagpath.exists() { + let pointstr = fs::read_to_string(tagpath)?; + + return parse_point_in_time(&pointstr); + } + } + // Check if it's a branch + // Check if it's branch @ LSN + let branchpath:PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/branches/" + name); + if branchpath.exists() { + let pointstr = fs::read_to_string(branchpath)?; + + let mut result = parse_point_in_time(&pointstr)?; + if lsn.is_some() { + result.lsn = lsn.unwrap(); + } else { + result.lsn = 0; + } + return Ok(result); + } + + // Check if it's a timelineid + // Check if it's timelineid @ LSN + let tlipath:PathBuf = PathBuf::from(zenith_repo_dir() + "/timelines/" + name); + if tlipath.exists() { + let result = local_env::PointInTime { + timelineid: ZTimelineId::from_str(name)?, + lsn: lsn.unwrap_or(0) + }; + + return Ok(result); + } + + panic!("could not parse point-in-time {}", s); +} From f69db1740961608f0abccba6e48b10b31986d8af Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Apr 2021 15:25:50 +0300 Subject: [PATCH 08/15] Make WAL safekeeper work with zenith timelines --- control_plane/src/compute.rs | 33 +--- control_plane/src/local_env.rs | 63 +++++--- control_plane/src/storage.rs | 7 + integration_tests/tests/test_wal_acceptor.rs | 50 +++++-- pageserver/src/bin/pageserver.rs | 7 +- pageserver/src/lib.rs | 10 ++ pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 19 ++- pageserver/src/waldecoder.rs | 6 +- pageserver/src/walreceiver.rs | 2 +- walkeeper/src/bin/wal_acceptor.rs | 32 ++-- walkeeper/src/lib.rs | 4 +- walkeeper/src/pq_protocol.rs | 18 ++- walkeeper/src/wal_service.rs | 150 ++++++++++--------- 14 files changed, 228 insertions(+), 175 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 8157c62a8b..c2b29e7397 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,4 +1,3 @@ -use std::fs::File; use std::fs::{self, OpenOptions}; use std::os::unix::fs::PermissionsExt; use std::net::TcpStream; @@ -402,40 +401,18 @@ impl PostgresNode { Client::connect(connstring.as_str(), NoTls).unwrap() } - /* Create stub controlfile and respective xlog to start computenode */ - pub fn setup_controlfile(&self) { - let filepath = format!("{}/global/pg_control", self.pgdata().to_str().unwrap()); - - { - File::create(filepath).unwrap(); - } - - let pg_resetwal_path = self.env.pg_bin_dir().join("pg_resetwal"); - - let pg_resetwal = Command::new(pg_resetwal_path) - .args(&["-D", self.pgdata().to_str().unwrap()]) - .arg("-f") - // TODO probably we will have to modify pg_resetwal - // .arg("--compute-node") - .status() - .expect("failed to execute pg_resetwal"); - - if 
!pg_resetwal.success() { - panic!("pg_resetwal failed"); - } - } - - pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode { + pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode { let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy"); match Command::new(proxy_path.as_path()) - .args(&["-s", &wal_acceptors]) + .args(&["--ztimelineid", &self.timelineid.to_str()]) + .args(&["-s", wal_acceptors]) .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") .stderr(OpenOptions::new() + .create(true) .append(true) - .open(self.env.repo_path.join("safepkeeper_proxy.log")) - .unwrap()) + .open(self.pgdata().join("safekeeper_proxy.log")).unwrap()) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ebbcba7f26..5ac5cb8fd2 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -7,9 +7,10 @@ use std::env; use std::fs; use std::path::{Path, PathBuf}; -use std::process::Command; +use std::process::{Command, Stdio}; use bytes::Bytes; use rand::Rng; +use anyhow::Context; use hex; use serde_derive::{Deserialize, Serialize}; @@ -29,6 +30,9 @@ pub struct LocalEnv { // Path to the Repository. Here page server and compute nodes will create and store their data. pub repo_path: PathBuf, + // System identifier, from the PostgreSQL control file + pub systemid: u64, + // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that @@ -96,40 +100,32 @@ pub fn init() -> Result<()> { } // ok, we are good to go - let conf = LocalEnv { + let mut conf = LocalEnv { repo_path: repo_path.clone(), pg_distrib_dir, zenith_distrib_dir, + systemid: 0, }; - init_repo(&conf)?; - - // write config - let toml = toml::to_string(&conf)?; - fs::write(repo_path.join("config"), toml)?; + init_repo(&mut conf)?; Ok(()) } -pub fn init_repo(local_env: &LocalEnv) -> Result<()> +pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> { let repopath = String::from(local_env.repo_path.to_str().unwrap()); - fs::create_dir(&repopath)?; + fs::create_dir(&repopath).with_context(|| format!("could not create directory {}", repopath))?; fs::create_dir(repopath.clone() + "/pgdatadirs")?; fs::create_dir(repopath.clone() + "/timelines")?; fs::create_dir(repopath.clone() + "/refs")?; fs::create_dir(repopath.clone() + "/refs/branches")?; fs::create_dir(repopath.clone() + "/refs/tags")?; - - // Create empty config file - let configpath = repopath.clone() + "/config"; - fs::write(&configpath, r##" -# Example config file. Nothing here yet. 
-"##) - .expect(&format!("Unable to write file {}", &configpath)); + println!("created directory structure in {}", repopath); // Create initial timeline let tli = create_timeline(&local_env, None)?; let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + println!("created initial timeline {}", timelinedir); // Run initdb // @@ -139,32 +135,50 @@ pub fn init_repo(local_env: &LocalEnv) -> Result<()> let initdb_path = local_env.pg_bin_dir().join("initdb"); let _initdb = Command::new(initdb_path) - .args(&["-D", "tmp", "--no-instructions"]) + .args(&["-D", "tmp"]) + .arg("--no-instructions") + .env_clear() + .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) + .stdout(Stdio::null()) .status() - .expect("failed to execute initdb"); + .with_context(|| "failed to execute initdb")?; + println!("initdb succeeded"); - // Read control file to extract the LSN + // Read control file to extract the LSN and system id let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; - + let systemid = controlfile.system_identifier; let lsn = controlfile.checkPoint; let lsnstr = format!("{:016X}", lsn); // Move the initial WAL file fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.clone() + "/wal/000000010000000000000001.partial")?; + println!("moved initial WAL file"); // Remove pg_wal fs::remove_dir_all("tmp/pg_wal")?; + println!("removed tmp/pg_wal"); force_crash_recovery(&PathBuf::from("tmp"))?; + println!("updated pg_control"); let target = timelinedir.clone() + "/snapshots/" + &lsnstr; - fs::rename("tmp", target)?; + fs::rename("tmp", &target)?; + println!("moved 'tmp' to {}", &target); // Create 'main' branch to refer to the initial timeline let data = hex::encode(tli); fs::write(repopath.clone() + "/refs/branches/main", data)?; + println!("created main branch"); + + // Also update the system id in the LocalEnv + local_env.systemid = systemid; + + // write config + let toml = toml::to_string(&local_env)?; + fs::write(repopath.clone() + "/config", toml)?; println!("new zenith repository was created in {}", &repopath); + Ok(()) } @@ -209,17 +223,20 @@ pub fn load_config(repopath: &Path) -> Result { // local env for tests pub fn test_env(testname: &str) -> LocalEnv { + fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check"); + let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check/").join(testname); // Remove remnants of old test repo let _ = fs::remove_dir_all(&repo_path); - let local_env = LocalEnv { + let mut local_env = LocalEnv { repo_path, pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"), zenith_distrib_dir: cargo_bin_dir(), + systemid: 0, }; - init_repo(&local_env).unwrap(); + init_repo(&mut local_env).expect("could not initialize zenith repository"); return local_env; } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index dd935cb4fb..f2dbf8dc1a 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -176,6 +176,7 @@ impl PageServerNode { cmd .args(&["-l", self.address().to_string().as_str()]) .arg("-d") .env_clear() + .env("RUST_BACKTRACE", "1") .env("ZENITH_REPO_DIR", self.repo_path()) .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); @@ -294,6 +295,12 @@ impl WalAcceptorNode { let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor")) .args(&["-D", 
self.data_dir.to_str().unwrap()]) .args(&["-l", self.listen.to_string().as_str()]) + .args(&["--systemid", &self.env.systemid.to_string()]) + // Tell page server it can receive WAL from this WAL safekeeper + // FIXME: If there are multiple safekeepers, they will all inform + // the page server. Only the last "notification" will stay in effect. + // So it's pretty random which safekeeper the page server will connect to + .args(&["--pageserver", "127.0.0.1:64000"]) .arg("-d") .arg("-n") .status() diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index 316a098afe..04ca933d74 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -2,6 +2,8 @@ use control_plane::compute::ComputeControlPlane; use control_plane::storage::TestStorageControlPlane; use control_plane::local_env; +use control_plane::local_env::PointInTime; +use pageserver::ZTimelineId; use rand::Rng; use std::sync::Arc; @@ -23,7 +25,7 @@ fn test_acceptors_normal_work() { node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( @@ -44,23 +46,39 @@ fn test_acceptors_normal_work() { // check wal files equality } +// Run page server and multiple safekeepers, and multiple compute nodes running +// against different timelines. #[test] -fn test_multitenancy() { - // Start pageserver that reads WAL directly from that postgres +fn test_many_timelines() { + // Initialize a new repository, and set up WAL safekeepers and page server. const REDUNDANCY: usize = 3; - const N_NODES: usize = 5; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + const N_TIMELINES: usize = 5; + let local_env = local_env::test_env("test_many_timelines"); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); - // start postgres + // Create branches + let mut timelines: Vec = Vec::new(); + let maintli = storage_cplane.get_branch_timeline("main"); // main branch + timelines.push(maintli); + let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); + for i in 1..N_TIMELINES { // additional branches + let branchname = format!("experimental{}", i); + local_env::create_branch(&local_env, &branchname, + PointInTime { timelineid: maintli, + lsn: startpoint }).unwrap(); + let tli = storage_cplane.get_branch_timeline(&branchname); + timelines.push(tli); + } + + // start postgres on each timeline let mut nodes = Vec::new(); - let mut proxies = Vec::new(); - for _ in 0..N_NODES { - let node = compute_cplane.new_test_master_node(); - nodes.push(node); - nodes.last().unwrap().start().unwrap(); - proxies.push(nodes.last().unwrap().start_proxy(wal_acceptors.clone())); + for tli in timelines { + let node = compute_cplane.new_test_node(tli); + nodes.push(node.clone()); + node.start().unwrap(); + node.start_proxy(&wal_acceptors); } // create schema @@ -111,7 +129,7 @@ fn test_acceptors_restarts() { node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); let mut failed_node: Option = None; // check basic work with table @@ -172,7 +190,7 @@ fn 
test_acceptors_unavailability() { node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( @@ -250,7 +268,7 @@ fn test_race_conditions() { node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b98cca4ca1..10336d84f5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use std::io; use std::process::exit; use std::thread; use std::fs::{File, OpenOptions}; +use std::path::PathBuf; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -101,11 +102,11 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { if conf.daemonize { info!("daemonizing..."); - let repodir = zenith_repo_dir(); + let repodir = PathBuf::from(zenith_repo_dir()); // There should'n be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fprintf's or backtraces. - let log_filename = repodir.clone() + "pageserver.log"; + let log_filename = repodir.join("pageserver.log"); let stdout = OpenOptions::new() .create(true) .append(true) @@ -118,7 +119,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { .with_context(|| format!("failed to open {:?}", &log_filename))?; let daemonize = Daemonize::new() - .pid_file(repodir.clone() + "/pageserver.pid") + .pid_file(repodir.clone().join("pageserver.pid")) .working_directory(repodir) .stdout(stdout) .stderr(stderr); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3005e5e095..c9b547896c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -37,6 +37,16 @@ impl ZTimelineId { ZTimelineId(b) } + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId { + let mut arr = [0u8; 16]; + buf.copy_to_slice(&mut arr); + ZTimelineId::from(arr) + } + + pub fn as_arr(&self) -> [u8; 16] { + self.0 + } + pub fn to_str(self: &ZTimelineId) -> String { hex::encode(self.0) } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 20b3460d8c..edeba3b21f 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -537,7 +537,7 @@ impl PageCache { self.valid_lsn_condvar.notify_all(); self.last_valid_lsn.store(lsn, Ordering::Relaxed); - self.last_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_record_lsn.store(lsn, Ordering::Relaxed); } // diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index cc972f713e..9240f2f657 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -419,13 +419,18 @@ impl FeMessage { pub fn thread_main(conf: &PageServerConf) { // Create a new thread pool // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. - //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); + // FIXME: It would be nice to keep this single-threaded for debugging purposes, + // but that currently leads to a deadlock: if a GetPage@LSN request arrives + // for an LSN that hasn't been received yet, the thread gets stuck waiting for + // the WAL to arrive. 
If the WAL receiver hasn't been launched yet, i.e + // we haven't received a "callmemaybe" request yet to tell us where to get the + // WAL, we will not have a thread available to process the "callmemaybe" + // request when it does arrive. Using a thread pool alleviates the problem so + // that it doesn't happen in the tests anymore, but in principle it could still + // happen if we receive enough GetPage@LSN requests to consume all of the + // available threads. + //let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap(); + let runtime = runtime::Runtime::new().unwrap(); info!("Starting page server on {}", conf.listen_addr); diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index d8ec810f36..22ab546d5e 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -114,7 +114,7 @@ impl WalStreamDecoder { let hdr = self.decode_XLogLongPageHeaderData(); if hdr.std.xlp_pageaddr != self.lsn { - return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}", + return Err(WalDecodeError::new(&format!("invalid xlog segment header at {:X}/{:X}", self.lsn >> 32, self.lsn & 0xffffffff))); } @@ -131,9 +131,9 @@ impl WalStreamDecoder { let hdr = self.decode_XLogPageHeaderData(); if hdr.xlp_pageaddr != self.lsn { - return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}", + return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}: {:?}", self.lsn >> 32, - self.lsn & 0xffffffff))); + self.lsn & 0xffffffff, hdr))); } // TODO: verify the remaining fields in the header diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 3f8fcd8722..23af8c2ee3 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -217,7 +217,7 @@ async fn walreceiver_main(conf: &PageServerConf, timelineid: ZTimelineId, wal_pr // Now that this record has been handled, let the page cache know that // it is up-to-date to this LSN - pcache.advance_last_valid_lsn(lsn); + pcache.advance_last_record_lsn(lsn); } else { break; } diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index 00576f055e..38a32bb730 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -10,15 +10,14 @@ use std::thread; use std::{fs::File, fs::OpenOptions}; use clap::{App, Arg}; +use anyhow::Result; use slog::Drain; -use pageserver::ZTimelineId; - use walkeeper::wal_service; use walkeeper::WalAcceptorConf; -fn main() -> Result<(), io::Error> { +fn main() -> Result<()> { let arg_matches = App::new("Zenith wal_acceptor") .about("Store WAL stream to local file system and push it to WAL receivers") .arg( @@ -29,10 +28,11 @@ fn main() -> Result<(), io::Error> { .help("Path to the WAL acceptor data directory"), ) .arg( - Arg::with_name("timelineid") - .long("timelineid") + Arg::with_name("systemid") + .long("systemid") .takes_value(true) - .help("zenith timeline id"), + .required(true) + .help("PostgreSQL system id, from pg_control"), ) .arg( Arg::with_name("listen") @@ -64,21 +64,23 @@ fn main() -> Result<(), io::Error> { ) .get_matches(); + let systemid_str = arg_matches.value_of("systemid").unwrap(); + let systemid = u64::from_str_radix(systemid_str, 10)?; + let mut conf = WalAcceptorConf { data_dir: PathBuf::from("./"), - timelineid: ZTimelineId::from([0u8; 16]), + systemid: systemid, daemonize: false, no_sync: false, pageserver_addr: None, - listen_addr: "127.0.0.1:5454".parse().unwrap(), + listen_addr: 
"127.0.0.1:5454".parse()?, }; if let Some(dir) = arg_matches.value_of("datadir") { conf.data_dir = PathBuf::from(dir); - } - if let Some(timelineid_str) = arg_matches.value_of("timelineid") { - conf.timelineid = ZTimelineId::from_str(timelineid_str).unwrap(); + // change into the data directory. + std::env::set_current_dir(&conf.data_dir)?; } if arg_matches.is_present("no-sync") { @@ -100,7 +102,7 @@ fn main() -> Result<(), io::Error> { start_wal_acceptor(conf) } -fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { +fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { // Initialize logger let _scope_guard = init_logging(&conf)?; let _log_guard = slog_stdlog::init().unwrap(); @@ -115,16 +117,16 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { let stdout = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("wal_acceptor.log")) + .open("wal_acceptor.log") .unwrap(); let stderr = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("wal_acceptor.log")) + .open("wal_acceptor.log") .unwrap(); let daemonize = Daemonize::new() - .pid_file(conf.data_dir.join("wal_acceptor.pid")) + .pid_file("wal_acceptor.pid") .working_directory(Path::new(".")) .stdout(stdout) .stderr(stderr); diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 5f2f557b49..784ab730b6 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -6,12 +6,12 @@ mod pq_protocol; pub mod wal_service; pub mod xlog_utils; -use pageserver::ZTimelineId; +use crate::pq_protocol::SystemId; #[derive(Debug, Clone)] pub struct WalAcceptorConf { pub data_dir: PathBuf, - pub timelineid: ZTimelineId, + pub systemid: SystemId, pub daemonize: bool, pub no_sync: bool, pub listen_addr: SocketAddr, diff --git a/walkeeper/src/pq_protocol.rs b/walkeeper/src/pq_protocol.rs index 299b830d5e..8179a734b9 100644 --- a/walkeeper/src/pq_protocol.rs +++ b/walkeeper/src/pq_protocol.rs @@ -1,5 +1,6 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use pageserver::ZTimelineId; use std::io; use std::str; @@ -37,7 +38,7 @@ pub enum BeMessage<'a> { pub struct FeStartupMessage { pub version: u32, pub kind: StartupRequestCode, - pub system_id: SystemId, + pub timelineid: ZTimelineId, } #[derive(Debug)] @@ -83,26 +84,33 @@ impl FeStartupMessage { let params_str = str::from_utf8(¶ms_bytes).unwrap(); let params = params_str.split('\0'); let mut options = false; - let mut system_id: u64 = 0; + let mut timelineid: Option = None; for p in params { if p == "options" { options = true; } else if options { for opt in p.split(' ') { - if opt.starts_with("system.id=") { - system_id = opt[10..].parse::().unwrap(); + if opt.starts_with("ztimelineid=") { + // FIXME: rethrow parsing error, don't unwrap + timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap()); break; } } break; } } + if timelineid.is_none() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "timelineid is required", + )); + } buf.advance(len as usize); Ok(Some(FeMessage::StartupMessage(FeStartupMessage { version, kind, - system_id, + timelineid: timelineid.unwrap(), }))) } } diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 1a8f764598..3dc873e27b 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -33,6 +33,7 @@ use tokio_postgres::{connect, Error, NoTls}; use crate::pq_protocol::*; use crate::xlog_utils::*; use crate::WalAcceptorConf; +use pageserver::ZTimelineId; type FullTransactionId = u64; @@ 
-64,7 +65,8 @@ struct ServerInfo { protocol_version: u32, /* proxy-safekeeper protocol version */ pg_version: u32, /* Postgres server version */ node_id: NodeId, - system_id: SystemId, /* Postgres system identifier */ + system_id: SystemId, + timeline_id: ZTimelineId, /* Zenith timelineid */ wal_end: XLogRecPtr, timeline: TimeLineID, wal_seg_size: u32, @@ -146,8 +148,8 @@ struct SharedState { * Database instance (tenant) */ #[derive(Debug)] -pub struct System { - id: SystemId, +pub struct Timeline { + timelineid: ZTimelineId, mutex: Mutex, cond: Notify, /* conditional variable used to notify wal senders */ } @@ -157,7 +159,7 @@ pub struct System { */ #[derive(Debug)] struct Connection { - system: Option>, + timeline: Option>, stream: TcpStream, /* Postgres connection */ inbuf: BytesMut, /* input buffer */ outbuf: BytesMut, /* output buffer */ @@ -211,6 +213,7 @@ impl Serializer for ServerInfo { buf.put_u32_le(self.pg_version); self.node_id.pack(buf); buf.put_u64_le(self.system_id); + buf.put_slice(&self.timeline_id.as_arr()); buf.put_u64_le(self.wal_end); buf.put_u32_le(self.timeline); buf.put_u32_le(self.wal_seg_size); @@ -221,6 +224,7 @@ impl Serializer for ServerInfo { pg_version: buf.get_u32_le(), node_id: NodeId::unpack(buf), system_id: buf.get_u64_le(), + timeline_id: ZTimelineId::get_from_buf(buf), wal_end: buf.get_u64_le(), timeline: buf.get_u32_le(), wal_seg_size: buf.get_u32_le(), @@ -278,6 +282,7 @@ impl SafeKeeperInfo { pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ node_id: NodeId { term: 0, uuid: 0 }, system_id: 0, /* Postgres system identifier */ + timeline_id: ZTimelineId::from([0u8; 16]), wal_end: 0, timeline: 0, wal_seg_size: 0, @@ -349,7 +354,7 @@ impl Serializer for SafeKeeperResponse { } lazy_static! { - pub static ref SYSTEMS: Mutex>> = Mutex::new(HashMap::new()); + pub static ref TIMELINES: Mutex>> = Mutex::new(HashMap::new()); } pub fn thread_main(conf: WalAcceptorConf) { @@ -389,8 +394,8 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> { } } -impl System { - pub fn new(id: SystemId) -> System { +impl Timeline { + pub fn new(timelineid: ZTimelineId) -> Timeline { let shared_state = SharedState { commit_lsn: 0, info: SafeKeeperInfo::new(), @@ -401,8 +406,8 @@ impl System { catalog_xmin: u64::MAX, }, }; - System { - id, + Timeline { + timelineid, mutex: Mutex::new(shared_state), cond: Notify::new(), } @@ -444,11 +449,20 @@ impl System { } // Load and lock control file (prevent running more than one instance of safekeeper) - fn load_control_file(&self, conf: &WalAcceptorConf) { + fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> { + + let mut shared_state = self.mutex.lock().unwrap(); + + if shared_state.control_file.is_some() { + info!("control file for timeline {} is already open", self.timelineid); + return Ok(()); + } + let control_file_path = conf .data_dir - .join(self.id.to_string()) + .join(self.timelineid.to_string()) .join(CONTROL_FILE_NAME); + info!("loading control file {}", control_file_path.display()); match OpenOptions::new() .read(true) .write(true) @@ -460,13 +474,12 @@ impl System { match file.try_lock_exclusive() { Ok(()) => {} Err(e) => { - panic!( + io_error!( "Control file {:?} is locked by some other process: {}", &control_file_path, e ); } } - let mut shared_state = self.mutex.lock().unwrap(); shared_state.control_file = Some(file); const SIZE: usize = mem::size_of::(); @@ -483,10 +496,10 @@ impl System { let my_info = SafeKeeperInfo::unpack(&mut input); if my_info.magic != SK_MAGIC { - 
panic!("Invalid control file magic: {}", my_info.magic); + io_error!("Invalid control file magic: {}", my_info.magic); } if my_info.format_version != SK_FORMAT_VERSION { - panic!( + io_error!( "Incompatible format version: {} vs. {}", my_info.format_version, SK_FORMAT_VERSION ); @@ -501,6 +514,7 @@ impl System { ); } } + Ok(()) } fn save_control_file(&self, sync: bool) -> Result<()> { @@ -521,7 +535,7 @@ impl System { impl Connection { pub fn new(socket: TcpStream, conf: &WalAcceptorConf) -> Connection { Connection { - system: None, + timeline: None, stream: socket, inbuf: BytesMut::with_capacity(10 * 1024), outbuf: BytesMut::with_capacity(10 * 1024), @@ -530,8 +544,8 @@ impl Connection { } } - fn system(&self) -> Arc { - self.system.as_ref().unwrap().clone() + fn timeline(&self) -> Arc { + self.timeline.as_ref().unwrap().clone() } async fn run(&mut self) -> Result<()> { @@ -563,12 +577,13 @@ impl Connection { "no_user", ); let callme = format!( - "callmemaybe {} host={} port={} replication=1 options='-c system.id={}'", - self.conf.timelineid, + "callmemaybe {} host={} port={} options='-c ztimelineid={}'", + self.timeline().timelineid, self.conf.listen_addr.ip(), self.conf.listen_addr.port(), - self.system().get_info().server.system_id, + self.timeline().timelineid ); + info!("requesting page server to connect to us: start {} {}", ps_connstr, callme); let (client, connection) = connect(&ps_connstr, NoTls).await?; // The connection object performs the actual communication with the database, @@ -583,22 +598,15 @@ impl Connection { Ok(()) } - fn set_system(&mut self, id: SystemId) -> Result<()> { - let mut systems = SYSTEMS.lock().unwrap(); - if id == 0 { - // non-multitenant configuration: just a single instance - if let Some(system) = systems.values().next() { - self.system = Some(system.clone()); - return Ok(()); - } - io_error!("No active instances"); + fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> { + let mut timelines = TIMELINES.lock().unwrap(); + if !timelines.contains_key(&timelineid) { + let timeline_dir = timelineid.to_str(); + info!("creating timeline dir {}", &timeline_dir); + fs::create_dir_all(&timeline_dir)?; + timelines.insert(timelineid, Arc::new(Timeline::new(timelineid))); } - if !systems.contains_key(&id) { - let system_dir = self.conf.data_dir.join(id.to_string()); - fs::create_dir_all(system_dir)?; - systems.insert(id, Arc::new(System::new(id))); - } - self.system = Some(systems.get(&id).unwrap().clone()); + self.timeline = Some(timelines.get(&timelineid).unwrap().clone()); Ok(()) } @@ -607,14 +615,16 @@ impl Connection { // Receive information about server let server_info = self.read_req::().await?; info!( - "Start handshake with wal_proposer {} sysid {}", + "Start handshake with wal_proposer {} sysid {} timeline {}", self.stream.peer_addr()?, - server_info.system_id + server_info.system_id, + server_info.timeline_id, ); - self.set_system(server_info.system_id)?; - self.system().load_control_file(&self.conf); + // FIXME: also check that the system identifier matches + self.set_timeline(server_info.timeline_id)?; + self.timeline().load_control_file(&self.conf)?; - let mut my_info = self.system().get_info(); + let mut my_info = self.timeline().get_info(); /* Check protocol compatibility */ if server_info.protocol_version != SK_PROTOCOL_VERSION { @@ -663,9 +673,9 @@ impl Connection { ); } my_info.server.node_id = prop.node_id; - self.system().set_info(&my_info); + self.timeline().set_info(&my_info); /* Need to persist our vote first */ - 
self.system().save_control_file(true)?; + self.timeline().save_control_file(true)?; let mut flushed_restart_lsn: XLogRecPtr = 0; let wal_seg_size = server_info.wal_seg_size as usize; @@ -684,8 +694,8 @@ impl Connection { } info!( - "Start streaming from server {} address {:?}", - server_info.system_id, + "Start streaming from timeline {} address {:?}", + server_info.timeline_id, self.stream.peer_addr()? ); @@ -707,6 +717,9 @@ impl Connection { let rec_size = (end_pos - start_pos) as usize; assert!(rec_size <= MAX_SEND_SIZE); + debug!("received for {} bytes between {:X}/{:X} and {:X}/{:X}", + rec_size, start_pos >> 32, start_pos & 0xffffffff, end_pos >> 32, end_pos & 0xffffffff); + /* Receive message body */ self.inbuf.resize(rec_size, 0u8); self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?; @@ -737,7 +750,7 @@ impl Connection { * when restart_lsn delta exceeds WAL segment size. */ sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn; - self.system().save_control_file(sync_control_file)?; + self.timeline().save_control_file(sync_control_file)?; if sync_control_file { flushed_restart_lsn = my_info.restart_lsn; @@ -748,7 +761,7 @@ impl Connection { let resp = SafeKeeperResponse { epoch: my_info.epoch, flush_lsn: end_pos, - hs_feedback: self.system().get_hs_feedback(), + hs_feedback: self.timeline().get_hs_feedback(), }; self.start_sending(); resp.pack(&mut self.outbuf); @@ -758,7 +771,7 @@ impl Connection { * Ping wal sender that new data is available. * FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper. */ - self.system() + self.timeline() .notify_wal_senders(min(req.commit_lsn, end_pos)); } Ok(()) @@ -809,7 +822,7 @@ impl Connection { } // - // Send WAL to replica or WAL sender using standard libpq replication protocol + // Send WAL to replica or WAL receiver using standard libpq replication protocol // async fn send_wal(&mut self) -> Result<()> { info!("WAL sender to {:?} is started", self.stream.peer_addr()?); @@ -830,7 +843,7 @@ impl Connection { BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); self.send().await?; self.init_done = true; - self.set_system(m.system_id)?; + self.set_timeline(m.timelineid)?; } StartupRequestCode::Cancel => return Ok(()), } @@ -863,7 +876,7 @@ impl Connection { let (start_pos, timeline) = self.find_end_of_wal(false); let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32); let tli = timeline.to_string(); - let sysid = self.system().get_info().server.system_id.to_string(); + let sysid = self.timeline().get_info().server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli_bytes = tli.as_bytes(); let sysid_bytes = sysid.as_bytes(); @@ -919,7 +932,7 @@ impl Connection { } else { 0 }; - let wal_seg_size = self.system().get_info().server.wal_seg_size as usize; + let wal_seg_size = self.timeline().get_info().server.wal_seg_size as usize; if wal_seg_size == 0 { io_error!("Can not start replication before connecting to wal_proposer"); } @@ -937,15 +950,6 @@ impl Connection { BeMessage::write(&mut self.outbuf, &BeMessage::Copy); self.send().await?; - /* - * Always start streaming at the beginning of a segment - * - * FIXME: It is common practice to start streaming at the beginning of - * the segment, but it should be up to the client to decide that. We - * shouldn't enforce that here. 
- */ - start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64; - let mut end_pos: XLogRecPtr; let mut commit_lsn: XLogRecPtr; let mut wal_file: Option = None; @@ -962,19 +966,18 @@ impl Connection { end_pos = stop_pos; } else { /* normal mode */ + let timeline = self.timeline(); loop { // Rust doesn't allow to grab async result from mutex scope - let system = self.system(); - let notified = system.cond.notified(); { - let shared_state = system.mutex.lock().unwrap(); + let shared_state = timeline.mutex.lock().unwrap(); commit_lsn = shared_state.commit_lsn; if start_pos < commit_lsn { end_pos = commit_lsn; break; } } - notified.await; + timeline.cond.notified().await; } } if end_pos == END_REPLICATION_MARKER { @@ -985,7 +988,7 @@ impl Connection { Ok(0) => break, Ok(_) => match self.parse_message()? { Some(FeMessage::CopyData(m)) => self - .system() + .timeline() .add_hs_feedback(HotStandbyFeedback::parse(&m.body)), _ => {} }, @@ -1006,7 +1009,7 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone() + ".partial"); if let Ok(opened_file) = File::open(&wal_file_path) { file = opened_file; @@ -1014,7 +1017,7 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name); match File::open(&wal_file_path) { Ok(opened_file) => file = opened_file, @@ -1036,6 +1039,8 @@ impl Connection { let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size; let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE; let data_end = data_start + send_size; + + file.seek(SeekFrom::Start(xlogoff as u64))?; file.read_exact(&mut self.outbuf[data_start..data_end])?; self.outbuf[0] = b'd'; BigEndian::write_u32( @@ -1050,6 +1055,9 @@ impl Connection { self.stream.write_all(&self.outbuf[0..msg_size]).await?; start_pos += send_size as u64; + debug!("Sent WAL to page server up to {:X}/{:>08X}", + (end_pos>>32) as u32, end_pos as u32); + if XLogSegmentOffset(start_pos, wal_seg_size) != 0 { wal_file = Some(file); } @@ -1104,12 +1112,12 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_str()) .join(wal_file_name.clone()); let wal_file_partial_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_str()) .join(wal_file_name.clone() + ".partial"); { @@ -1172,7 +1180,7 @@ impl Connection { fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) { find_end_of_wal( &self.conf.data_dir, - self.system().get_info().server.wal_seg_size as usize, + self.timeline().get_info().server.wal_seg_size as usize, precise, ) } From d047a3abf7317a65560572d97b79c84a1665488c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Apr 2021 17:57:06 +0300 Subject: [PATCH 09/15] Fixes, per Eric's and Konstantin's comments --- control_plane/src/compute.rs | 77 +++++++++++----------- control_plane/src/local_env.rs | 99 ++++++++++++++-------------- control_plane/src/storage.rs | 1 + pageserver/src/lib.rs | 21 +++--- pageserver/src/page_service.rs | 9 +-- pageserver/src/restore_local_repo.rs | 73 +++++++++----------- pageserver/src/waldecoder.rs | 53 +++++---------- walkeeper/src/bin/wal_acceptor.rs | 2 +- walkeeper/src/pq_protocol.rs | 1 + walkeeper/src/wal_service.rs | 9 ++- zenith/src/main.rs | 28 ++++---- 11 files changed, 171 insertions(+), 202 deletions(-) diff --git 
a/control_plane/src/compute.rs b/control_plane/src/compute.rs index c2b29e7397..91ad2ba805 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -73,8 +73,8 @@ impl ComputeControlPlane { } } - // Connect to a page server, get base backup, and untar it to initialize a - // new data directory + /// Connect to a page server, get base backup, and untar it to initialize a + /// new data directory pub fn new_from_page_server(&mut self, is_test: bool, timelineid: ZTimelineId) -> Result> { let node_id = self.nodes.len() as u32 + 1; @@ -215,7 +215,7 @@ impl PostgresNode { println!( "Extracting base backup to create postgres instance: path={} port={}", - pgdata.to_str().unwrap(), + pgdata.display(), self.address.port() ); @@ -225,66 +225,64 @@ impl PostgresNode { } let sql = format!("basebackup {}", self.timelineid); - let mut client = self.pageserver.page_server_psql_client()?; - println!("connected to page server"); + let mut client = self.pageserver.page_server_psql_client().with_context(|| "connecting to page erver failed")?; - fs::create_dir_all(&pgdata)?; - fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).unwrap(); + fs::create_dir_all(&pgdata) + .with_context(|| format!("could not create data directory {}", pgdata.display()))?; + fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)) + .with_context(|| format!("could not set permissions in data directory {}", pgdata.display()))?; - // Also create pg_wal directory, it's not included in the tarball - // FIXME: actually, it is currently. + // FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive. + // But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that + // we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here. //fs::create_dir_all(pgdata.join("pg_wal"))?; - let mut copyreader = client.copy_out(sql.as_str())?; + let mut copyreader = client.copy_out(sql.as_str()) + .with_context(|| "page server 'basebackup' command failed")?; // FIXME: Currently, we slurp the whole tarball into memory, and then extract it, // but we really should do this: //let mut ar = tar::Archive::new(copyreader); let mut buf = vec![]; - copyreader.read_to_end(&mut buf)?; - println!("got tarball of size {}", buf.len()); + copyreader.read_to_end(&mut buf) + .with_context(|| "reading base backup from page server failed")?; let mut ar = tar::Archive::new(buf.as_slice()); - ar.unpack(&pgdata)?; + ar.unpack(&pgdata) + .with_context(|| "extracting page backup failed")?; // listen for selected port self.append_conf( "postgresql.conf", - format!( + &format!( "max_wal_senders = 10\n\ - max_replication_slots = 10\n\ - hot_standby = on\n\ - shared_buffers = 1MB\n\ - max_connections = 100\n\ - wal_level = replica\n\ - listen_addresses = '{address}'\n\ - port = {port}\n", + max_replication_slots = 10\n\ + hot_standby = on\n\ + shared_buffers = 1MB\n\ + max_connections = 100\n\ + wal_level = replica\n\ + listen_addresses = '{address}'\n\ + port = {port}\n", address = self.address.ip(), port = self.address.port() - ) - .as_str(), - ); + )); // Never clean up old WAL. TODO: We should use a replication // slot or something proper, to prevent the compute node // from removing WAL that hasn't been streamed to the safekeepr or // page server yet. But this will do for now. 
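        // For illustration (the address, port and timeline id here are example
        // values; the real ones come from self.pageserver and self.timelineid):
        // after the two append_conf() calls below, the compute node's
        // postgresql.conf ends with lines such as
        //
        //     wal_keep_size='10TB'
        //     page_server_connstring = 'host=127.0.0.1 port=64000'
        //     zenith_timeline='bc62e7d612d0e6fe8f99a6dd2f281f9d'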
self.append_conf("postgresql.conf", - format!("wal_keep_size='10TB'\n") - .as_str(), - ); + &format!("wal_keep_size='10TB'\n")); // Connect it to the page server. // Configure that node to take pages from pageserver self.append_conf("postgresql.conf", - format!("page_server_connstring = 'host={} port={}'\n\ - zenith_timeline='{}'\n", - self.pageserver.address().ip(), - self.pageserver.address().port(), - self.timelineid, - ) - .as_str(), - ); + &format!("page_server_connstring = 'host={} port={}'\n\ + zenith_timeline='{}'\n", + self.pageserver.address().ip(), + self.pageserver.address().port(), + self.timelineid + )); Ok(()) } @@ -317,6 +315,7 @@ impl PostgresNode { fn pg_ctl(&self, args: &[&str]) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl = Command::new(pg_ctl_path) .args( [ @@ -332,13 +331,11 @@ impl PostgresNode { ) .env_clear() .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status()?; - + .status().with_context(|| "pg_ctl failed")?; if !pg_ctl.success() { anyhow::bail!("pg_ctl failed"); - } else { - Ok(()) } + Ok(()) } pub fn start(&self) -> Result<()> { @@ -404,7 +401,7 @@ impl PostgresNode { pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode { let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy"); match Command::new(proxy_path.as_path()) - .args(&["--ztimelineid", &self.timelineid.to_str()]) + .args(&["--ztimelineid", &self.timelineid.to_string()]) .args(&["-s", wal_acceptors]) .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5ac5cb8fd2..e2c310f733 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,7 +12,6 @@ use bytes::Bytes; use rand::Rng; use anyhow::Context; -use hex; use serde_derive::{Deserialize, Serialize}; use anyhow::Result; @@ -53,10 +52,10 @@ impl LocalEnv { } } -fn zenith_repo_dir() -> String { +fn zenith_repo_dir() -> PathBuf { // Find repository path match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => String::from(val.to_str().unwrap()), + Some(val) => PathBuf::from(val.to_str().unwrap()), None => ".zenith".into(), } } @@ -66,7 +65,7 @@ fn zenith_repo_dir() -> String { // pub fn init() -> Result<()> { // check if config already exists - let repo_path = PathBuf::from(zenith_repo_dir()); + let repo_path = zenith_repo_dir(); if repo_path.exists() { anyhow::bail!("{} already exists. 
Perhaps already initialized?", repo_path.to_str().unwrap()); @@ -113,19 +112,19 @@ pub fn init() -> Result<()> { pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> { - let repopath = String::from(local_env.repo_path.to_str().unwrap()); - fs::create_dir(&repopath).with_context(|| format!("could not create directory {}", repopath))?; - fs::create_dir(repopath.clone() + "/pgdatadirs")?; - fs::create_dir(repopath.clone() + "/timelines")?; - fs::create_dir(repopath.clone() + "/refs")?; - fs::create_dir(repopath.clone() + "/refs/branches")?; - fs::create_dir(repopath.clone() + "/refs/tags")?; - println!("created directory structure in {}", repopath); + let repopath = &local_env.repo_path; + fs::create_dir(&repopath).with_context(|| format!("could not create directory {}", repopath.display()))?; + fs::create_dir(repopath.join("pgdatadirs"))?; + fs::create_dir(repopath.join("timelines"))?; + fs::create_dir(repopath.join("refs"))?; + fs::create_dir(repopath.join("refs").join("branches"))?; + fs::create_dir(repopath.join("refs").join("tags"))?; + println!("created directory structure in {}", repopath.display()); // Create initial timeline let tli = create_timeline(&local_env, None)?; - let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); - println!("created initial timeline {}", timelinedir); + let timelinedir = repopath.join("timelines").join(tli.to_string()); + println!("created initial timeline {}", timelinedir.display()); // Run initdb // @@ -151,7 +150,7 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> let lsnstr = format!("{:016X}", lsn); // Move the initial WAL file - fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.clone() + "/wal/000000010000000000000001.partial")?; + fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.join("wal").join("000000010000000000000001.partial"))?; println!("moved initial WAL file"); // Remove pg_wal @@ -161,13 +160,13 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> force_crash_recovery(&PathBuf::from("tmp"))?; println!("updated pg_control"); - let target = timelinedir.clone() + "/snapshots/" + &lsnstr; + let target = timelinedir.join("snapshots").join(&lsnstr); fs::rename("tmp", &target)?; - println!("moved 'tmp' to {}", &target); + println!("moved 'tmp' to {}", target.display()); // Create 'main' branch to refer to the initial timeline - let data = hex::encode(tli); - fs::write(repopath.clone() + "/refs/branches/main", data)?; + let data = tli.to_string(); + fs::write(repopath.join("refs").join("branches").join("main"), data)?; println!("created main branch"); // Also update the system id in the LocalEnv @@ -175,9 +174,9 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> // write config let toml = toml::to_string(&local_env)?; - fs::write(repopath.clone() + "/config", toml)?; + fs::write(repopath.join("config"), toml)?; - println!("new zenith repository was created in {}", &repopath); + println!("new zenith repository was created in {}", repopath.display()); Ok(()) } @@ -195,9 +194,7 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> fn force_crash_recovery(datadir: &Path) -> Result<()> { // Read in the control file - let mut controlfilepath = datadir.to_path_buf(); - controlfilepath.push("global"); - controlfilepath.push("pg_control"); + let controlfilepath = datadir.to_path_buf().join("global").join("pg_control"); let mut controlfile = postgres_ffi::decode_pg_control( Bytes::from(fs::read(controlfilepath.as_path())?))?; @@ -258,28 +255,29 @@ pub struct PointInTime { pub 
lsn: u64 } -fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Result<[u8; 16]> { - let repopath = String::from(local_env.repo_path.to_str().unwrap()); +fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Result { + let repopath = &local_env.repo_path; // Create initial timeline - let mut tli = [0u8; 16]; - rand::thread_rng().fill(&mut tli); + let mut tli_buf = [0u8; 16]; + rand::thread_rng().fill(&mut tli_buf); + let timelineid = ZTimelineId::from(tli_buf); - let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + let timelinedir = repopath.join("timelines").join(timelineid.to_string()); - fs::create_dir(timelinedir.clone())?; - fs::create_dir(timelinedir.clone() + "/snapshots")?; - fs::create_dir(timelinedir.clone() + "/wal")?; + fs::create_dir(&timelinedir)?; + fs::create_dir(&timelinedir.join("snapshots"))?; + fs::create_dir(&timelinedir.join("wal"))?; if let Some(ancestor) = ancestor { let data = format!("{}@{:X}/{:X}", - hex::encode(ancestor.timelineid.to_str()), + ancestor.timelineid, ancestor.lsn >> 32, ancestor.lsn & 0xffffffff); - fs::write(timelinedir + "/ancestor", data)?; + fs::write(timelinedir.join("ancestor"), data)?; } - Ok(tli) + Ok(timelineid) } // Parse an LSN in the format used in filenames @@ -292,26 +290,26 @@ fn parse_lsn(s: &str) -> std::result::Result { // Create a new branch in the repository (for the "zenith branch" subcommand) pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointInTime) -> Result<()> { - let repopath = String::from(local_env.repo_path.to_str().unwrap()); + let repopath = &local_env.repo_path; // create a new timeline for it let newtli = create_timeline(local_env, Some(startpoint))?; - let newtimelinedir = format!("{}/timelines/{}", repopath, &hex::encode(newtli)); + let newtimelinedir = repopath.join("timelines").join(newtli.to_string()); - let data = hex::encode(newtli); - fs::write(format!("{}/refs/branches/{}", repopath, branchname), data)?; + let data = newtli.to_string(); + fs::write(repopath.join("refs").join("branches").join(branchname), data)?; // Copy the latest snapshot (TODO: before the startpoint) and all WAL // TODO: be smarter and avoid the copying... 
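    // Rough sketch of the repository layout this works against (directory names
    // come from init_repo() and create_timeline() above; <new-tli> and
    // <ancestor-tli> are placeholders):
    //
    //     <repo>/refs/branches/<branchname>         -- file holding the new timeline id in hex
    //     <repo>/timelines/<new-tli>/ancestor       -- "<ancestor-tli>@<hi>/<lo>", written by create_timeline()
    //     <repo>/timelines/<new-tli>/snapshots/...  -- latest snapshot copied from the start point (below)
    //     <repo>/timelines/<new-tli>/wal/...        -- WAL copied from the ancestor timeline (below)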
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?; let copy_opts = fs_extra::dir::CopyOptions::new(); - fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.clone() + "/snapshots", ©_opts)?; + fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?; - let oldtimelinedir = format!("{}/timelines/{}", &repopath, startpoint.timelineid.to_str()); + let oldtimelinedir = repopath.join("timelines").join(startpoint.timelineid.to_string()); let mut copy_opts = fs_extra::dir::CopyOptions::new(); copy_opts.content_only = true; - fs_extra::dir::copy(oldtimelinedir + "/wal/", - newtimelinedir.clone() + "/wal", + fs_extra::dir::copy(oldtimelinedir.join("wal"), + newtimelinedir.join("wal"), ©_opts)?; Ok(()) @@ -319,8 +317,8 @@ pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointIn // Find the end of valid WAL in a wal directory pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result { - let repopath = String::from(local_env.repo_path.to_str().unwrap()); - let waldir = PathBuf::from(format!("{}/timelines/{}/wal", repopath, timeline.to_str())); + let repopath = &local_env.repo_path; + let waldir = repopath.join("timelines").join(timeline.to_string()).join("wal"); let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true); @@ -329,15 +327,14 @@ pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result Result<(u64, PathBuf)> { - let repopath = String::from(local_env.repo_path.to_str().unwrap()); + let repopath = &local_env.repo_path; - let timelinedir = repopath + "/timelines/" + &timeline.to_str(); - let snapshotsdir = timelinedir.clone() + "/snapshots"; - let paths = fs::read_dir(&snapshotsdir).unwrap(); + let snapshotsdir = repopath.join("timelines").join(timeline.to_string()).join("snapshots"); + let paths = fs::read_dir(&snapshotsdir)?; let mut maxsnapshot: u64 = 0; let mut snapshotdir: Option = None; for path in paths { - let path = path.unwrap(); + let path = path?; let filename = path.file_name().to_str().unwrap().to_owned(); if let Ok(lsn) = parse_lsn(&filename) { maxsnapshot = std::cmp::max(lsn, maxsnapshot); @@ -346,7 +343,7 @@ fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<( } if maxsnapshot == 0 { // TODO: check ancestor timeline - anyhow::bail!("no snapshot found in {}", snapshotsdir); + anyhow::bail!("no snapshot found in {}", snapshotsdir.display()); } Ok((maxsnapshot, snapshotdir.unwrap())) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index f2dbf8dc1a..5e5e2bff51 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -4,6 +4,7 @@ use std::net::SocketAddr; use std::net::TcpStream; use std::path::{Path, PathBuf}; use std::process::Command; +use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c9b547896c..c738f90f41 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,4 +1,6 @@ use std::net::SocketAddr; +use std::str::FromStr; +use std::fmt; pub mod page_cache; pub mod page_service; @@ -23,9 +25,10 @@ pub struct PageServerConf { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct ZTimelineId([u8; 16]); -impl ZTimelineId { +impl FromStr for ZTimelineId { + type Err = hex::FromHexError; - pub fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { let timelineid = hex::decode(s)?; let mut buf: [u8; 16] = [0u8; 
16]; @@ -33,6 +36,9 @@ impl ZTimelineId { Ok(ZTimelineId(buf)) } +} + +impl ZTimelineId { pub fn from(b: [u8; 16]) -> ZTimelineId { ZTimelineId(b) } @@ -46,14 +52,11 @@ impl ZTimelineId { pub fn as_arr(&self) -> [u8; 16] { self.0 } - - pub fn to_str(self: &ZTimelineId) -> String { - hex::encode(self.0) - } } -impl std::fmt::Display for ZTimelineId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.to_str()) +impl fmt::Display for ZTimelineId { + + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&hex::encode(self.0)) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9240f2f657..6dac213be6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -15,6 +15,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use log::*; use std::io; use std::thread; +use std::str::FromStr; use std::sync::Arc; use regex::Regex; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; @@ -247,14 +248,13 @@ impl FeDescribeMessage { } */ - if kind != 0x53 { // 'S' + if kind != b'S' { return Err(io::Error::new( io::ErrorKind::InvalidInput, "only prepared statmement Describe is implemented", )); } - Ok(FeMessage::Describe(FeDescribeMessage {kind})) } } @@ -262,7 +262,8 @@ impl FeDescribeMessage { // we only support unnamed prepared stmt or portal #[derive(Debug)] struct FeExecuteMessage { - maxrows: i32// max # of rows + /// max # of rows + maxrows: i32 } impl FeExecuteMessage { @@ -469,7 +470,7 @@ impl Connection { buffer: BytesMut::with_capacity(10 * 1024), init_done: false, conf, - runtime: runtime.clone(), + runtime: Arc::clone(runtime), } } diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index c53c04ef92..e7418ac09d 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -45,19 +45,20 @@ const GLOBALTABLESPACE_OID: u32 = 1664; // pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId) -> Result<()> { - let timelinepath = PathBuf::from("timelines").join(&timeline.to_str()); + let timelinepath = PathBuf::from("timelines").join(timeline.to_string()); if !timelinepath.exists() { anyhow::bail!("timeline {} does not exist in the page server's repository"); } // Scan .zenith/timelines//snapshots - let snapshotspath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots"; + let snapshotspath = PathBuf::from("timelines").join(timeline.to_string()).join("snapshots"); let mut last_snapshot_lsn: u64 = 0; for direntry in fs::read_dir(&snapshotspath).unwrap() { - let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned(); + let direntry = direntry?; + let filename = direntry.file_name().to_str().unwrap().to_owned(); let lsn = u64::from_str_radix(&filename, 16)?; last_snapshot_lsn = max(lsn, last_snapshot_lsn); @@ -67,7 +68,7 @@ pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTi } if last_snapshot_lsn == 0 { - error!("could not find valid snapshot in {}", &snapshotspath); + error!("could not find valid snapshot in {}", snapshotspath.display()); // TODO return error? 
} pcache.init_valid_lsn(last_snapshot_lsn); @@ -98,54 +99,42 @@ pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Re fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, snapshot: &str) -> Result<()> { - let snapshotpath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots/" + snapshot; + let snapshotpath = PathBuf::from("timelines").join(timeline.to_string()).join("snapshots").join(snapshot); // Scan 'global' - let paths = fs::read_dir(snapshotpath.clone() + "/global").unwrap(); + for direntry in fs::read_dir(snapshotpath.join("global"))? { + let direntry = direntry?; + match direntry.file_name().to_str() { + None => continue, - for direntry in paths { - let path = direntry.unwrap().path(); - let filename = path.file_name(); - if filename.is_none() { - continue; - } - let filename = filename.unwrap().to_str(); + // These special files appear in the snapshot, but are not needed by the page server + Some("pg_control") => continue, + Some("pg_filenode.map") => continue, - if filename == Some("pg_control") { - continue; + // Load any relation files into the page server + _ => restore_relfile(conf, pcache, timeline, snapshot, GLOBALTABLESPACE_OID, 0, &direntry.path())?, } - if filename == Some("pg_filenode.map") { - continue; - } - - restore_relfile(conf, pcache, timeline, snapshot, GLOBALTABLESPACE_OID, 0, &path)?; } - // Scan 'base' - let paths = fs::read_dir(snapshotpath.clone() + "/base").unwrap(); - for path in paths { - let path = path.unwrap(); - let filename = path.file_name().to_str().unwrap().to_owned(); + // Scan 'base'. It contains database dirs, the database OID is the filename. + // E.g. 'base/12345', where 12345 is the database OID. + for direntry in fs::read_dir(snapshotpath.join("base"))? { + let direntry = direntry?; - // Scan database dirs - let dboid = u32::from_str_radix(&filename, 10)?; + let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?; - let paths = fs::read_dir(path.path()).unwrap(); - for direntry in paths { - let path = direntry.unwrap().path(); - let filename = path.file_name(); - if filename.is_none() { - continue; - } - let filename = filename.unwrap().to_str(); - if filename == Some("PG_VERSION") { - continue; - } - if filename == Some("pg_filenode.map") { - continue; - } + for direntry in fs::read_dir(direntry.path())? 
{ + let direntry = direntry?; + match direntry.file_name().to_str() { + None => continue, - restore_relfile(conf, pcache, timeline, snapshot, DEFAULTTABLESPACE_OID, dboid, &path)?; + // These special files appear in the snapshot, but are not needed by the page server + Some("PG_VERSION") => continue, + Some("pg_filenode.map") => continue, + + // Load any relation files into the page server + _ => restore_relfile(conf, pcache, timeline, snapshot, DEFAULTTABLESPACE_OID, dboid, &direntry.path())?, + } } } diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index 22ab546d5e..8172b875da 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -1,10 +1,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use std::cmp::min; -use std::error::Error; -use std::fmt; - use log::*; +use std::cmp::min; +use thiserror::Error; const XLOG_BLCKSZ: u32 = 8192; @@ -54,28 +51,11 @@ pub struct WalStreamDecoder { } -#[derive(Debug, Clone)] +#[derive(Error, Debug, Clone)] +#[error("{msg} at {lsn}")] pub struct WalDecodeError { msg: String, -} - -impl Error for WalDecodeError { - fn description(&self) -> &str { - &self.msg - } -} -impl WalDecodeError { - fn new(msg: &str) -> WalDecodeError { - WalDecodeError { - msg: msg.to_string(), - } - } -} - -impl fmt::Display for WalDecodeError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "WAL decoding error: {}", self.msg) - } + lsn: u64 } // @@ -100,8 +80,14 @@ impl WalStreamDecoder { self.inputbuf.extend_from_slice(buf); } - // Returns a tuple: - // (end LSN, record) + /// Attempt to decode another WAL record from the input that has been fed to the + /// decoder so far. + /// + /// Returns one of the following: + /// Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself + /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function + /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. 
+ /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { loop { // parse and verify page boundaries as we go @@ -114,9 +100,7 @@ impl WalStreamDecoder { let hdr = self.decode_XLogLongPageHeaderData(); if hdr.std.xlp_pageaddr != self.lsn { - return Err(WalDecodeError::new(&format!("invalid xlog segment header at {:X}/{:X}", - self.lsn >> 32, - self.lsn & 0xffffffff))); + return Err(WalDecodeError { msg: "invalid xlog segment header".into(), lsn: self.lsn }); } // TODO: verify the remaining fields in the header @@ -131,9 +115,7 @@ impl WalStreamDecoder { let hdr = self.decode_XLogPageHeaderData(); if hdr.xlp_pageaddr != self.lsn { - return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}: {:?}", - self.lsn >> 32, - self.lsn & 0xffffffff, hdr))); + return Err(WalDecodeError { msg: "invalid xlog page header".into(), lsn: self.lsn }); } // TODO: verify the remaining fields in the header @@ -159,10 +141,7 @@ impl WalStreamDecoder { self.startlsn = self.lsn; let xl_tot_len = self.inputbuf.get_u32_le(); if xl_tot_len < SizeOfXLogRecord { - return Err(WalDecodeError::new(&format!("invalid xl_tot_len {} at {:X}/{:X}", - xl_tot_len, - self.lsn >> 32, - self.lsn & 0xffffffff))); + return Err(WalDecodeError {msg: format!("invalid xl_tot_len {}", xl_tot_len), lsn: self.lsn }); } self.lsn += 4; diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index 38a32bb730..fcc475826d 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -65,7 +65,7 @@ fn main() -> Result<()> { .get_matches(); let systemid_str = arg_matches.value_of("systemid").unwrap(); - let systemid = u64::from_str_radix(systemid_str, 10)?; + let systemid: u64 = systemid_str.parse()?; let mut conf = WalAcceptorConf { data_dir: PathBuf::from("./"), diff --git a/walkeeper/src/pq_protocol.rs b/walkeeper/src/pq_protocol.rs index 8179a734b9..f6e18d9aa4 100644 --- a/walkeeper/src/pq_protocol.rs +++ b/walkeeper/src/pq_protocol.rs @@ -3,6 +3,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use pageserver::ZTimelineId; use std::io; use std::str; +use std::str::FromStr; pub type Oid = u32; pub type SystemId = u64; diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 3dc873e27b..54e28cda5b 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -601,9 +601,8 @@ impl Connection { fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> { let mut timelines = TIMELINES.lock().unwrap(); if !timelines.contains_key(&timelineid) { - let timeline_dir = timelineid.to_str(); - info!("creating timeline dir {}", &timeline_dir); - fs::create_dir_all(&timeline_dir)?; + info!("creating timeline dir {}", timelineid); + fs::create_dir_all(timelineid.to_string())?; timelines.insert(timelineid, Arc::new(Timeline::new(timelineid))); } self.timeline = Some(timelines.get(&timelineid).unwrap().clone()); @@ -1112,12 +1111,12 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.timeline().timelineid.to_str()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone()); let wal_file_partial_path = self .conf .data_dir - .join(self.timeline().timelineid.to_str()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone() + ".partial"); { diff --git a/zenith/src/main.rs b/zenith/src/main.rs index de29f386a0..1d0b5b73d4 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,10 +1,11 @@ use std::fs; use std::path::{Path, PathBuf}; use std::process::exit; +use 
std::str::FromStr; use clap::{App, Arg, ArgMatches, SubCommand}; use anyhow::Result; -use anyhow::*; +use anyhow::{anyhow, bail}; use control_plane::{compute::ComputeControlPlane, local_env, storage}; use control_plane::local_env::LocalEnv; @@ -12,10 +13,10 @@ use control_plane::storage::PageServerNode; use pageserver::ZTimelineId; -fn zenith_repo_dir() -> String { +fn zenith_repo_dir() -> PathBuf { // Find repository path match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => String::from(val.to_str().unwrap()), + Some(val) => PathBuf::from(val.to_str().unwrap()), None => ".zenith".into(), } } @@ -239,19 +240,20 @@ fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { } } else { // No arguments, list branches - list_branches(); + list_branches()?; } Ok(()) } -fn list_branches() { +fn list_branches() -> Result<()> { // list branches - let paths = fs::read_dir(zenith_repo_dir() + "/refs/branches").unwrap(); + let paths = fs::read_dir(zenith_repo_dir().join("refs").join("branches"))?; for path in paths { - let filename = path.unwrap().file_name().to_str().unwrap().to_owned(); - println!(" {}", filename); + println!(" {}", path?.file_name().to_str().unwrap()); } + + Ok(()) } // @@ -281,8 +283,8 @@ fn parse_point_in_time(s: &str) -> Result { let lsn: Option; if let Some(lsnstr) = strings.next() { let mut s = lsnstr.split("/"); - let lsn_hi: u64 = s.next().unwrap().parse()?; - let lsn_lo: u64 = s.next().unwrap().parse()?; + let lsn_hi: u64 = s.next().ok_or(anyhow!("invalid LSN in point-in-time specification"))?.parse()?; + let lsn_lo: u64 = s.next().ok_or(anyhow!("invalid LSN in point-in-time specification"))?.parse()?; lsn = Some(lsn_hi << 32 | lsn_lo); } else { @@ -291,7 +293,7 @@ fn parse_point_in_time(s: &str) -> Result { // Check if it's a tag if lsn.is_none() { - let tagpath:PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/tags/" + name); + let tagpath = zenith_repo_dir().join("refs").join("tags").join(name); if tagpath.exists() { let pointstr = fs::read_to_string(tagpath)?; @@ -300,7 +302,7 @@ fn parse_point_in_time(s: &str) -> Result { } // Check if it's a branch // Check if it's branch @ LSN - let branchpath:PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/branches/" + name); + let branchpath = zenith_repo_dir().join("refs").join("branches").join(name); if branchpath.exists() { let pointstr = fs::read_to_string(branchpath)?; @@ -315,7 +317,7 @@ fn parse_point_in_time(s: &str) -> Result { // Check if it's a timelineid // Check if it's timelineid @ LSN - let tlipath:PathBuf = PathBuf::from(zenith_repo_dir() + "/timelines/" + name); + let tlipath = zenith_repo_dir().join("timelines").join(name); if tlipath.exists() { let result = local_env::PointInTime { timelineid: ZTimelineId::from_str(name)?, From f520ef9a6490e0e0f0512a4a0c6e62744ed3357c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Apr 2021 19:26:27 +0300 Subject: [PATCH 10/15] Update 'postgres' submodule --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 5eaf718d3f..b898ad7e3b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 5eaf718d3f2fae700fb4902326a4c1d2cee87b51 +Subproject commit b898ad7e3b9acce72b64bf064257e392f979a659 From d8ab2e00cbf67eb71919c0165221c2d34f2bff8a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Apr 2021 19:34:48 +0300 Subject: [PATCH 11/15] Fix compilation failure caused by last minute change in relsize_inc() --- 
pageserver/src/restore_local_repo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index e7418ac09d..279f13f848 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -203,7 +203,7 @@ fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimel relnode: relnode, forknum: forknum as u8, }; - pcache.relsize_inc(&tag, Some(blknum)); + pcache.relsize_inc(&tag, blknum); Ok(()) } From 7f777a485e6043a4ef0ca701eb7c081135ad05a7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Apr 2021 20:27:22 +0300 Subject: [PATCH 12/15] Fix caching of 'postgres' build in github action. The postgres_ext.h isn't found in vendor/postgres, if the Postgres was restored from cache instead of building it in vendor/postgres. To fix, change include path to point into tmp_install/include where the headers are installed, instead of the vendor/postgres source dir. --- pgbuild.sh | 6 ------ postgres_ffi/build.rs | 9 ++++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pgbuild.sh b/pgbuild.sh index 8ba1e2cbf9..9d4c0baa65 100755 --- a/pgbuild.sh +++ b/pgbuild.sh @@ -31,9 +31,3 @@ export DESTDIR=$REPO_ROOT/tmp_install echo "Installing postgres to $DESTDIR" make install -s - -#Configure postgres in src directory. We need it for postgres_ffi build -echo "Configuring postgres build in place" -cd ../../vendor/postgres/ -./configure CFLAGS='-O0' --enable-debug --enable-cassert \ - --enable-depend --with-libxml --prefix=/ > configure.log \ No newline at end of file diff --git a/postgres_ffi/build.rs b/postgres_ffi/build.rs index 95903bf051..97b3392b3e 100644 --- a/postgres_ffi/build.rs +++ b/postgres_ffi/build.rs @@ -23,7 +23,14 @@ fn main() { .whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .whitelist_type("DBState") - .clang_arg("-I../vendor/postgres/src/include") + // Path the server include dir. It is in tmp_install/include/server, if you did + // "configure --prefix=". But if you used "configure --prefix=/", + // and used DESTDIR to move it into tmp_install, then it's in + // tmp_install/include/postgres/server (that's how the pgbuild.sh script does it). + // 'pg_config --includedir-server' would perhaps be the more proper way to find it, + // but this will do for now. + .clang_arg("-I../tmp_install/include/server") + .clang_arg("-I../tmp_install/include/postgresql/server") // Finish the builder and generate the bindings. .generate() From f3877692030a69da1ef9dff9b500e4e5c5aa376f Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Tue, 20 Apr 2021 10:53:43 -0700 Subject: [PATCH 13/15] add zenith_utils crate This is a place for code that's shared between other crates in this repository. 
--- Cargo.lock | 4 ++++ Cargo.toml | 1 + zenith_utils/Cargo.toml | 7 +++++++ zenith_utils/src/lib.rs | 3 +++ 4 files changed, 15 insertions(+) create mode 100644 zenith_utils/Cargo.toml create mode 100644 zenith_utils/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ad2a518f22..259799d80f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2678,3 +2678,7 @@ dependencies = [ "postgres_ffi", "walkeeper", ] + +[[package]] +name = "zenith_utils" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 3e9c59ce3e..d242faaaee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,4 +6,5 @@ members = [ "zenith", "control_plane", "postgres_ffi", + "zenith_utils", ] diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml new file mode 100644 index 0000000000..77bc1e9ecb --- /dev/null +++ b/zenith_utils/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "zenith_utils" +version = "0.1.0" +authors = ["Eric Seppanen "] +edition = "2018" + +[dependencies] diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs new file mode 100644 index 0000000000..3b833f4c2a --- /dev/null +++ b/zenith_utils/src/lib.rs @@ -0,0 +1,3 @@ +//! zenith_utils is intended to be a place to put code that is shared +//! between other crates in this repository. + From b5a5ea583127042d03f1f08c625e1db5d117b557 Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Tue, 20 Apr 2021 13:21:02 -0700 Subject: [PATCH 14/15] update README: "zenith pageserver start" The old command was "zenith start", which no longer works. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b7c745bcb8..2836a71604 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ cargo build >./target/debug/zenith init # start pageserver -> ./target/debug/zenith start +> ./target/debug/zenith pageserver start Starting pageserver at '127.0.0.1:64000' # create and configure postgres data dir From 92e4f4b3b650c05df615922e66c090a165355ac4 Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Tue, 20 Apr 2021 17:59:56 -0700 Subject: [PATCH 15/15] cargo fmt --- control_plane/src/compute.rs | 92 ++++++++----- control_plane/src/local_env.rs | 133 ++++++++++++------- control_plane/src/storage.rs | 32 +++-- integration_tests/tests/test_pageserver.rs | 16 ++- integration_tests/tests/test_wal_acceptor.rs | 29 ++-- pageserver/src/basebackup.rs | 25 ++-- pageserver/src/bin/pageserver.rs | 46 ++++--- pageserver/src/lib.rs | 8 +- pageserver/src/page_cache.rs | 29 ++-- pageserver/src/page_service.rs | 50 +++---- pageserver/src/restore_local_repo.rs | 91 +++++++++---- pageserver/src/waldecoder.rs | 18 ++- pageserver/src/walreceiver.rs | 62 ++++++--- pageserver/src/walredo.rs | 4 +- postgres_ffi/build.rs | 15 +-- postgres_ffi/src/lib.rs | 23 ++-- walkeeper/src/bin/wal_acceptor.rs | 2 +- walkeeper/src/wal_service.rs | 37 ++++-- zenith/src/main.rs | 67 +++++----- zenith_utils/src/lib.rs | 1 - 20 files changed, 480 insertions(+), 300 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 91ad2ba805..df59bf439a 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,17 +1,17 @@ use std::fs::{self, OpenOptions}; -use std::os::unix::fs::PermissionsExt; +use std::io::{Read, Write}; +use std::net::SocketAddr; use std::net::TcpStream; +use std::os::unix::fs::PermissionsExt; use std::process::Command; use std::sync::Arc; use std::time::Duration; use std::{collections::BTreeMap, path::PathBuf}; -use std::io::{Read, Write}; -use std::net::SocketAddr; -use regex::Regex; -use 
lazy_static::lazy_static; -use tar; use anyhow::{Context, Result}; +use lazy_static::lazy_static; +use regex::Regex; +use tar; use postgres::{Client, NoTls}; @@ -75,7 +75,11 @@ impl ComputeControlPlane { /// Connect to a page server, get base backup, and untar it to initialize a /// new data directory - pub fn new_from_page_server(&mut self, is_test: bool, timelineid: ZTimelineId) -> Result> { + pub fn new_from_page_server( + &mut self, + is_test: bool, + timelineid: ZTimelineId, + ) -> Result> { let node_id = self.nodes.len() as u32 + 1; let node = Arc::new(PostgresNode { @@ -84,7 +88,7 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test, - timelineid + timelineid, }); node.init_from_page_server()?; @@ -157,8 +161,10 @@ impl PostgresNode { pageserver: &Arc, ) -> Result { if !entry.file_type()?.is_dir() { - anyhow::bail!("PostgresNode::from_dir_entry failed: '{}' is not a directory", - entry.path().display()); + anyhow::bail!( + "PostgresNode::from_dir_entry failed: '{}' is not a directory", + entry.path().display() + ); } lazy_static! { @@ -171,9 +177,12 @@ impl PostgresNode { // find out tcp port in config file let cfg_path = entry.path().join("postgresql.conf"); - let config = fs::read_to_string(cfg_path.clone()) - .with_context(|| format!("failed to read config file in {}", - cfg_path.to_str().unwrap()))?; + let config = fs::read_to_string(cfg_path.clone()).with_context(|| { + format!( + "failed to read config file in {}", + cfg_path.to_str().unwrap() + ) + })?; let err_msg = format!( "failed to find port definition in config file {}", @@ -203,14 +212,13 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timelineid + timelineid, }) } // Connect to a page server, get base backup, and untar it to initialize a // new data directory pub fn init_from_page_server(&self) -> Result<()> { - let pgdata = self.pgdata(); println!( @@ -225,26 +233,37 @@ impl PostgresNode { } let sql = format!("basebackup {}", self.timelineid); - let mut client = self.pageserver.page_server_psql_client().with_context(|| "connecting to page erver failed")?; + let mut client = self + .pageserver + .page_server_psql_client() + .with_context(|| "connecting to page erver failed")?; fs::create_dir_all(&pgdata) .with_context(|| format!("could not create data directory {}", pgdata.display()))?; - fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)) - .with_context(|| format!("could not set permissions in data directory {}", pgdata.display()))?; + fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context( + || { + format!( + "could not set permissions in data directory {}", + pgdata.display() + ) + }, + )?; // FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive. // But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that // we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here. 
//fs::create_dir_all(pgdata.join("pg_wal"))?; - let mut copyreader = client.copy_out(sql.as_str()) + let mut copyreader = client + .copy_out(sql.as_str()) .with_context(|| "page server 'basebackup' command failed")?; // FIXME: Currently, we slurp the whole tarball into memory, and then extract it, // but we really should do this: //let mut ar = tar::Archive::new(copyreader); let mut buf = vec![]; - copyreader.read_to_end(&mut buf) + copyreader + .read_to_end(&mut buf) .with_context(|| "reading base backup from page server failed")?; let mut ar = tar::Archive::new(buf.as_slice()); ar.unpack(&pgdata) @@ -264,25 +283,28 @@ impl PostgresNode { port = {port}\n", address = self.address.ip(), port = self.address.port() - )); + ), + ); // Never clean up old WAL. TODO: We should use a replication // slot or something proper, to prevent the compute node // from removing WAL that hasn't been streamed to the safekeepr or // page server yet. But this will do for now. - self.append_conf("postgresql.conf", - &format!("wal_keep_size='10TB'\n")); + self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n")); // Connect it to the page server. // Configure that node to take pages from pageserver - self.append_conf("postgresql.conf", - &format!("page_server_connstring = 'host={} port={}'\n\ + self.append_conf( + "postgresql.conf", + &format!( + "page_server_connstring = 'host={} port={}'\n\ zenith_timeline='{}'\n", - self.pageserver.address().ip(), - self.pageserver.address().port(), - self.timelineid - )); + self.pageserver.address().ip(), + self.pageserver.address().port(), + self.timelineid + ), + ); Ok(()) } @@ -331,7 +353,8 @@ impl PostgresNode { ) .env_clear() .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status().with_context(|| "pg_ctl failed")?; + .status() + .with_context(|| "pg_ctl failed")?; if !pg_ctl.success() { anyhow::bail!("pg_ctl failed"); } @@ -406,10 +429,13 @@ impl PostgresNode { .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") - .stderr(OpenOptions::new() + .stderr( + OpenOptions::new() .create(true) - .append(true) - .open(self.pgdata().join("safekeeper_proxy.log")).unwrap()) + .append(true) + .open(self.pgdata().join("safekeeper_proxy.log")) + .unwrap(), + ) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e2c310f733..adf5d6164c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,19 +4,19 @@ // Now it also provides init method which acts like a stub for proper installation // script which will use local paths. // +use anyhow::Context; +use bytes::Bytes; +use rand::Rng; use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; -use bytes::Bytes; -use rand::Rng; -use anyhow::Context; -use serde_derive::{Deserialize, Serialize}; use anyhow::Result; +use serde_derive::{Deserialize, Serialize}; -use walkeeper::xlog_utils; use pageserver::ZTimelineId; +use walkeeper::xlog_utils; // // This data structure represents deserialized zenith config, which should be @@ -67,16 +67,20 @@ pub fn init() -> Result<()> { // check if config already exists let repo_path = zenith_repo_dir(); if repo_path.exists() { - anyhow::bail!("{} already exists. Perhaps already initialized?", - repo_path.to_str().unwrap()); + anyhow::bail!( + "{} already exists. 
Perhaps already initialized?", + repo_path.to_str().unwrap() + ); } // Now we can run init only from crate directory, so check that current dir is our crate. // Use 'pageserver/Cargo.toml' existence as evidendce. let cargo_path = env::current_dir()?; if !cargo_path.join("pageserver/Cargo.toml").exists() { - anyhow::bail!("Current dirrectory does not look like a zenith repo. \ - Please, run 'init' from zenith repo root."); + anyhow::bail!( + "Current dirrectory does not look like a zenith repo. \ + Please, run 'init' from zenith repo root." + ); } // ok, now check that expected binaries are present @@ -85,17 +89,21 @@ pub fn init() -> Result<()> { let pg_distrib_dir = cargo_path.join("tmp_install"); let pg_path = pg_distrib_dir.join("bin/postgres"); if !pg_path.exists() { - anyhow::bail!("Can't find postres binary at {}. \ + anyhow::bail!( + "Can't find postres binary at {}. \ Perhaps './pgbuild.sh' is needed to build it first.", - pg_path.to_str().unwrap()); + pg_path.to_str().unwrap() + ); } // check pageserver let zenith_distrib_dir = cargo_path.join("target/debug/"); let pageserver_path = zenith_distrib_dir.join("pageserver"); if !pageserver_path.exists() { - anyhow::bail!("Can't find pageserver binary at {}. Please build it.", - pageserver_path.to_str().unwrap()); + anyhow::bail!( + "Can't find pageserver binary at {}. Please build it.", + pageserver_path.to_str().unwrap() + ); } // ok, we are good to go @@ -110,10 +118,10 @@ pub fn init() -> Result<()> { Ok(()) } -pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> -{ +pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> { let repopath = &local_env.repo_path; - fs::create_dir(&repopath).with_context(|| format!("could not create directory {}", repopath.display()))?; + fs::create_dir(&repopath) + .with_context(|| format!("could not create directory {}", repopath.display()))?; fs::create_dir(repopath.join("pgdatadirs"))?; fs::create_dir(repopath.join("timelines"))?; fs::create_dir(repopath.join("refs"))?; @@ -132,25 +140,30 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> // the repository. Use "tempdir()" or something? Or just create it directly // in the repo? 
let initdb_path = local_env.pg_bin_dir().join("initdb"); - let _initdb = - Command::new(initdb_path) + let _initdb = Command::new(initdb_path) .args(&["-D", "tmp"]) - .arg("--no-instructions") - .env_clear() - .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) + .arg("--no-instructions") + .env_clear() + .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) + .stdout(Stdio::null()) .status() .with_context(|| "failed to execute initdb")?; println!("initdb succeeded"); // Read control file to extract the LSN and system id - let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; + let controlfile = + postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; let systemid = controlfile.system_identifier; let lsn = controlfile.checkPoint; let lsnstr = format!("{:016X}", lsn); // Move the initial WAL file - fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.join("wal").join("000000010000000000000001.partial"))?; + fs::rename( + "tmp/pg_wal/000000010000000000000001", + timelinedir + .join("wal") + .join("000000010000000000000001.partial"), + )?; println!("moved initial WAL file"); // Remove pg_wal @@ -176,12 +189,14 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> let toml = toml::to_string(&local_env)?; fs::write(repopath.join("config"), toml)?; - println!("new zenith repository was created in {}", repopath.display()); + println!( + "new zenith repository was created in {}", + repopath.display() + ); Ok(()) } - // If control file says the cluster was shut down cleanly, modify it, to mark // it as crashed. That forces crash recovery when you start the cluster. // @@ -192,16 +207,17 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> // Or better yet, use a less hacky way of putting the cluster into recovery. // Perhaps create a backup label file in the data directory when it's restored. fn force_crash_recovery(datadir: &Path) -> Result<()> { - // Read in the control file let controlfilepath = datadir.to_path_buf().join("global").join("pg_control"); - let mut controlfile = postgres_ffi::decode_pg_control( - Bytes::from(fs::read(controlfilepath.as_path())?))?; + let mut controlfile = + postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?; controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION; - fs::write(controlfilepath.as_path(), - postgres_ffi::encode_pg_control(controlfile))?; + fs::write( + controlfilepath.as_path(), + postgres_ffi::encode_pg_control(controlfile), + )?; Ok(()) } @@ -209,8 +225,10 @@ fn force_crash_recovery(datadir: &Path) -> Result<()> { // check that config file is present pub fn load_config(repopath: &Path) -> Result { if !repopath.exists() { - anyhow::bail!("Zenith config is not found in {}. You need to run 'zenith init' first", - repopath.to_str().unwrap()); + anyhow::bail!( + "Zenith config is not found in {}. 
You need to run 'zenith init' first", + repopath.to_str().unwrap() + ); } // load and parse file @@ -222,7 +240,9 @@ pub fn load_config(repopath: &Path) -> Result { pub fn test_env(testname: &str) -> LocalEnv { fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check"); - let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check/").join(testname); + let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../tmp_check/") + .join(testname); // Remove remnants of old test repo let _ = fs::remove_dir_all(&repo_path); @@ -252,7 +272,7 @@ pub fn cargo_bin_dir() -> PathBuf { #[derive(Debug, Clone, Copy)] pub struct PointInTime { pub timelineid: ZTimelineId, - pub lsn: u64 + pub lsn: u64, } fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Result { @@ -270,10 +290,12 @@ fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Resul fs::create_dir(&timelinedir.join("wal"))?; if let Some(ancestor) = ancestor { - let data = format!("{}@{:X}/{:X}", - ancestor.timelineid, - ancestor.lsn >> 32, - ancestor.lsn & 0xffffffff); + let data = format!( + "{}@{:X}/{:X}", + ancestor.timelineid, + ancestor.lsn >> 32, + ancestor.lsn & 0xffffffff + ); fs::write(timelinedir.join("ancestor"), data)?; } @@ -289,7 +311,11 @@ fn parse_lsn(s: &str) -> std::result::Result { } // Create a new branch in the repository (for the "zenith branch" subcommand) -pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointInTime) -> Result<()> { +pub fn create_branch( + local_env: &LocalEnv, + branchname: &str, + startpoint: PointInTime, +) -> Result<()> { let repopath = &local_env.repo_path; // create a new timeline for it @@ -297,7 +323,10 @@ pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointIn let newtimelinedir = repopath.join("timelines").join(newtli.to_string()); let data = newtli.to_string(); - fs::write(repopath.join("refs").join("branches").join(branchname), data)?; + fs::write( + repopath.join("refs").join("branches").join(branchname), + data, + )?; // Copy the latest snapshot (TODO: before the startpoint) and all WAL // TODO: be smarter and avoid the copying... 
@@ -305,12 +334,16 @@ pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointIn let copy_opts = fs_extra::dir::CopyOptions::new(); fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?; - let oldtimelinedir = repopath.join("timelines").join(startpoint.timelineid.to_string()); + let oldtimelinedir = repopath + .join("timelines") + .join(startpoint.timelineid.to_string()); let mut copy_opts = fs_extra::dir::CopyOptions::new(); copy_opts.content_only = true; - fs_extra::dir::copy(oldtimelinedir.join("wal"), - newtimelinedir.join("wal"), - &copy_opts)?; + fs_extra::dir::copy( + oldtimelinedir.join("wal"), + newtimelinedir.join("wal"), + &copy_opts, + )?; Ok(()) } @@ -318,7 +351,10 @@ pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointIn // Find the end of valid WAL in a wal directory pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> { let repopath = &local_env.repo_path; - let waldir = repopath.join("timelines").join(timeline.to_string()).join("wal"); + let waldir = repopath + .join("timelines") + .join(timeline.to_string()) + .join("wal"); let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true); @@ -329,7 +365,10 @@ pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result Result<(u64, PathBuf)> { let repopath = &local_env.repo_path; - let snapshotsdir = repopath.join("timelines").join(timeline.to_string()).join("snapshots"); + let snapshotsdir = repopath + .join("timelines") + .join(timeline.to_string()) + .join("snapshots"); let paths = fs::read_dir(&snapshotsdir)?; let mut maxsnapshot: u64 = 0; let mut snapshotdir: Option<PathBuf> = None; diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 5e5e2bff51..ab6a6d021d 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,3 +1,4 @@ +use anyhow::Result; use std::fs; use std::io; use std::net::SocketAddr; @@ -9,12 +10,11 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; -use anyhow::Result; use postgres::{Client, NoTls}; -use crate::local_env::LocalEnv; use crate::compute::PostgresNode; +use crate::local_env::LocalEnv; use pageserver::ZTimelineId; // @@ -31,10 +31,8 @@ pub struct TestStorageControlPlane { } impl TestStorageControlPlane { - // Peek into the repository, to grab the timeline ID of given branch pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId { - let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname); ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() @@ -171,10 +169,14 @@ impl PageServerNode { } pub fn start(&self) -> Result<()> { - println!("Starting pageserver at '{}' in {}", self.address(), self.repo_path().display()); + println!( + "Starting pageserver at '{}' in {}", + self.address(), + self.repo_path().display() + ); let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver")); - cmd .args(&["-l", self.address().to_string().as_str()]) + cmd.args(&["-l", self.address().to_string().as_str()]) .arg("-d") .env_clear() .env("RUST_BACKTRACE", "1") @@ -183,8 +185,10 @@ impl PageServerNode { .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); if !cmd.status()?.success() { - anyhow::bail!("Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display()); + anyhow::bail!( + "Pageserver failed to start.
See '{}' for details.", + self.repo_path().join("pageserver.log").display() + ); } // It takes a while for the page server to start up. Wait until it is @@ -247,7 +251,9 @@ impl PageServerNode { client.simple_query(sql).unwrap() } - pub fn page_server_psql_client(&self) -> std::result::Result { + pub fn page_server_psql_client( + &self, + ) -> std::result::Result { let connstring = format!( "host={} port={} dbname={} user={}", self.address().ip(), @@ -297,10 +303,10 @@ impl WalAcceptorNode { .args(&["-D", self.data_dir.to_str().unwrap()]) .args(&["-l", self.listen.to_string().as_str()]) .args(&["--systemid", &self.env.systemid.to_string()]) - // Tell page server it can receive WAL from this WAL safekeeper - // FIXME: If there are multiple safekeepers, they will all inform - // the page server. Only the last "notification" will stay in effect. - // So it's pretty random which safekeeper the page server will connect to + // Tell page server it can receive WAL from this WAL safekeeper + // FIXME: If there are multiple safekeepers, they will all inform + // the page server. Only the last "notification" will stay in effect. + // So it's pretty random which safekeeper the page server will connect to .args(&["--pageserver", "127.0.0.1:64000"]) .arg("-d") .arg("-n") diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 14c328be0e..d595a6a50f 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -1,8 +1,8 @@ // mod control_plane; use control_plane::compute::ComputeControlPlane; -use control_plane::storage::TestStorageControlPlane; use control_plane::local_env; use control_plane::local_env::PointInTime; +use control_plane::storage::TestStorageControlPlane; // XXX: force all redo at the end // -- restart + seqscan won't read deleted stuff @@ -77,12 +77,18 @@ fn test_pageserver_two_timelines() { let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let maintli = storage_cplane.get_branch_timeline("main"); - + // Create new branch at the end of 'main' let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); - local_env::create_branch(&local_env, "experimental", - PointInTime { timelineid: maintli, - lsn: startpoint }).unwrap(); + local_env::create_branch( + &local_env, + "experimental", + PointInTime { + timelineid: maintli, + lsn: startpoint, + }, + ) + .unwrap(); let experimentaltli = storage_cplane.get_branch_timeline("experimental"); // Launch postgres instances on both branches diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index 04ca933d74..939648b2ea 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -1,8 +1,8 @@ // Restart acceptors one by one while compute is under the load. 
use control_plane::compute::ComputeControlPlane; -use control_plane::storage::TestStorageControlPlane; use control_plane::local_env; use control_plane::local_env::PointInTime; +use control_plane::storage::TestStorageControlPlane; use pageserver::ZTimelineId; use rand::Rng; @@ -63,11 +63,18 @@ fn test_many_timelines() { let maintli = storage_cplane.get_branch_timeline("main"); // main branch timelines.push(maintli); let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); - for i in 1..N_TIMELINES { // additional branches + for i in 1..N_TIMELINES { + // additional branches let branchname = format!("experimental{}", i); - local_env::create_branch(&local_env, &branchname, - PointInTime { timelineid: maintli, - lsn: startpoint }).unwrap(); + local_env::create_branch( + &local_env, + &branchname, + PointInTime { + timelineid: maintli, + lsn: startpoint, + }, + ) + .unwrap(); let tli = storage_cplane.get_branch_timeline(&branchname); timelines.push(tli); } @@ -75,10 +82,10 @@ fn test_many_timelines() { // start postgres on each timeline let mut nodes = Vec::new(); for tli in timelines { - let node = compute_cplane.new_test_node(tli); - nodes.push(node.clone()); - node.start().unwrap(); - node.start_proxy(&wal_acceptors); + let node = compute_cplane.new_test_node(tli); + nodes.push(node.clone()); + node.start().unwrap(); + node.start_proxy(&wal_acceptors); } // create schema @@ -258,7 +265,9 @@ fn test_race_conditions() { // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; - let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY)); + let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant( + &local_env, REDUNDANCY, + )); let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 76ca3c3377..d8ed5183a8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -1,14 +1,17 @@ use log::*; -use tar::{Builder}; +use regex::Regex; use std::fmt; use std::io::Write; +use tar::Builder; use walkdir::WalkDir; -use regex::Regex; use crate::ZTimelineId; - -pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, snapshotlsn: u64) -> Result<(), std::io::Error> { +pub fn send_snapshot_tarball( + write: &mut dyn Write, + timelineid: ZTimelineId, + snapshotlsn: u64, +) -> Result<(), std::io::Error> { let mut ar = Builder::new(write); let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn); @@ -27,12 +30,15 @@ pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, sna } if entry.file_type().is_dir() { - trace!("sending dir {} as {}", fullpath.display(), relpath.display()); + trace!( + "sending dir {} as {}", + fullpath.display(), + relpath.display() + ); ar.append_dir(relpath, fullpath)?; } else if entry.file_type().is_symlink() { error!("ignoring symlink in snapshot dir"); } else if entry.file_type().is_file() { - // Shared catalogs are exempt if relpath.starts_with("global/") { trace!("sending shared catalog {}", relpath.display()); @@ -61,7 +67,9 @@ pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, sna } let archive_fname = relpath.to_str().unwrap().clone(); - let archive_fname = archive_fname.strip_suffix(".partial").unwrap_or(&archive_fname); + let archive_fname = archive_fname + .strip_suffix(".partial") + 
.unwrap_or(&archive_fname); let archive_path = "pg_wal/".to_owned() + archive_fname; ar.append_path_with_name(fullpath, archive_path)?; } @@ -71,14 +79,12 @@ pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, sna Ok(()) } - // formats: // // _ // . // _. - #[derive(Debug)] struct FilePathError { msg: String, @@ -145,7 +151,6 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { return Ok((relnode, forknum, segno)); } - fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> { /* * Relation data files can be in one of the following directories: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 10336d84f5..f8dfc32c5e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,11 +4,11 @@ use log::*; use std::fs; +use std::fs::{File, OpenOptions}; use std::io; +use std::path::PathBuf; use std::process::exit; use std::thread; -use std::fs::{File, OpenOptions}; -use std::path::PathBuf; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -32,27 +32,33 @@ fn zenith_repo_dir() -> String { fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") - .arg(Arg::with_name("listen") - .short("l") - .long("listen") - .takes_value(true) - .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)")) - .arg(Arg::with_name("interactive") - .short("i") - .long("interactive") - .takes_value(false) - .help("Interactive mode")) - .arg(Arg::with_name("daemonize") - .short("d") - .long("daemonize") - .takes_value(false) - .help("Run in the background")) + .arg( + Arg::with_name("listen") + .short("l") + .long("listen") + .takes_value(true) + .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"), + ) + .arg( + Arg::with_name("interactive") + .short("i") + .long("interactive") + .takes_value(false) + .help("Interactive mode"), + ) + .arg( + Arg::with_name("daemonize") + .short("d") + .long("daemonize") + .takes_value(false) + .help("Run in the background"), + ) .get_matches(); let mut conf = PageServerConf { daemonize: false, interactive: false, - listen_addr: "127.0.0.1:5430".parse().unwrap() + listen_addr: "127.0.0.1:5430".parse().unwrap(), }; if arg_matches.is_present("daemonize") { @@ -128,9 +134,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { Ok(_) => info!("Success, daemonized"), Err(e) => error!("Error, {}", e), } - } - else - { + } else { // change into the repository directory. In daemon mode, Daemonize // does this for us. 
let repodir = zenith_repo_dir(); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c738f90f41..4c344749f5 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,8 @@ +use std::fmt; use std::net::SocketAddr; use std::str::FromStr; -use std::fmt; +pub mod basebackup; pub mod page_cache; pub mod page_service; pub mod pg_constants; @@ -12,7 +13,6 @@ mod tui_logger; pub mod waldecoder; pub mod walreceiver; pub mod walredo; -pub mod basebackup; #[derive(Debug, Clone)] pub struct PageServerConf { @@ -35,7 +35,6 @@ impl FromStr for ZTimelineId { buf.copy_from_slice(timelineid.as_slice()); Ok(ZTimelineId(buf)) } - } impl ZTimelineId { @@ -55,8 +54,7 @@ impl ZTimelineId { } impl fmt::Display for ZTimelineId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&hex::encode(self.0)) } } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index edeba3b21f..c25e5cadb5 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -6,9 +6,9 @@ // per-entry mutex. // -use crate::{walredo, PageServerConf}; use crate::restore_local_repo::restore_timeline; use crate::ZTimelineId; +use crate::{walredo, PageServerConf}; use anyhow::bail; use bytes::Bytes; use core::ops::Bound::Included; @@ -109,7 +109,8 @@ struct PageCacheShared { } lazy_static! { - pub static ref PAGECACHES: Mutex>> = Mutex::new(HashMap::new()); + pub static ref PAGECACHES: Mutex>> = + Mutex::new(HashMap::new()); } // Get Page Cache for given timeline. It is assumed to already exist. @@ -118,11 +119,14 @@ pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option< match pcaches.get(&timelineid) { Some(pcache) => Some(pcache.clone()), - None => None + None => None, } } -pub fn get_or_restore_pagecache(conf: &PageServerConf, timelineid: ZTimelineId) -> anyhow::Result> { +pub fn get_or_restore_pagecache( + conf: &PageServerConf, + timelineid: ZTimelineId, +) -> anyhow::Result> { let mut pcaches = PAGECACHES.lock().unwrap(); match pcaches.get(&timelineid) { @@ -475,8 +479,11 @@ impl PageCache { self.num_entries.fetch_add(1, Ordering::Relaxed); if !oldentry.is_none() { - error!("overwriting WAL record with LSN {:X}/{:X} in page cache", - lsn >> 32, lsn & 0xffffffff); + error!( + "overwriting WAL record with LSN {:X}/{:X} in page cache", + lsn >> 32, + lsn & 0xffffffff + ); } self.num_wal_records.fetch_add(1, Ordering::Relaxed); @@ -511,14 +518,18 @@ impl PageCache { // Can't move backwards. 
let oldlsn = shared.last_valid_lsn; if lsn >= oldlsn { - shared.last_valid_lsn = lsn; self.valid_lsn_condvar.notify_all(); self.last_valid_lsn.store(lsn, Ordering::Relaxed); } else { - warn!("attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})", - oldlsn >> 32, oldlsn & 0xffffffff, lsn >> 32, lsn & 0xffffffff); + warn!( + "attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})", + oldlsn >> 32, + oldlsn & 0xffffffff, + lsn >> 32, + lsn & 0xffffffff + ); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6dac213be6..c23537233d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,26 +13,25 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use log::*; +use regex::Regex; use std::io; -use std::thread; use std::str::FromStr; use std::sync::Arc; -use regex::Regex; +use std::thread; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; use tokio::net::{TcpListener, TcpStream}; use tokio::runtime; use tokio::runtime::Runtime; -use tokio::task; use tokio::sync::mpsc; +use tokio::task; +use crate::basebackup; use crate::page_cache; use crate::restore_local_repo; -use crate::basebackup; use crate::walreceiver; use crate::PageServerConf; use crate::ZTimelineId; - type Result = std::result::Result; #[derive(Debug)] @@ -172,8 +171,7 @@ struct FeParseMessage { query_string: Bytes, } -fn read_null_terminated(buf: &mut Bytes) -> Result -{ +fn read_null_terminated(buf: &mut Bytes) -> Result { let mut result = BytesMut::new(); loop { @@ -221,15 +219,14 @@ impl FeParseMessage { )); } - - Ok(FeMessage::Parse(FeParseMessage {query_string})) + Ok(FeMessage::Parse(FeParseMessage { query_string })) } } #[derive(Debug)] struct FeDescribeMessage { - kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal. - // we only support unnamed prepared stmt or portal + kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal. 
+ // we only support unnamed prepared stmt or portal } impl FeDescribeMessage { @@ -255,7 +252,7 @@ impl FeDescribeMessage { )); } - Ok(FeMessage::Describe(FeDescribeMessage {kind})) + Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } @@ -263,7 +260,7 @@ impl FeDescribeMessage { #[derive(Debug)] struct FeExecuteMessage { /// max # of rows - maxrows: i32 + maxrows: i32, } impl FeExecuteMessage { @@ -286,14 +283,13 @@ impl FeExecuteMessage { )); } - Ok(FeMessage::Execute(FeExecuteMessage {maxrows})) + Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } // we only support unnamed prepared stmt and portal #[derive(Debug)] -struct FeBindMessage { -} +struct FeBindMessage {} impl FeBindMessage { pub fn parse(body: Bytes) -> Result { @@ -324,8 +320,7 @@ impl FeBindMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -struct FeCloseMessage { -} +struct FeCloseMessage {} impl FeCloseMessage { pub fn parse(body: Bytes) -> Result { @@ -370,9 +365,7 @@ impl FeMessage { let mut body = body.freeze(); match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { - body: body, - }))), + b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body: body }))), b'P' => Ok(Some(FeParseMessage::parse(body)?)), b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), @@ -634,7 +627,6 @@ impl Connection { } async fn run(&mut self) -> Result<()> { - let mut unnamed_query_string = Bytes::new(); loop { let msg = self.read_message().await?; @@ -666,7 +658,8 @@ impl Connection { self.write_message(&BeMessage::ParseComplete).await?; } Some(FeMessage::Describe(_)) => { - self.write_message_noflush(&BeMessage::ParameterDescription).await?; + self.write_message_noflush(&BeMessage::ParameterDescription) + .await?; self.write_message(&BeMessage::NoData).await?; } Some(FeMessage::Bind(_)) => { @@ -724,10 +717,13 @@ impl Connection { // Check that the timeline exists self.handle_basebackup_request(timelineid).await?; - self.write_message_noflush(&BeMessage::CommandComplete).await?; + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; self.write_message(&BeMessage::ReadyForQuery).await } else if query_string.starts_with(b"callmemaybe ") { - let query_str = String::from_utf8(query_string.to_vec()).unwrap().to_string(); + let query_str = String::from_utf8(query_string.to_vec()) + .unwrap() + .to_string(); // callmemaybe let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap(); @@ -777,7 +773,6 @@ impl Connection { } async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> { - // Check that the timeline exists let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); if pcache.is_err() { @@ -954,7 +949,7 @@ impl Connection { if joinres.is_err() { return Err(io::Error::new( io::ErrorKind::InvalidData, - joinres.unwrap_err() + joinres.unwrap_err(), )); } return joinres.unwrap(); @@ -1002,7 +997,6 @@ struct CopyDataSink(mpsc::Sender); impl std::io::Write for CopyDataSink { fn write(&mut self, data: &[u8]) -> std::result::Result { - let buf = Bytes::copy_from_slice(data); if let Err(e) = self.0.blocking_send(buf) { diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index 279f13f848..262479a556 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -14,6 +14,7 @@ use log::*; use regex::Regex; use std::fmt; +use std::cmp::max; use std::error::Error; use std::fs; use std::fs::File; @@ -21,19 +22,17 @@ use 
std::io::Read; use std::io::Seek; use std::io::SeekFrom; use std::path::{Path, PathBuf}; -use std::cmp::max; use anyhow::Result; use bytes::Bytes; use crate::page_cache; -use crate::page_cache::PageCache; -use crate:: PageServerConf; use crate::page_cache::BufferTag; +use crate::page_cache::PageCache; use crate::waldecoder::WalStreamDecoder; +use crate::PageServerConf; use crate::ZTimelineId; - // From pg_tablespace_d.h // // FIXME: we'll probably need these elsewhere too, move to some common location @@ -43,8 +42,11 @@ const GLOBALTABLESPACE_OID: u32 = 1664; // // Load it all into the page cache. // -pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId) -> Result<()> { - +pub fn restore_timeline( + conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, +) -> Result<()> { let timelinepath = PathBuf::from("timelines").join(timeline.to_string()); if !timelinepath.exists() { @@ -52,7 +54,9 @@ pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTi } // Scan .zenith/timelines//snapshots - let snapshotspath = PathBuf::from("timelines").join(timeline.to_string()).join("snapshots"); + let snapshotspath = PathBuf::from("timelines") + .join(timeline.to_string()) + .join("snapshots"); let mut last_snapshot_lsn: u64 = 0; @@ -68,7 +72,10 @@ pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTi } if last_snapshot_lsn == 0 { - error!("could not find valid snapshot in {}", snapshotspath.display()); + error!( + "could not find valid snapshot in {}", + snapshotspath.display() + ); // TODO return error? } pcache.init_valid_lsn(last_snapshot_lsn); @@ -79,7 +86,6 @@ pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTi } pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result { - let snapshotspath = format!("timelines/{}/snapshots", timeline); let mut last_snapshot_lsn = 0; @@ -97,9 +103,16 @@ pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Re Ok(last_snapshot_lsn) } -fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, snapshot: &str) -> Result<()> { - - let snapshotpath = PathBuf::from("timelines").join(timeline.to_string()).join("snapshots").join(snapshot); +fn restore_snapshot( + conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, + snapshot: &str, +) -> Result<()> { + let snapshotpath = PathBuf::from("timelines") + .join(timeline.to_string()) + .join("snapshots") + .join(snapshot); // Scan 'global' for direntry in fs::read_dir(snapshotpath.join("global"))? 
{ @@ -112,7 +125,15 @@ fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimeli Some("pg_filenode.map") => continue, // Load any relation files into the page server - _ => restore_relfile(conf, pcache, timeline, snapshot, GLOBALTABLESPACE_OID, 0, &direntry.path())?, + _ => restore_relfile( + conf, + pcache, + timeline, + snapshot, + GLOBALTABLESPACE_OID, + 0, + &direntry.path(), + )?, } } @@ -133,7 +154,15 @@ fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimeli Some("pg_filenode.map") => continue, // Load any relation files into the page server - _ => restore_relfile(conf, pcache, timeline, snapshot, DEFAULTTABLESPACE_OID, dboid, &direntry.path())?, + _ => restore_relfile( + conf, + pcache, + timeline, + snapshot, + DEFAULTTABLESPACE_OID, + dboid, + &direntry.path(), + )?, } } } @@ -143,8 +172,15 @@ fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimeli Ok(()) } -fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimelineId, snapshot: &str, spcoid: u32, dboid: u32, path: &Path) -> Result<()> { - +fn restore_relfile( + _conf: &PageServerConf, + pcache: &PageCache, + _timeline: ZTimelineId, + snapshot: &str, + spcoid: u32, + dboid: u32, + path: &Path, +) -> Result<()> { let lsn = u64::from_str_radix(snapshot, 16)?; // Does it look like a relation file? @@ -187,12 +223,12 @@ fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimel // reached EOF. That's expected. // FIXME: maybe check that we read the full length of the file? break; - }, + } _ => { error!("error reading file: {:?} ({})", path, e); break; } - } + }, }; blknum += 1; } @@ -210,7 +246,12 @@ fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimel // Scan WAL on a timeline, starting from gien LSN, and load all the records // into the page cache. -fn restore_wal(_conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, startpoint: u64) -> Result<()> { +fn restore_wal( + _conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, + startpoint: u64, +) -> Result<()> { let walpath = format!("timelines/{}/wal", timeline); let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint)); @@ -259,8 +300,7 @@ fn restore_wal(_conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId break; } if let Some((lsn, recdata)) = rec.unwrap() { - let decoded = - crate::waldecoder::decode_wal_record(recdata.clone()); + let decoded = crate::waldecoder::decode_wal_record(recdata.clone()); // Put the WAL record to the page cache. We make a separate copy of // it for every block it modifies. 
(The actual WAL record is kept in @@ -299,7 +339,11 @@ fn restore_wal(_conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId segno += 1; offset = 0; } - info!("reached end of WAL at {:X}/{:X}", last_lsn >> 32, last_lsn & 0xffffffff); + info!( + "reached end of WAL at {:X}/{:X}", + last_lsn >> 32, + last_lsn & 0xffffffff + ); Ok(()) } @@ -320,7 +364,6 @@ pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { return xlogptr / wal_segsz_bytes as u64; } - #[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { return format!( @@ -358,7 +401,6 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool { } } - #[derive(Debug, Clone)] struct FilePathError { msg: String, @@ -446,4 +488,3 @@ fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { return Ok((relnode, forknum, segno)); } - diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index 8172b875da..b1daeaceae 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -50,12 +50,11 @@ pub struct WalStreamDecoder { recordbuf: BytesMut, } - #[derive(Error, Debug, Clone)] #[error("{msg} at {lsn}")] pub struct WalDecodeError { msg: String, - lsn: u64 + lsn: u64, } // @@ -100,7 +99,10 @@ impl WalStreamDecoder { let hdr = self.decode_XLogLongPageHeaderData(); if hdr.std.xlp_pageaddr != self.lsn { - return Err(WalDecodeError { msg: "invalid xlog segment header".into(), lsn: self.lsn }); + return Err(WalDecodeError { + msg: "invalid xlog segment header".into(), + lsn: self.lsn, + }); } // TODO: verify the remaining fields in the header @@ -115,7 +117,10 @@ impl WalStreamDecoder { let hdr = self.decode_XLogPageHeaderData(); if hdr.xlp_pageaddr != self.lsn { - return Err(WalDecodeError { msg: "invalid xlog page header".into(), lsn: self.lsn }); + return Err(WalDecodeError { + msg: "invalid xlog page header".into(), + lsn: self.lsn, + }); } // TODO: verify the remaining fields in the header @@ -141,7 +146,10 @@ impl WalStreamDecoder { self.startlsn = self.lsn; let xl_tot_len = self.inputbuf.get_u32_le(); if xl_tot_len < SizeOfXLogRecord { - return Err(WalDecodeError {msg: format!("invalid xl_tot_len {}", xl_tot_len), lsn: self.lsn }); + return Err(WalDecodeError { + msg: format!("invalid xl_tot_len {}", xl_tot_len), + lsn: self.lsn, + }); } self.lsn += 4; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 23af8c2ee3..99c4142232 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -19,7 +19,7 @@ use postgres_types::PgLsn; use std::collections::HashMap; use std::fs; use std::fs::{File, OpenOptions}; -use std::io::{Write, Seek, SeekFrom}; +use std::io::{Seek, SeekFrom, Write}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Mutex; @@ -38,11 +38,16 @@ struct WalReceiverEntry { } lazy_static! 
{ - static ref WAL_RECEIVERS: Mutex> = Mutex::new(HashMap::new()); + static ref WAL_RECEIVERS: Mutex> = + Mutex::new(HashMap::new()); } // Launch a new WAL receiver, or tell one that's running about change in connection string -pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) { +pub fn launch_wal_receiver( + conf: &PageServerConf, + timelineid: ZTimelineId, + wal_producer_connstr: &str, +) { let mut receivers = WAL_RECEIVERS.lock().unwrap(); match receivers.get_mut(&timelineid) { @@ -50,7 +55,9 @@ pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_p receiver.wal_producer_connstr = wal_producer_connstr.into(); } None => { - let receiver = WalReceiverEntry { wal_producer_connstr: wal_producer_connstr.into() }; + let receiver = WalReceiverEntry { + wal_producer_connstr: wal_producer_connstr.into(), + }; receivers.insert(timelineid, receiver); // Also launch a new thread to handle this connection @@ -59,7 +66,8 @@ pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_p .name("WAL receiver thread".into()) .spawn(move || { thread_main(&conf_copy, timelineid); - }).unwrap(); + }) + .unwrap(); } }; } @@ -68,14 +76,21 @@ pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_p fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String { let receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.get(&timelineid).unwrap().wal_producer_connstr.clone() + receivers + .get(&timelineid) + .unwrap() + .wal_producer_connstr + .clone() } // // This is the entry point for the WAL receiver thread. // fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { - info!("WAL receiver thread started for timeline : '{}'", timelineid); + info!( + "WAL receiver thread started for timeline : '{}'", + timelineid + ); let runtime = runtime::Builder::new_current_thread() .enable_all() @@ -100,7 +115,11 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { }); } -async fn walreceiver_main(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) -> Result<(), Error> { +async fn walreceiver_main( + conf: &PageServerConf, + timelineid: ZTimelineId, + wal_producer_connstr: &str, +) -> Result<(), Error> { // Connect to the database in replication mode. 
info!("connecting to {:?}", wal_producer_connstr); let connect_cfg = format!("{} replication=true", wal_producer_connstr); @@ -174,10 +193,12 @@ async fn walreceiver_main(conf: &PageServerConf, timelineid: ZTimelineId, wal_pr let startlsn = xlog_data.wal_start(); let endlsn = startlsn + data.len() as u64; - write_wal_file(startlsn, - timelineid, - 16 * 1024 * 1024, // FIXME - data)?; + write_wal_file( + startlsn, + timelineid, + 16 * 1024 * 1024, // FIXME + data, + )?; trace!( "received XLogData between {:X}/{:X} and {:X}/{:X}", @@ -376,7 +397,6 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); } - fn write_wal_file( startpos: XLogRecPtr, timeline: ZTimelineId, @@ -409,12 +429,13 @@ fn write_wal_file( /* Open file */ let segno = XLByteToSeg(start_pos, wal_seg_size); - let wal_file_name = XLogFileName(1, // FIXME: always use Postgres timeline 1 - segno, wal_seg_size); - let wal_file_path = wal_dir - .join(wal_file_name.clone()); - let wal_file_partial_path = wal_dir - .join(wal_file_name.clone() + ".partial"); + let wal_file_name = XLogFileName( + 1, // FIXME: always use Postgres timeline 1 + segno, + wal_seg_size, + ); + let wal_file_path = wal_dir.join(wal_file_name.clone()); + let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial"); { let mut wal_file: File; @@ -422,8 +443,7 @@ fn write_wal_file( if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { wal_file = file; partial = false; - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) - { + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { /* Try to open existed partial file */ wal_file = file; partial = true; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9b0010a1be..06ac25286b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -19,10 +19,10 @@ use std::assert; use std::cell::RefCell; use std::fs; use std::io::Error; +use std::process::Stdio; use std::sync::Arc; use std::time::Duration; use std::time::Instant; -use std::process::Stdio; use tokio::io::AsyncBufReadExt; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::process::{Child, ChildStdin, ChildStdout, Command}; @@ -34,8 +34,8 @@ use bytes::{BufMut, Bytes, BytesMut}; use crate::page_cache; use crate::page_cache::CacheEntry; use crate::page_cache::WALRecord; -use crate::{page_cache::BufferTag, PageServerConf}; use crate::ZTimelineId; +use crate::{page_cache::BufferTag, PageServerConf}; static TIMEOUT: Duration = Duration::from_secs(20); diff --git a/postgres_ffi/build.rs b/postgres_ffi/build.rs index 97b3392b3e..dc3e1509c0 100644 --- a/postgres_ffi/build.rs +++ b/postgres_ffi/build.rs @@ -17,21 +17,18 @@ fn main() { // Tell cargo to invalidate the built crate whenever any of the // included header files changed. .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - .whitelist_type("ControlFileData") .whitelist_var("PG_CONTROL_FILE_SIZE") .whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .whitelist_type("DBState") - - // Path the server include dir. It is in tmp_install/include/server, if you did - // "configure --prefix=". But if you used "configure --prefix=/", - // and used DESTDIR to move it into tmp_install, then it's in - // tmp_install/include/postgres/server (that's how the pgbuild.sh script does it). - // 'pg_config --includedir-server' would perhaps be the more proper way to find it, - // but this will do for now. 
+ // Path the server include dir. It is in tmp_install/include/server, if you did + // "configure --prefix=". But if you used "configure --prefix=/", + // and used DESTDIR to move it into tmp_install, then it's in + // tmp_install/include/postgres/server (that's how the pgbuild.sh script does it). + // 'pg_config --includedir-server' would perhaps be the more proper way to find it, + // but this will do for now. .clang_arg("-I../tmp_install/include/server") .clang_arg("-I../tmp_install/include/postgresql/server") - // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. diff --git a/postgres_ffi/src/lib.rs b/postgres_ffi/src/lib.rs index b62114ea7d..b6cf6bdb2b 100644 --- a/postgres_ffi/src/lib.rs +++ b/postgres_ffi/src/lib.rs @@ -10,22 +10,19 @@ const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize; impl ControlFileData { - // Initialize an all-zeros ControlFileData struct pub fn new() -> ControlFileData { let controlfile: ControlFileData; let b = [0u8; SIZEOF_CONTROLDATA]; - controlfile = unsafe { - std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) - }; + controlfile = + unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) }; return controlfile; } } pub fn decode_pg_control(buf: Bytes) -> Result { - let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA]; buf.clone().copy_to_slice(&mut b); @@ -36,25 +33,23 @@ pub fn decode_pg_control(buf: Bytes) -> Result { data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]); let expectedcrc = crc32c::crc32c(&data_without_crc); - controlfile = unsafe { - std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) - }; + controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) }; if expectedcrc != controlfile.crc { - anyhow::bail!("invalid CRC in control file: expected {:08X}, was {:08X}", - expectedcrc, controlfile.crc); + anyhow::bail!( + "invalid CRC in control file: expected {:08X}, was {:08X}", + expectedcrc, + controlfile.crc + ); } Ok(controlfile) } pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes { - let b: [u8; SIZEOF_CONTROLDATA]; - b = unsafe { - std::mem::transmute::(controlfile) - }; + b = unsafe { std::mem::transmute::(controlfile) }; // Recompute the CRC let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC]; diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index fcc475826d..8dfa31e23b 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -9,8 +9,8 @@ use std::path::PathBuf; use std::thread; use std::{fs::File, fs::OpenOptions}; -use clap::{App, Arg}; use anyhow::Result; +use clap::{App, Arg}; use slog::Drain; diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 54e28cda5b..74e0f1d3b7 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -354,7 +354,8 @@ impl Serializer for SafeKeeperResponse { } lazy_static! 
{ - pub static ref TIMELINES: Mutex>> = Mutex::new(HashMap::new()); + pub static ref TIMELINES: Mutex>> = + Mutex::new(HashMap::new()); } pub fn thread_main(conf: WalAcceptorConf) { @@ -450,11 +451,13 @@ impl Timeline { // Load and lock control file (prevent running more than one instance of safekeeper) fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); if shared_state.control_file.is_some() { - info!("control file for timeline {} is already open", self.timelineid); + info!( + "control file for timeline {} is already open", + self.timelineid + ); return Ok(()); } @@ -476,7 +479,8 @@ impl Timeline { Err(e) => { io_error!( "Control file {:?} is locked by some other process: {}", - &control_file_path, e + &control_file_path, + e ); } } @@ -501,7 +505,8 @@ impl Timeline { if my_info.format_version != SK_FORMAT_VERSION { io_error!( "Incompatible format version: {} vs. {}", - my_info.format_version, SK_FORMAT_VERSION + my_info.format_version, + SK_FORMAT_VERSION ); } shared_state.info = my_info; @@ -583,7 +588,10 @@ impl Connection { self.conf.listen_addr.port(), self.timeline().timelineid ); - info!("requesting page server to connect to us: start {} {}", ps_connstr, callme); + info!( + "requesting page server to connect to us: start {} {}", + ps_connstr, callme + ); let (client, connection) = connect(&ps_connstr, NoTls).await?; // The connection object performs the actual communication with the database, @@ -716,8 +724,14 @@ impl Connection { let rec_size = (end_pos - start_pos) as usize; assert!(rec_size <= MAX_SEND_SIZE); - debug!("received for {} bytes between {:X}/{:X} and {:X}/{:X}", - rec_size, start_pos >> 32, start_pos & 0xffffffff, end_pos >> 32, end_pos & 0xffffffff); + debug!( + "received for {} bytes between {:X}/{:X} and {:X}/{:X}", + rec_size, + start_pos >> 32, + start_pos & 0xffffffff, + end_pos >> 32, + end_pos & 0xffffffff + ); /* Receive message body */ self.inbuf.resize(rec_size, 0u8); @@ -1054,8 +1068,11 @@ impl Connection { self.stream.write_all(&self.outbuf[0..msg_size]).await?; start_pos += send_size as u64; - debug!("Sent WAL to page server up to {:X}/{:>08X}", - (end_pos>>32) as u32, end_pos as u32); + debug!( + "Sent WAL to page server up to {:X}/{:>08X}", + (end_pos >> 32) as u32, + end_pos as u32 + ); if XLogSegmentOffset(start_pos, wal_seg_size) != 0 { wal_file = Some(file); diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 1d0b5b73d4..53d1528a6b 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -3,13 +3,13 @@ use std::path::{Path, PathBuf}; use std::process::exit; use std::str::FromStr; -use clap::{App, Arg, ArgMatches, SubCommand}; use anyhow::Result; use anyhow::{anyhow, bail}; +use clap::{App, Arg, ArgMatches, SubCommand}; -use control_plane::{compute::ComputeControlPlane, local_env, storage}; use control_plane::local_env::LocalEnv; use control_plane::storage::PageServerNode; +use control_plane::{compute::ComputeControlPlane, local_env, storage}; use pageserver::ZTimelineId; @@ -34,34 +34,32 @@ fn main() -> Result<()> { .required(true); let matches = App::new("zenith") .about("Zenith CLI") - .subcommand(SubCommand::with_name("init") - .about("Initialize a new Zenith repository in current directory")) - .subcommand(SubCommand::with_name("branch") - .about("Create a new branch") - .arg(Arg::with_name("branchname") - .required(false) - .index(1)) - .arg(Arg::with_name("start-point") - .required(false) - .index(2))) + .subcommand( + SubCommand::with_name("init") + 
.about("Initialize a new Zenith repository in current directory"), + ) + .subcommand( + SubCommand::with_name("branch") + .about("Create a new branch") + .arg(Arg::with_name("branchname").required(false).index(1)) + .arg(Arg::with_name("start-point").required(false).index(2)), + ) .subcommand( SubCommand::with_name("pageserver") .about("Manage pageserver instance") .subcommand(SubCommand::with_name("status")) .subcommand(SubCommand::with_name("start")) - .subcommand(SubCommand::with_name("stop")) + .subcommand(SubCommand::with_name("stop")), ) .subcommand( SubCommand::with_name("pg") .about("Manage postgres instances") .subcommand( SubCommand::with_name("create") - // .arg(name_arg.clone() - // .required(false) - // .help("name of this postgres instance (will be pgN if omitted)")) - .arg(Arg::with_name("timeline") - .required(false) - .index(1)) + // .arg(name_arg.clone() + // .required(false) + // .help("name of this postgres instance (will be pgN if omitted)")) + .arg(Arg::with_name("timeline").required(false).index(1)), ) .subcommand(SubCommand::with_name("list")) .subcommand(SubCommand::with_name("start").arg(name_arg.clone())) @@ -80,9 +78,11 @@ fn main() -> Result<()> { let repopath = PathBuf::from(zenith_repo_dir()); if !repopath.exists() { - bail!("Zenith repository does not exists in {}.\n\ + bail!( + "Zenith repository does not exists in {}.\n\ Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'", - repopath.display()); + repopath.display() + ); } // TODO: check that it looks like a zenith repository let env = match local_env::load_config(&repopath) { @@ -204,7 +204,6 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Ok(()) } - // "zenith init" - Initialize a new Zenith repository in current dir fn run_init_cmd(_args: ArgMatches) -> Result<()> { local_env::init()?; @@ -221,20 +220,22 @@ fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { } if let Some(startpoint_str) = args.value_of("start-point") { - let mut startpoint = parse_point_in_time(startpoint_str)?; if startpoint.lsn == 0 { // Find end of WAL on the old timeline let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?; - println!("branching at end of WAL: {:X}/{:X}", end_of_wal >> 32, end_of_wal & 0xffffffff); + println!( + "branching at end of WAL: {:X}/{:X}", + end_of_wal >> 32, + end_of_wal & 0xffffffff + ); startpoint.lsn = end_of_wal; } return local_env::create_branch(local_env, branchname, startpoint); - } else { panic!("Missing start-point"); } @@ -276,18 +277,22 @@ fn list_branches() -> Result<()> { // // fn parse_point_in_time(s: &str) -> Result { - let mut strings = s.split("@"); let name = strings.next().unwrap(); let lsn: Option; if let Some(lsnstr) = strings.next() { let mut s = lsnstr.split("/"); - let lsn_hi: u64 = s.next().ok_or(anyhow!("invalid LSN in point-in-time specification"))?.parse()?; - let lsn_lo: u64 = s.next().ok_or(anyhow!("invalid LSN in point-in-time specification"))?.parse()?; + let lsn_hi: u64 = s + .next() + .ok_or(anyhow!("invalid LSN in point-in-time specification"))? + .parse()?; + let lsn_lo: u64 = s + .next() + .ok_or(anyhow!("invalid LSN in point-in-time specification"))? 
+ .parse()?; lsn = Some(lsn_hi << 32 | lsn_lo); - } - else { + } else { lsn = None } @@ -321,7 +326,7 @@ fn parse_point_in_time(s: &str) -> Result { if tlipath.exists() { let result = local_env::PointInTime { timelineid: ZTimelineId::from_str(name)?, - lsn: lsn.unwrap_or(0) + lsn: lsn.unwrap_or(0), }; return Ok(result); diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs index 3b833f4c2a..2d86ad041f 100644 --- a/zenith_utils/src/lib.rs +++ b/zenith_utils/src/lib.rs @@ -1,3 +1,2 @@ //! zenith_utils is intended to be a place to put code that is shared //! between other crates in this repository. -