diff --git a/Cargo.lock b/Cargo.lock index 85c299c7a8..ad2a518f22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0" [[package]] name = "async-trait" -version = "0.1.48" +version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea56748e10732c49404c153638a15ec3d6211ec5ff35d9bb20e13b93576adf" +checksum = "589652ce7ccb335d1e7ecb3be145425702b290dbcb7029bbeaae263fc1d87b48" dependencies = [ "proc-macro2", "quote", @@ -241,6 +241,30 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "bindgen" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5" +dependencies = [ + "bitflags", + "cexpr", + "cfg-if 0.1.10", + "clang-sys", + "clap", + "env_logger", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "which", +] + [[package]] name = "bitflags" version = "1.2.1" @@ -323,6 +347,15 @@ version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -348,6 +381,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "clang-sys" +version = "0.29.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.33.3" @@ -382,15 +426,22 @@ checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" name = "control_plane" version = "0.1.0" dependencies = [ - "home", + "anyhow", + "bytes", + "fs_extra", + "hex", "lazy_static", + "pageserver", "postgres", + "postgres_ffi", "rand 0.8.3", "regex", "serde", "serde_derive", + "tar", "tokio-postgres", "toml", + "walkeeper", ] [[package]] @@ -426,9 +477,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -543,6 +594,19 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "env_logger" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + [[package]] name = "event-listener" version = "2.5.1" @@ -564,6 +628,18 @@ dependencies = [ "instant", ] +[[package]] +name = "filetime" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall 0.2.6", + "winapi", +] + [[package]] name = "fnv" 
version = "1.0.7" @@ -606,10 +682,16 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.13" +name = "fs_extra" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" dependencies = [ "futures-channel", "futures-core", @@ -622,9 +704,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" +checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" dependencies = [ "futures-core", "futures-sink", @@ -632,15 +714,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" +checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" [[package]] name = "futures-executor" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" +checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" dependencies = [ "futures-core", "futures-task", @@ -649,9 +731,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" +checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" [[package]] name = "futures-lite" @@ -670,9 +752,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" +checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -682,21 +764,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" +checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" [[package]] name = "futures-task" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" +checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" [[package]] name = "futures-util" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" +checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" dependencies = [ "futures-channel", "futures-core", @@ -744,6 +826,12 @@ dependencies = [ "wasi 0.10.0+wasi-snapshot-preview1", ] +[[package]] +name = "glob" +version = "0.3.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "gloo-timers" version = "0.2.1" @@ -810,20 +898,11 @@ dependencies = [ "digest", ] -[[package]] -name = "home" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2456aef2e6b6a9784192ae780c0f15bc57df0e918585282325e8c8ac27737654" -dependencies = [ - "winapi", -] - [[package]] name = "http" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" dependencies = [ "bytes", "fnv", @@ -843,9 +922,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" +checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589" [[package]] name = "httpdate" @@ -853,6 +932,15 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + [[package]] name = "hyper" version = "0.14.5" @@ -926,9 +1014,11 @@ version = "0.1.0" dependencies = [ "control_plane", "lazy_static", + "pageserver", "postgres", "rand 0.8.3", "tokio-postgres", + "walkeeper", ] [[package]] @@ -968,10 +1058,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "libc" -version = "0.2.92" +name = "lazycell" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56d855069fafbb9b344c0f962150cd2c1187975cb1c22c1522c240d8c4986714" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" + +[[package]] +name = "libloading" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" +dependencies = [ + "cc", + "winapi", +] [[package]] name = "lock_api" @@ -1088,6 +1194,16 @@ dependencies = [ "socket2", ] +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + [[package]] name = "ntapi" version = "0.3.6" @@ -1200,12 +1316,15 @@ dependencies = [ "crossbeam-channel", "daemonize", "fs2", + "fs_extra", "futures", + "hex", "lazy_static", "log", "postgres", "postgres-protocol", "postgres-types", + "postgres_ffi", "rand 0.8.3", "regex", "rust-s3", @@ -1214,6 +1333,7 @@ dependencies = [ "slog-scope", "slog-stdlog", "slog-term", + "tar", "termion", "thiserror", "tokio", @@ -1249,11 +1369,17 @@ dependencies = [ "cfg-if 1.0.0", "instant", "libc", - "redox_syscall 0.2.5", + 
"redox_syscall 0.2.6", "smallvec", "winapi", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1369,6 +1495,20 @@ dependencies = [ "postgres-protocol", ] +[[package]] +name = "postgres_ffi" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "byteorder", + "bytes", + "chrono", + "crc32c", + "hex", + "rand 0.8.3", +] + [[package]] name = "ppv-lite86" version = "0.2.10" @@ -1396,6 +1536,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.9" @@ -1494,9 +1640,9 @@ checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" dependencies = [ "bitflags", ] @@ -1507,7 +1653,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8440d8acb4fd3d277125b4bd01a6f38aee8d814b3b5fc09b3f2b825d37d3fe8f" dependencies = [ - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1528,7 +1674,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ "getrandom 0.2.2", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1559,9 +1705,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" dependencies = [ "base64", "bytes", @@ -1645,6 +1791,12 @@ dependencies = [ "url", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.2.3" @@ -1794,6 +1946,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + [[package]] name = "signal-hook-registry" version = "1.3.0" @@ -1914,9 +2072,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ce15dd3ed8aa2f8eeac4716d6ef5ab58b6b9256db41d7e1a0224c2788e8fd87" +checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb" dependencies = [ "proc-macro2", "quote", @@ -1929,6 +2087,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" +[[package]] +name = "tar" +version = "0.4.33" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0bcfbd6a598361fda270d82469fff3d65089dc33e175c9a131f7b4cd395f228" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.2.0" @@ -1938,7 +2107,7 @@ dependencies = [ "cfg-if 1.0.0", "libc", "rand 0.8.3", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "remove_dir_all", "winapi", ] @@ -1954,6 +2123,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + [[package]] name = "termion" version = "1.5.6" @@ -1962,7 +2140,7 @@ checksum = "077185e2eac69c3f8379a4298e1e07cd36beb962290d4a51199acf0fdc10607e" dependencies = [ "libc", "numtoa", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "redox_termios", ] @@ -2032,9 +2210,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134af885d758d645f0f0505c9a8b3f9bf8a348fd822e112ab5248138348f1722" +checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" dependencies = [ "autocfg", "bytes", @@ -2106,9 +2284,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f" +checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" dependencies = [ "bytes", "futures-core", @@ -2180,9 +2358,9 @@ checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" [[package]] name = "unicode-bidi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" dependencies = [ "matches", ] @@ -2292,6 +2470,7 @@ dependencies = [ "futures", "lazy_static", "log", + "pageserver", "postgres", "postgres-protocol", "rand 0.8.3", @@ -2418,6 +2597,15 @@ dependencies = [ "cc", ] +[[package]] +name = "which" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +dependencies = [ + "libc", +] + [[package]] name = "wildmatch" version = "1.1.0" @@ -2464,6 +2652,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "xattr" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +dependencies = [ + "libc", +] + [[package]] name = "xml-rs" version = "0.8.3" @@ -2474,6 +2671,10 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" name = "zenith" version = "0.1.0" dependencies = [ + "anyhow", "clap", "control_plane", + "pageserver", + "postgres_ffi", + "walkeeper", ] diff --git a/Cargo.toml b/Cargo.toml index f4d6314283..3e9c59ce3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,5 @@ members = [ "walkeeper", "zenith", "control_plane", + "postgres_ffi", ] diff --git a/cli-v2-story.md b/cli-v2-story.md new file mode 100644 index 0000000000..1f213c903b --- /dev/null +++ b/cli-v2-story.md @@ -0,0 +1,188 @@ +Create a new Zenith 
repository in the current directory: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init + The files belonging to this database system will be owned by user "heikki". + This user must also own the server process. + + The database cluster will be initialized with locale "en_GB.UTF-8". + The default database encoding has accordingly been set to "UTF8". + The default text search configuration will be set to "english". + + Data page checksums are disabled. + + creating directory tmp ... ok + creating subdirectories ... ok + selecting dynamic shared memory implementation ... posix + selecting default max_connections ... 100 + selecting default shared_buffers ... 128MB + selecting default time zone ... Europe/Helsinki + creating configuration files ... ok + running bootstrap script ... ok + performing post-bootstrap initialization ... ok + syncing data to disk ... ok + + initdb: warning: enabling "trust" authentication for local connections + You can change this by editing pg_hba.conf or using the option -A, or + --auth-local and --auth-host, the next time you run initdb. + new zenith repository was created in .zenith + +Initially, there is only one branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + main + +Start a local Postgres instance on the branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432 + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432 + 2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432" + 2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status" + 2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0 + 2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required + 2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections + done + server started + +Run some commands against it: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);" + CREATE TABLE + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + +Create a new branch called 'experimental'. We create it from the +current end of the 'main' branch, but you could specify a different +LSN as the start point instead. + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main + branching at end of WAL: 0/161F478 + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + experimental + main + +Start another Postgres instance off the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... 
+ waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0 + 2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections + done + server started + +Insert a row on the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + inserted on experimental + (2 rows) + +See that the other Postgres instance is still running on the 'main' branch on port 5432: + + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + + + +Everything is stored in the .zenith directory: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/ + total 12 + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines + +The 'datadirs' directory contains the datadirs of the running instances: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/ + total 8 + drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e + drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76 + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/ + total 124 + drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem + -rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf + -rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase + 
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION + lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact + -rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf + -rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf + -rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts + -rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid + +Note how 'pg_wal' is just a symlink to the 'timelines' directory. The +datadir is ephemeral, you can delete it at any time, and it can be reconstructed +from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull +the repository, the 'datadirs' are not included. (They are like git working trees) + + ~/git-sandbox/zenith (cli-v2)$ killall -9 postgres + ~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/* + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0 + 2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections + done + server started + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + inserted on experimental + (2 rows) + diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7281595c18..0d49488bd7 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,12 +8,20 @@ edition = "2018" [dependencies] rand = "0.8.3" +tar = "0.4.33" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } serde = "" serde_derive = "" toml = "" -home = "0.5.3" lazy_static = "" regex = "1" +anyhow = "1.0" +hex = "0.4.3" +bytes = "1.0.1" +fs_extra = "1.2.0" + +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } +postgres_ffi = { path = "../postgres_ffi" } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 5c3ec5e816..8157c62a8b 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,21 +1,24 @@ -use std::error; use std::fs::File; use std::fs::{self, OpenOptions}; +use std::os::unix::fs::PermissionsExt; use std::net::TcpStream; -use std::process::{Command, Stdio}; +use std::process::Command; use std::sync::Arc; use std::time::Duration; use 
std::{collections::BTreeMap, path::PathBuf}; -use std::{io::Write, net::SocketAddr}; +use std::io::{Read, Write}; +use std::net::SocketAddr; -use lazy_static::lazy_static; -use postgres::{Client, NoTls}; use regex::Regex; +use lazy_static::lazy_static; +use tar; +use anyhow::{Context, Result}; -use crate::local_env::{self, LocalEnv}; +use postgres::{Client, NoTls}; + +use crate::local_env::LocalEnv; use crate::storage::{PageServerNode, WalProposerNode}; - -type Result = std::result::Result>; +use pageserver::ZTimelineId; // // ComputeControlPlane @@ -34,14 +37,9 @@ impl ComputeControlPlane { // it is running on default port. Change that when pageserver will have config. let pageserver = Arc::new(PageServerNode::from_env(&env)); - let nodes: Result> = fs::read_dir(env.compute_dir()) - .map_err(|e| { - format!( - "failed to list {}: {}", - env.compute_dir().to_str().unwrap(), - e - ) - })? + let pgdatadirspath = env.repo_path.join("pgdatadirs"); + let nodes: Result> = fs::read_dir(&pgdatadirspath) + .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? .into_iter() .map(|f| { PostgresNode::from_dir_entry(f?, &env, &pageserver) @@ -67,43 +65,46 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - pub fn local(pageserver: &Arc) -> ComputeControlPlane { - let env = local_env::test_env(); + pub fn local(local_env: &LocalEnv, pageserver: &Arc) -> ComputeControlPlane { ComputeControlPlane { base_port: 65431, pageserver: Arc::clone(pageserver), nodes: BTreeMap::new(), - env, + env: local_env.clone(), } } - fn new_vanilla_node(&mut self, is_test: bool) -> Result> { - // allocate new node entry with generated port + // Connect to a page server, get base backup, and untar it to initialize a + // new data directory + pub fn new_from_page_server(&mut self, is_test: bool, timelineid: ZTimelineId) -> Result> { let node_id = self.nodes.len() as u32 + 1; + let node = Arc::new(PostgresNode { name: format!("pg{}", node_id), address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test, + timelineid }); - node.init_vanilla()?; + + node.init_from_page_server()?; self.nodes.insert(node.name.clone(), Arc::clone(&node)); Ok(node) } - pub fn new_test_node(&mut self) -> Arc { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(true).unwrap(); + pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid); + assert!(node.is_ok()); + let node = node.unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -111,9 +112,9 @@ impl ComputeControlPlane { node } - pub fn new_test_master_node(&mut self) -> Arc { - let node = self.new_vanilla_node(true).unwrap(); - println!("Create vanilla node at {:?}", node.address); + pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid).unwrap(); + node.append_conf( "postgresql.conf", "synchronous_standby_names = 'safekeeper_proxy'\n", @@ -122,17 +123,15 @@ impl ComputeControlPlane { node } - pub fn new_node(&mut self) -> Result> { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(false)?; + pub fn 
new_node(&mut self, timelineid: ZTimelineId) -> Result> { + let node = self.new_from_page_server(false, timelineid).unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -149,6 +148,7 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, + timelineid: ZTimelineId, } impl PostgresNode { @@ -158,11 +158,8 @@ impl PostgresNode { pageserver: &Arc, ) -> Result { if !entry.file_type()?.is_dir() { - let err_msg = format!( - "PostgresNode::from_dir_entry failed: '{}' is not a directory", - entry.path().to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("PostgresNode::from_dir_entry failed: '{}' is not a directory", + entry.path().display()); } lazy_static! { @@ -175,13 +172,9 @@ impl PostgresNode { // find out tcp port in config file let cfg_path = entry.path().join("postgresql.conf"); - let config = fs::read_to_string(cfg_path.clone()).map_err(|e| { - format!( - "failed to read config file in {}: {}", - cfg_path.to_str().unwrap(), - e - ) - })?; + let config = fs::read_to_string(cfg_path.clone()) + .with_context(|| format!("failed to read config file in {}", + cfg_path.to_str().unwrap()))?; let err_msg = format!( "failed to find port definition in config file {}", @@ -189,14 +182,20 @@ impl PostgresNode { ); let port: u16 = CONF_PORT_RE .captures(config.as_str()) - .ok_or(err_msg.clone() + " 1")? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))? .iter() .last() - .ok_or(err_msg.clone() + " 3")? - .ok_or(err_msg.clone() + " 3")? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))? .as_str() .parse() - .map_err(|e| format!("{}: {}", err_msg, e))?; + .with_context(|| err_msg)?; + + // FIXME: What timeline is this server on? Would have to parse the postgresql.conf + // file for that, too. 
It's currently not needed for anything, but it would be + // nice to list the timeline in "zenith pg list" + let timelineid_buf = [0u8; 16]; + let timelineid = ZTimelineId::from(timelineid_buf); // ok now Ok(PostgresNode { @@ -205,38 +204,48 @@ }) } - fn init_vanilla(&self) -> Result<()> { + // Connect to a page server, get base backup, and untar it to initialize a + // new data directory + pub fn init_from_page_server(&self) -> Result<()> { + + let pgdata = self.pgdata(); + println!( - "Creating new postgres: path={} port={}", - self.pgdata().to_str().unwrap(), + "Extracting base backup to create postgres instance: path={} port={}", + pgdata.to_str().unwrap(), self.address.port() ); // initialize data directory - if self.is_test { - fs::remove_dir_all(self.pgdata().to_str().unwrap()).ok(); + fs::remove_dir_all(&pgdata).ok(); } - fs::create_dir_all(self.pgdata().to_str().unwrap())?; + let sql = format!("basebackup {}", self.timelineid); + let mut client = self.pageserver.page_server_psql_client()?; + println!("connected to page server"); - let initdb_path = self.env.pg_bin_dir().join("initdb"); - let initdb = Command::new(initdb_path) - .args(&["-D", self.pgdata().to_str().unwrap()]) - .arg("-N") - .arg("-A trust") - .arg("--no-instructions") - .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) - .status()?; + fs::create_dir_all(&pgdata)?; + fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).unwrap(); - if !initdb.success() { - return Err("initdb failed".into()); - } + // Also create pg_wal directory, it's not included in the tarball + // FIXME: actually, it is currently. + //fs::create_dir_all(pgdata.join("pg_wal"))?; + + let mut copyreader = client.copy_out(sql.as_str())?; + + // FIXME: Currently, we slurp the whole tarball into memory, and then extract it, + // but we really should do this: + //let mut ar = tar::Archive::new(copyreader); + let mut buf = vec![]; + copyreader.read_to_end(&mut buf)?; + println!("got tarball of size {}", buf.len()); + let mut ar = tar::Archive::new(buf.as_slice()); + ar.unpack(&pgdata)?; // listen for selected port self.append_conf( @@ -256,12 +265,33 @@ .as_str(), ); - println!("Database initialized"); + // Never clean up old WAL. TODO: We should use a replication + // slot or something proper, to prevent the compute node + // from removing WAL that hasn't been streamed to the safekeeper or + // page server yet. But this will do for now. + self.append_conf("postgresql.conf", + format!("wal_keep_size='10TB'\n") + .as_str(), + ); + + // Connect it to the page server.
+ + // Configure that node to take pages from pageserver + self.append_conf("postgresql.conf", + format!("page_server_connstring = 'host={} port={}'\n\ + zenith_timeline='{}'\n", + self.pageserver.address().ip(), + self.pageserver.address().port(), + self.timelineid, + ) + .as_str(), + ); + Ok(()) } - pub fn pgdata(&self) -> PathBuf { - self.env.compute_dir().join(self.name.clone()) + fn pgdata(&self) -> PathBuf { + self.env.repo_path.join("pgdatadirs").join(&self.name) } pub fn status(&self) -> &str { @@ -306,16 +336,13 @@ impl PostgresNode { .status()?; if !pg_ctl.success() { - Err("pg_ctl failed".into()) + anyhow::bail!("pg_ctl failed"); } else { Ok(()) } } pub fn start(&self) -> Result<()> { - let _res = self - .pageserver - .page_server_psql(format!("callmemaybe {}", self.connstr()).as_str()); println!("Starting postgres node at '{}'", self.connstr()); self.pg_ctl(&["start"]) } @@ -405,12 +432,10 @@ impl PostgresNode { .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") - .stderr( - OpenOptions::new() - .append(true) - .open(self.env.data_dir.join("safepkeeper_proxy.log")) - .unwrap(), - ) + .stderr(OpenOptions::new() + .append(true) + .open(self.env.repo_path.join("safepkeeper_proxy.log")) + .unwrap()) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 241fba2f62..ebbcba7f26 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,13 +5,18 @@ // script which will use local paths. // use std::env; -use std::error; use std::fs; use std::path::{Path, PathBuf}; +use std::process::Command; +use bytes::Bytes; +use rand::Rng; +use hex; use serde_derive::{Deserialize, Serialize}; +use anyhow::Result; -type Result = std::result::Result>; +use walkeeper::xlog_utils; +use pageserver::ZTimelineId; // // This data structure represents deserialized zenith config, which should be @@ -21,11 +26,11 @@ type Result = std::result::Result>; // #[derive(Serialize, Deserialize, Clone)] pub struct LocalEnv { - // Here page server and compute nodes will create and store their data. - pub data_dir: PathBuf, + // Path to the Repository. Here page server and compute nodes will create and store their data. + pub repo_path: PathBuf, - // Path to postgres distribution. It expected that "bin", "include", - // "lib", "share" from postgres distribution will be there. If at some point + // Path to postgres distribution. It's expected that "bin", "include", + // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. 
pub pg_distrib_dir: PathBuf, @@ -42,53 +47,33 @@ impl LocalEnv { pub fn pg_lib_dir(&self) -> PathBuf { self.pg_distrib_dir.join("lib") } +} - // pageserver - pub fn pageserver_data_dir(&self) -> PathBuf { - self.data_dir.join("pageserver") - } - pub fn pageserver_log(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.log") - } - pub fn pageserver_pidfile(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.pid") - } - - // compute nodes - pub fn compute_dir(&self) -> PathBuf { - self.data_dir.join("compute") +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), } } // -// Issues in rust-lang repo has several discussions about proper library to check -// home directory in a cross-platform way. Seems that current consensus is around -// home crate and cargo uses it. +// Initialize a new Zenith repository // -fn get_home() -> Result<PathBuf> { - home::home_dir().ok_or("can not determine home directory path".into()) -} - pub fn init() -> Result<()> { - let home_dir = get_home()?; - // check if config already exists - let cfg_path = home_dir.join(".zenith"); - if cfg_path.exists() { - let err_msg = format!( - "{} already exists. Perhaps already initialized?", - cfg_path.to_str().unwrap() - ); - return Err(err_msg.into()); + let repo_path = PathBuf::from(zenith_repo_dir()); + if repo_path.exists() { + anyhow::bail!("{} already exists. Perhaps already initialized?", + repo_path.to_str().unwrap()); } // Now we can run init only from crate directory, so check that current dir is our crate. // Use 'pageserver/Cargo.toml' existence as evidendce. let cargo_path = env::current_dir()?; if !cargo_path.join("pageserver/Cargo.toml").exists() { - let err_msg = "Current dirrectory does not look like a zenith repo. \ - Please, run 'init' from zenith repo root."; - return Err(err_msg.into()); + anyhow::bail!("Current directory does not look like a zenith repo. \ + Please run 'init' from zenith repo root."); } // ok, now check that expected binaries are present // check postgres distribution let pg_distrib_dir = cargo_path.join("tmp_install"); let pg_path = pg_distrib_dir.join("bin/postgres"); if !pg_path.exists() { - let err_msg = format!( - "Can't find postres binary at {}. \ - Perhaps './pgbuild.sh' is needed to build it first.", - pg_path.to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("Can't find postgres binary at {}. \ + Perhaps './pgbuild.sh' is needed to build it first.", + pg_path.to_str().unwrap()); } // check pageserver let zenith_distrib_dir = cargo_path.join("target/debug/"); let pageserver_path = zenith_distrib_dir.join("pageserver"); if !pageserver_path.exists() { - let err_msg = format!( - "Can't find pageserver binary at {}. Please build it.", - pageserver_path.to_str().unwrap() - ); - return Err(err_msg.into()); + anyhow::bail!("Can't find pageserver binary at {}. 
Please build it.", + pageserver_path.to_str().unwrap()); } // ok, we are good to go - - // create dirs - let data_dir = cargo_path.join("tmp_check_cli"); - - for &dir in &["compute", "pageserver"] { - fs::create_dir_all(data_dir.join(dir)).map_err(|e| { - format!( - "Failed to create directory in '{}': {}", - data_dir.to_str().unwrap(), - e - ) - })?; - } - - // write config let conf = LocalEnv { - data_dir, + repo_path: repo_path.clone(), pg_distrib_dir, zenith_distrib_dir, }; + init_repo(&conf)?; + + // write config let toml = toml::to_string(&conf)?; - fs::write(cfg_path, toml)?; + fs::write(repo_path.join("config"), toml)?; + + Ok(()) +} + +pub fn init_repo(local_env: &LocalEnv) -> Result<()> +{ + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + fs::create_dir(&repopath)?; + fs::create_dir(repopath.clone() + "/pgdatadirs")?; + fs::create_dir(repopath.clone() + "/timelines")?; + fs::create_dir(repopath.clone() + "/refs")?; + fs::create_dir(repopath.clone() + "/refs/branches")?; + fs::create_dir(repopath.clone() + "/refs/tags")?; + + // Create empty config file + let configpath = repopath.clone() + "/config"; + fs::write(&configpath, r##" +# Example config file. Nothing here yet. +"##) + .expect(&format!("Unable to write file {}", &configpath)); + + // Create initial timeline + let tli = create_timeline(&local_env, None)?; + let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + + // Run initdb + // + // FIXME: we create it temporarily in "tmp" directory, and move it into + // the repository. Use "tempdir()" or something? Or just create it directly + // in the repo? + let initdb_path = local_env.pg_bin_dir().join("initdb"); + let _initdb = + Command::new(initdb_path) + .args(&["-D", "tmp", "--no-instructions"]) + .status() + .expect("failed to execute initdb"); + + // Read control file to extract the LSN + let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; + + let lsn = controlfile.checkPoint; + let lsnstr = format!("{:016X}", lsn); + + // Move the initial WAL file + fs::rename("tmp/pg_wal/000000010000000000000001", timelinedir.clone() + "/wal/000000010000000000000001.partial")?; + + // Remove pg_wal + fs::remove_dir_all("tmp/pg_wal")?; + + force_crash_recovery(&PathBuf::from("tmp"))?; + + let target = timelinedir.clone() + "/snapshots/" + &lsnstr; + fs::rename("tmp", target)?; + + // Create 'main' branch to refer to the initial timeline + let data = hex::encode(tli); + fs::write(repopath.clone() + "/refs/branches/main", data)?; + + println!("new zenith repository was created in {}", &repopath); + Ok(()) +} + + +// If control file says the cluster was shut down cleanly, modify it, to mark +// it as crashed. That forces crash recovery when you start the cluster. +// +// FIXME: +// We currently do this to the initial snapshot in "zenith init". It would +// be more natural to do this when the snapshot is restored instead, but we +// currently don't have any code to create new snapshots, so it doesn't matter +// Or better yet, use a less hacky way of putting the cluster into recovery. +// Perhaps create a backup label file in the data directory when it's restored. 
+fn force_crash_recovery(datadir: &Path) -> Result<()> { + + // Read in the control file + let mut controlfilepath = datadir.to_path_buf(); + controlfilepath.push("global"); + controlfilepath.push("pg_control"); + let mut controlfile = postgres_ffi::decode_pg_control( + Bytes::from(fs::read(controlfilepath.as_path())?))?; + + controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION; + + fs::write(controlfilepath.as_path(), + postgres_ffi::encode_pg_control(controlfile))?; Ok(()) } // check that config file is present -pub fn load_config() -> Result { - // home - let home_dir = get_home()?; - - // check file exists - let cfg_path = home_dir.join(".zenith"); - if !cfg_path.exists() { - let err_msg = format!( - "Zenith config is not found in {}. You need to run 'zenith init' first", - cfg_path.to_str().unwrap() - ); - return Err(err_msg.into()); +pub fn load_config(repopath: &Path) -> Result { + if !repopath.exists() { + anyhow::bail!("Zenith config is not found in {}. You need to run 'zenith init' first", + repopath.to_str().unwrap()); } // load and parse file - let config = fs::read_to_string(cfg_path)?; + let config = fs::read_to_string(repopath.join("config"))?; toml::from_str(config.as_str()).map_err(|e| e.into()) } // local env for tests -pub fn test_env() -> LocalEnv { - let data_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check"); - fs::create_dir_all(data_dir.clone()).unwrap(); - LocalEnv { - data_dir, +pub fn test_env(testname: &str) -> LocalEnv { + let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check/").join(testname); + + // Remove remnants of old test repo + let _ = fs::remove_dir_all(&repo_path); + + let local_env = LocalEnv { + repo_path, pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"), zenith_distrib_dir: cargo_bin_dir(), - } + }; + init_repo(&local_env).unwrap(); + return local_env; } // Find the directory where the binaries were put (i.e. 
target/debug/) @@ -185,3 +234,103 @@ pub fn cargo_bin_dir() -> PathBuf { return pathbuf; } + +#[derive(Debug, Clone, Copy)] +pub struct PointInTime { + pub timelineid: ZTimelineId, + pub lsn: u64 +} + +fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Result<[u8; 16]> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + // Create initial timeline + let mut tli = [0u8; 16]; + rand::thread_rng().fill(&mut tli); + + let timelinedir = format!("{}/timelines/{}", repopath, &hex::encode(tli)); + + fs::create_dir(timelinedir.clone())?; + fs::create_dir(timelinedir.clone() + "/snapshots")?; + fs::create_dir(timelinedir.clone() + "/wal")?; + + if let Some(ancestor) = ancestor { + let data = format!("{}@{:X}/{:X}", + hex::encode(ancestor.timelineid.to_str()), + ancestor.lsn >> 32, + ancestor.lsn & 0xffffffff); + fs::write(timelinedir + "/ancestor", data)?; + } + + Ok(tli) +} + +// Parse an LSN in the format used in filenames +// +// For example: 00000000015D3DD8 +// +fn parse_lsn(s: &str) -> std::result::Result { + u64::from_str_radix(s, 16) +} + +// Create a new branch in the repository (for the "zenith branch" subcommand) +pub fn create_branch(local_env: &LocalEnv, branchname: &str, startpoint: PointInTime) -> Result<()> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + // create a new timeline for it + let newtli = create_timeline(local_env, Some(startpoint))?; + let newtimelinedir = format!("{}/timelines/{}", repopath, &hex::encode(newtli)); + + let data = hex::encode(newtli); + fs::write(format!("{}/refs/branches/{}", repopath, branchname), data)?; + + // Copy the latest snapshot (TODO: before the startpoint) and all WAL + // TODO: be smarter and avoid the copying... + let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?; + let copy_opts = fs_extra::dir::CopyOptions::new(); + fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.clone() + "/snapshots", ©_opts)?; + + let oldtimelinedir = format!("{}/timelines/{}", &repopath, startpoint.timelineid.to_str()); + let mut copy_opts = fs_extra::dir::CopyOptions::new(); + copy_opts.content_only = true; + fs_extra::dir::copy(oldtimelinedir + "/wal/", + newtimelinedir.clone() + "/wal", + ©_opts)?; + + Ok(()) +} + +// Find the end of valid WAL in a wal directory +pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + let waldir = PathBuf::from(format!("{}/timelines/{}/wal", repopath, timeline.to_str())); + + let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true); + + return Ok(lsn); +} + +// Find the latest snapshot for a timeline +fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> { + let repopath = String::from(local_env.repo_path.to_str().unwrap()); + + let timelinedir = repopath + "/timelines/" + &timeline.to_str(); + let snapshotsdir = timelinedir.clone() + "/snapshots"; + let paths = fs::read_dir(&snapshotsdir).unwrap(); + let mut maxsnapshot: u64 = 0; + let mut snapshotdir: Option = None; + for path in paths { + let path = path.unwrap(); + let filename = path.file_name().to_str().unwrap().to_owned(); + if let Ok(lsn) = parse_lsn(&filename) { + maxsnapshot = std::cmp::max(lsn, maxsnapshot); + snapshotdir = Some(path.path()); + } + } + if maxsnapshot == 0 { + // TODO: check ancestor timeline + anyhow::bail!("no snapshot found in {}", snapshotsdir); + } + + Ok((maxsnapshot, 
snapshotdir.unwrap())) +} diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index eba2966849..dd935cb4fb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,4 +1,3 @@ -use std::error; use std::fs; use std::io; use std::net::SocketAddr; @@ -9,13 +8,13 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; +use anyhow::Result; use postgres::{Client, NoTls}; +use crate::local_env::LocalEnv; use crate::compute::PostgresNode; -use crate::local_env::{self, LocalEnv}; - -type Result = std::result::Result>; +use pageserver::ZTimelineId; // // Collection of several example deployments useful for tests. @@ -27,63 +26,72 @@ pub struct TestStorageControlPlane { pub wal_acceptors: Vec, pub pageserver: Arc, pub test_done: AtomicBool, + pub repopath: PathBuf, } impl TestStorageControlPlane { + + // Peek into the repository, to grab the timeline ID of given branch + pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId { + + let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname); + + ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() + } + // postgres <-> page_server - pub fn one_page_server(pgdata_base_path: String) -> TestStorageControlPlane { - let env = local_env::test_env(); + // + // Initialize a new repository and configure a page server to run in it + // + pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); - - if pgdata_base_path.is_empty() { - pserver.start().unwrap(); - } else { - pserver.start_fromdatadir(pgdata_base_path).unwrap(); - } + pserver.start().unwrap(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } - pub fn one_page_server_no_start() -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env, + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } // postgres <-> {wal_acceptor1, wal_acceptor2, ...} - pub fn fault_tolerant(redundancy: usize) -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); + let mut cplane = TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }), test_done: AtomicBool::new(false), + repopath: repopath, }; - cplane.pageserver.init(); cplane.pageserver.start().unwrap(); const WAL_ACCEPTOR_PORT: usize = 54321; @@ -93,8 +101,8 @@ impl TestStorageControlPlane { listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i) .parse() .unwrap(), - data_dir: env.data_dir.join(format!("wal_acceptor_{}", i)), - env: env.clone(), + data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)), + env: local_env.clone(), }; wal_acceptor.init(); wal_acceptor.start(); @@ -153,58 +161,46 @@ impl 
PageServerNode { } } - pub fn init(&self) { - fs::create_dir_all(self.env.pageserver_data_dir()).unwrap(); + pub fn repo_path(&self) -> PathBuf { + self.env.repo_path.clone() + } + + pub fn pid_file(&self) -> PathBuf { + self.env.repo_path.join("pageserver.pid") } pub fn start(&self) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); + println!("Starting pageserver at '{}' in {}", self.address(), self.repo_path().display()); - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) + let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver")); + cmd .args(&["-l", self.address().to_string().as_str()]) .arg("-d") .env_clear() + .env("ZENITH_REPO_DIR", self.repo_path()) .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status()?; + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); - if !status.success() { - return Err(Box::::from(format!( - "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + if !cmd.status()?.success() { + anyhow::bail!("Pageserver failed to start. See '{}' for details.", + self.repo_path().join("pageserver.log").display()); } - } - pub fn start_fromdatadir(&self, pgdata_base_path: String) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); - - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) - .arg("-d") - .args(&["--restore-from", "local"]) - .env_clear() - .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("PGDATA_BASE_PATH", pgdata_base_path) - .status()?; - - if !status.success() { - return Err(Box::::from(format!( - "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + // It takes a while for the page server to start up. Wait until it is + // open for business. 
+        for retries in 1..15 {
+            let client = self.page_server_psql_client();
+            if client.is_ok() {
+                break;
+            } else {
+                println!("page server not responding yet, retrying ({})...", retries);
+                thread::sleep(Duration::from_secs(1));
+            }
         }
+        Ok(())
     }

     pub fn stop(&self) -> Result<()> {
-        let pidfile = self.env.pageserver_pidfile();
+        let pidfile = self.pid_file();
         let pid = read_pidfile(&pidfile)?;
         let status = Command::new("kill")
@@ -214,10 +210,7 @@ impl PageServerNode {
             .expect("failed to execute kill");

         if !status.success() {
-            return Err(Box::<dyn error::Error>::from(format!(
-                "Failed to kill pageserver with pid {}",
-                pid
-            )));
+            anyhow::bail!("Failed to kill pageserver with pid {}", pid);
         }

         // wait for the pageserver to stop
@@ -232,10 +225,7 @@ impl PageServerNode {
         // ok, we failed to stop pageserver, let's panic
         if !status.success() {
-            return Err(Box::<dyn error::Error>::from(format!(
-                "Failed to stop pageserver with pid {}",
-                pid
-            )));
+            anyhow::bail!("Failed to stop pageserver with pid {}", pid);
         } else {
             return Ok(());
         }
@@ -254,6 +244,17 @@ impl PageServerNode {
         println!("Pageserver query: '{}'", sql);
         client.simple_query(sql).unwrap()
     }
+
+    pub fn page_server_psql_client(&self) -> std::result::Result<Client, postgres::Error> {
+        let connstring = format!(
+            "host={} port={} dbname={} user={}",
+            self.address().ip(),
+            self.address().port(),
+            "no_db",
+            "no_user",
+        );
+        Client::connect(connstring.as_str(), NoTls)
+    }
 }

 impl Drop for PageServerNode {
diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml
index b201b1849e..51f9d0c773 100644
--- a/integration_tests/Cargo.toml
+++ b/integration_tests/Cargo.toml
@@ -12,4 +12,6 @@ rand = "0.8.3"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
 tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
+pageserver = { path = "../pageserver" }
+walkeeper = { path = "../walkeeper" }
 control_plane = { path = "../control_plane" }
diff --git a/integration_tests/tests/test_compute.rs b/integration_tests/tests/test_compute.rs
index 955b7ffa5e..f4cf38432e 100644
--- a/integration_tests/tests/test_compute.rs
+++ b/integration_tests/tests/test_compute.rs
@@ -1,7 +1,11 @@
 // test node resettlement to an empty datadir
+
+// TODO
+/*
 #[test]
 fn test_resettlement() {}

 // test seq scan of everything after restart
 #[test]
 fn test_cold_seqscan() {}
+*/
diff --git a/integration_tests/tests/test_control_plane.rs b/integration_tests/tests/test_control_plane.rs
index 481cd3d8b3..8724d5fda1 100644
--- a/integration_tests/tests/test_control_plane.rs
+++ b/integration_tests/tests/test_control_plane.rs
@@ -1,5 +1,8 @@
+// TODO
+/*
 #[test]
 fn test_actions() {}

 #[test]
 fn test_regress() {}
+*/
diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs
index c0959ebdbb..14c328be0e 100644
--- a/integration_tests/tests/test_pageserver.rs
+++ b/integration_tests/tests/test_pageserver.rs
@@ -1,23 +1,24 @@
 // mod control_plane;
 use control_plane::compute::ComputeControlPlane;
 use control_plane::storage::TestStorageControlPlane;
-
-use std::thread::sleep;
-use std::time::Duration;
+use control_plane::local_env;
+use control_plane::local_env::PointInTime;

 // XXX: force all redo at the end
 // -- restart + seqscan won't read deleted stuff
 // -- pageserver api endpoint to check all rels
-
-// Handcrafted cases with wal records that are (were) problematic for redo.
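
The retry loop added to `PageServerNode::start` above polls `page_server_psql_client` until the daemonized server accepts connections. The same pattern can be factored into a generic helper; a sketch under the assumption that a fixed attempt budget fits all callers (`wait_until_ready` is a made-up name):

use std::thread;
use std::time::Duration;

// Call `f` until it succeeds or the attempt budget is exhausted,
// sleeping between tries; returns the first Ok or the last Err.
fn wait_until_ready<T, E>(
    attempts: u32,
    delay: Duration,
    mut f: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    let mut result = f();
    for _ in 1..attempts {
        if result.is_ok() {
            break;
        }
        thread::sleep(delay);
        result = f();
    }
    result
}

Note that the loop in `start()` falls through to `Ok(())` even when every probe fails; a helper like this makes it easy to propagate the last error instead.
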
+/* #[test] fn test_redo_cases() { + let local_env = local_env::test_env("test_redo_cases"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); // check basic work with table @@ -47,32 +48,46 @@ fn test_redo_cases() { println!("sum = {}", count); assert_eq!(count, 5000050000); } - +*/ // Runs pg_regress on a compute node #[test] #[ignore] fn test_regress() { + let local_env = local_env::test_env("test_regress"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); control_plane::storage::regress_check(&node); } -// Run two postgres instances on one pageserver +// Run two postgres instances on one pageserver, on different timelines #[test] -fn test_pageserver_multitenancy() { - // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); +fn test_pageserver_two_timelines() { + let local_env = local_env::test_env("test_pageserver_two_timelines"); - // Allocate postgres instance, but don't start - let node1 = compute_cplane.new_test_node(); - let node2 = compute_cplane.new_test_node(); + // Start pageserver that reads WAL directly from that postgres + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); + + let maintli = storage_cplane.get_branch_timeline("main"); + + // Create new branch at the end of 'main' + let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); + local_env::create_branch(&local_env, "experimental", + PointInTime { timelineid: maintli, + lsn: startpoint }).unwrap(); + let experimentaltli = storage_cplane.get_branch_timeline("experimental"); + + // Launch postgres instances on both branches + let node1 = compute_cplane.new_test_node(maintli); + let node2 = compute_cplane.new_test_node(experimentaltli); node1.start().unwrap(); node2.start().unwrap(); @@ -110,36 +125,3 @@ fn test_pageserver_multitenancy() { println!("sum = {}", count); assert_eq!(count, 15000150000); } - -#[test] -fn test_upload_pageserver_local() { - // Init pageserver that reads WAL directly from that postgres - // Don't start yet - - let storage_cplane = TestStorageControlPlane::one_page_server_no_start(); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); - - // init postgres node - let node = 
compute_cplane.new_test_node(); - - //upload data to pageserver & start it - &storage_cplane - .pageserver - .start_fromdatadir(node.pgdata().to_str().unwrap().to_string()) - .unwrap(); - - sleep(Duration::from_secs(10)); - - // start postgres node - node.start().unwrap(); - - // check basic work with table - node.safe_psql( - "postgres", - "CREATE TABLE t(key int primary key, value text)", - ); - node.safe_psql( - "postgres", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ); -} diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index 408f991bb2..316a098afe 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -1,6 +1,7 @@ // Restart acceptors one by one while compute is under the load. use control_plane::compute::ComputeControlPlane; use control_plane::storage::TestStorageControlPlane; +use control_plane::local_env; use rand::Rng; use std::sync::Arc; @@ -9,14 +10,16 @@ use std::{thread, time}; #[test] fn test_acceptors_normal_work() { - // Start pageserver that reads WAL directly from that postgres + let local_env = local_env::test_env("test_acceptors_normal_work"); + const REDUNDANCY: usize = 3; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy @@ -91,17 +94,20 @@ fn test_multitenancy() { // Majority is always alive #[test] fn test_acceptors_restarts() { + let local_env = local_env::test_env("test_acceptors_restarts"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; const FAULT_PROBABILITY: f32 = 0.01; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); let mut rng = rand::thread_rng(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy @@ -150,16 +156,19 @@ fn start_acceptor(cplane: &Arc, no: usize) { // them again and check that nothing was losed. Repeat. 
 // N_CRASHES env var
 #[test]
-fn test_acceptors_unavalability() {
+fn test_acceptors_unavailability() {
+    let local_env = local_env::test_env("test_acceptors_unavailability");
+
     // Start pageserver that reads WAL directly from that postgres
     const REDUNDANCY: usize = 2;

-    let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY);
-    let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver);
+    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
+    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
     let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

     // start postgres
-    let node = compute_cplane.new_test_master_node();
+    let maintli = storage_cplane.get_branch_timeline("main");
+    let node = compute_cplane.new_test_master_node(maintli);
     node.start().unwrap();

     // start proxy
@@ -226,15 +235,18 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
 // Race condition test
 #[test]
 fn test_race_conditions() {
+    let local_env = local_env::test_env("test_race_conditions");
+
     // Start pageserver that reads WAL directly from that postgres
     const REDUNDANCY: usize = 3;

-    let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(REDUNDANCY));
-    let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver);
+    let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY));
+    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
     let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

     // start postgres
-    let node = compute_cplane.new_test_master_node();
+    let maintli = storage_cplane.get_branch_timeline("main");
+    let node = compute_cplane.new_test_master_node(maintli);
     node.start().unwrap();

     // start proxy
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index f7f3be7f47..69f6ce61ab 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -14,6 +14,7 @@ regex = "1.4.5"
 bytes = "1.0.1"
 byteorder = "1.4.3"
 fs2 = "0.4.3"
+fs_extra = "1.2.0"
 futures = "0.3.13"
 lazy_static = "1.4.0"
 slog-stdlog = "4.1.0"
@@ -37,3 +38,7 @@ anyhow = "1.0"
 crc32c = "0.6.0"
 walkdir = "2"
 thiserror = "1.0"
+hex = "0.4.3"
+tar = "0.4.33"
+
+postgres_ffi = { path = "../postgres_ffi" }
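
The `basebackup` module introduced next streams a snapshot with the `tar` crate added to Cargo.toml above. A minimal sketch of the `Builder` calls it relies on (`append_path_with_name` stores a file under a different archive name, `finish` writes the trailer); the paths here are made up:

use std::io::Write;
use tar::Builder;

// Archive one file into any Write sink (a Vec<u8>, a file, or a network
// stream) under a new name, then terminate the archive.
fn tar_one_file<W: Write>(out: W, src_path: &str, name_in_archive: &str) -> std::io::Result<()> {
    let mut ar = Builder::new(out);
    ar.append_path_with_name(src_path, name_in_archive)?;
    ar.finish()
}

Writing into a generic `Write` sink is what lets `send_snapshot_tarball` below push the tarball straight into the CopyData channel instead of materializing it on disk.
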
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
new file mode 100644
index 0000000000..76ca3c3377
--- /dev/null
+++ b/pageserver/src/basebackup.rs
@@ -0,0 +1,197 @@
+use log::*;
+use tar::Builder;
+use std::fmt;
+use std::io::Write;
+use walkdir::WalkDir;
+use regex::Regex;
+
+use crate::ZTimelineId;
+
+
+pub fn send_snapshot_tarball(write: &mut dyn Write, timelineid: ZTimelineId, snapshotlsn: u64) -> Result<(), std::io::Error> {
+    let mut ar = Builder::new(write);
+
+    let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
+    let walpath = format!("timelines/{}/wal", timelineid);
+
+    debug!("sending tarball of snapshot in {}", snappath);
+    //ar.append_dir_all("", &snappath)?;
+
+    for entry in WalkDir::new(&snappath) {
+        let entry = entry?;
+        let fullpath = entry.path();
+        let relpath = entry.path().strip_prefix(&snappath).unwrap();
+
+        if relpath.to_str().unwrap() == "" {
+            continue;
+        }
+
+        if entry.file_type().is_dir() {
+            trace!("sending dir {} as {}", fullpath.display(), relpath.display());
+            ar.append_dir(relpath, fullpath)?;
+        } else if entry.file_type().is_symlink() {
+            error!("ignoring symlink in snapshot dir");
+        } else if entry.file_type().is_file() {
+
+            // Shared catalogs are exempt
+            if relpath.starts_with("global/") {
+                trace!("sending shared catalog {}", relpath.display());
+                ar.append_path_with_name(fullpath, relpath)?;
+            } else if !is_rel_file_path(relpath.to_str().unwrap()) {
+                trace!("sending {}", relpath.display());
+                ar.append_path_with_name(fullpath, relpath)?;
+            } else {
+                trace!("not sending {}", relpath.display());
+                // FIXME: send all files for now
+                ar.append_path_with_name(fullpath, relpath)?;
+            }
+        } else {
+            error!("unknown file type: {}", fullpath.display());
+        }
+    }
+
+    // FIXME: also send all the WAL
+    for entry in std::fs::read_dir(&walpath)? {
+        let entry = entry?;
+        let fullpath = &entry.path();
+        let relpath = fullpath.strip_prefix(&walpath).unwrap();
+
+        if !entry.path().is_file() {
+            continue;
+        }
+
+        let archive_fname = relpath.to_str().unwrap();
+        let archive_fname = archive_fname.strip_suffix(".partial").unwrap_or(archive_fname);
+        let archive_path = "pg_wal/".to_owned() + archive_fname;
+        ar.append_path_with_name(fullpath, archive_path)?;
+    }
+
+    ar.finish()?;
+    debug!("all tarred up!");
+    Ok(())
+}
+
+
+// formats:
+//
+// <relnode>_<forkname>
+// <relnode>.<segno>
+// <relnode>_<forkname>.<segno>
+
+
+#[derive(Debug)]
+struct FilePathError {
+    msg: String,
+}
+
+impl FilePathError {
+    fn new(msg: &str) -> FilePathError {
+        FilePathError {
+            msg: msg.to_string(),
+        }
+    }
+}
+
+impl From<core::num::ParseIntError> for FilePathError {
+    fn from(e: core::num::ParseIntError) -> Self {
+        return FilePathError {
+            msg: format!("invalid filename: {}", e),
+        };
+    }
+}
+
+impl fmt::Display for FilePathError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.msg)
+    }
+}
+
+fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
+    match forkname {
+        // "main" is not in filenames, it's implicit if the fork name is not present
+        None => Ok(0),
+        Some("fsm") => Ok(1),
+        Some("vm") => Ok(2),
+        Some("init") => Ok(3),
+        Some(_) => Err(FilePathError::new("invalid forkname")),
+    }
+}
+
+fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
+    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
+
+    let caps = re
+        .captures(fname)
+        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
+
+    let relnode_str = caps.name("relnode").unwrap().as_str();
+    let relnode = u32::from_str_radix(relnode_str, 10)?;
+
+    let forkname_match = caps.name("forkname");
+    let forkname = if forkname_match.is_none() {
+        None
+    } else {
+        Some(forkname_match.unwrap().as_str())
+    };
+    let forknum = forkname_to_forknum(forkname)?;
+
+    let segno_match = caps.name("segno");
+    let segno = if segno_match.is_none() {
+        0
+    } else {
+        u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
+    };
+
+    return Ok((relnode, forknum, segno));
+}
+
+
+fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
+    /*
+     * Relation data files can be in one of the following directories:
+     *
+     * global/
+     *      shared relations
+     *
+     * base/<dboid>/
+     *      regular relations, default tablespace
+     *
+     * pg_tblspc/<tblspcoid>/<version>/
+     *      within a non-default tablespace (the name of the directory
+     *      depends on version)
+     *
+     * And the relation data files themselves have a filename like:
+     *
+     * <relnode>.<segment number>
+ */ + if let Some(fname) = path.strip_prefix("global/") { + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(dbpath) = path.strip_prefix("base/") { + let mut s = dbpath.split("/"); + let dbnode_str = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + let _dbnode = u32::from_str_radix(dbnode_str, 10)?; + let fname = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + if s.next().is_some() { + return Err(FilePathError::new("invalid relation data file name")); + }; + + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(_) = path.strip_prefix("pg_tblspc/") { + // TODO + return Err(FilePathError::new("tablespaces not supported")); + } else { + return Err(FilePathError::new("invalid relation data file name")); + } +} + +fn is_rel_file_path(path: &str) -> bool { + return parse_rel_file_path(path).is_ok(); +} diff --git a/pageserver/src/bin/cli/main.rs b/pageserver/src/bin/cli/main.rs deleted file mode 100644 index 4aa3269c09..0000000000 --- a/pageserver/src/bin/cli/main.rs +++ /dev/null @@ -1,43 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -pub mod pg; -pub mod snapshot; -pub mod storage; -mod subcommand; - -fn main() -> Result<()> { - let cli_commands = subcommand::ClapCommands { - commands: vec![ - Box::new(pg::PgCmd { - clap_cmd: clap::SubCommand::with_name("pg"), - }), - Box::new(storage::StorageCmd { - clap_cmd: clap::SubCommand::with_name("storage"), - }), - Box::new(snapshot::SnapshotCmd { - clap_cmd: clap::SubCommand::with_name("snapshot"), - }), - ], - }; - - let matches = App::new("zenith") - .about("Zenith CLI") - .version("1.0") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommands(cli_commands.generate()) - .get_matches(); - - if let Some(subcommand) = matches.subcommand_name() { - println!("'git {}' was used", subcommand); - } - - match matches.subcommand() { - ("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?, - ("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?, - ("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?, - ("", None) => println!("No subcommand"), - _ => unreachable!(), - } - Ok(()) -} diff --git a/pageserver/src/bin/cli/pg.rs b/pageserver/src/bin/cli/pg.rs deleted file mode 100644 index 7fe2f86d6c..0000000000 --- a/pageserver/src/bin/cli/pg.rs +++ /dev/null @@ -1,105 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct PgCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for PgCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith compute nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list").about("List existing compute nodes")) - .subcommand( - App::new("create") - .about( - "Create (init) new data directory using given storage and start postgres", - ) - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("storage") - .short("s") - .long("storage") - .takes_value(true) - .help("Name of the storage node to use"), - ) - //TODO should it be just name of uploaded snapshot or some path? 
- .arg( - Arg::with_name("snapshot") - .long("snapshot") - .takes_value(true) - .help("Name of the snapshot to use"), - ) - .arg( - Arg::with_name("nostart") - .long("no-start") - .takes_value(false) - .help("Don't start postgres on the created node"), - ), - ) - .subcommand( - App::new("destroy") - .about("Stop postgres and destroy node's data directory") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("start") - .about("Start postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("replica") - .long("replica") - .takes_value(false) - .help("Start the compute node as replica"), - ), - ) - .subcommand( - App::new("stop") - .about("Stop postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("show") - .about("Show info about the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run PgCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/snapshot.rs b/pageserver/src/bin/cli/snapshot.rs deleted file mode 100644 index 47e608b8e2..0000000000 --- a/pageserver/src/bin/cli/snapshot.rs +++ /dev/null @@ -1,27 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct SnapshotCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for SnapshotCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith snapshots") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true))) - .subcommand(App::new("destroy")) - .subcommand(App::new("start")) - .subcommand(App::new("stop")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run SnapshotCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/storage.rs b/pageserver/src/bin/cli/storage.rs deleted file mode 100644 index 71ca61e905..0000000000 --- a/pageserver/src/bin/cli/storage.rs +++ /dev/null @@ -1,25 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -use crate::subcommand; - -pub struct StorageCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for StorageCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith storage nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - .subcommand(App::new("attach")) - .subcommand(App::new("detach")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run StorageCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/subcommand.rs b/pageserver/src/bin/cli/subcommand.rs deleted file mode 100644 index 6a9e7363b9..0000000000 --- a/pageserver/src/bin/cli/subcommand.rs +++ /dev/null @@ -1,29 +0,0 @@ -use anyhow::Result; - -/// All subcommands need to implement this interface. 
-pub trait SubCommand { - /// Generates the cli-config that Clap requires for the subcommand. - fn gen_clap_command(&self) -> clap::App; - - /// Runs the body of the subcommand. - fn run(&self, args: clap::ArgMatches) -> Result<()>; -} - -/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must -/// implement the `SubCommand` trait, but other than that, can be of any type. -pub struct ClapCommands { - pub commands: Vec>, -} - -impl ClapCommands { - /// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in - /// order to generate the full CLI. - pub fn generate(&self) -> Vec { - let mut v: Vec = Vec::new(); - - for command in self.commands.iter() { - v.push(command.gen_clap_command()); - } - v - } -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0ef258ad6c..b98cca4ca1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,10 +5,9 @@ use log::*; use std::fs; use std::io; -use std::path::PathBuf; use std::process::exit; use std::thread; -use std::{fs::File, fs::OpenOptions}; +use std::fs::{File, OpenOptions}; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -17,25 +16,21 @@ use daemonize::Daemonize; use slog::Drain; use pageserver::page_service; -use pageserver::restore_datadir; -use pageserver::restore_s3; use pageserver::tui; -use pageserver::walreceiver; +//use pageserver::walreceiver; use pageserver::PageServerConf; +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} + fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") - .arg(Arg::with_name("datadir") - .short("D") - .long("dir") - .takes_value(true) - .help("Path to the page server data directory")) - .arg(Arg::with_name("wal_producer") - .short("w") - .long("wal-producer") - .takes_value(true) - .help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')")) .arg(Arg::with_name("listen") .short("l") .long("listen") @@ -51,25 +46,14 @@ fn main() -> Result<()> { .long("daemonize") .takes_value(false) .help("Run in the background")) - .arg(Arg::with_name("restore_from") - .long("restore-from") - .takes_value(true) - .help("Upload data from s3 or datadir")) .get_matches(); let mut conf = PageServerConf { - data_dir: PathBuf::from("./"), daemonize: false, interactive: false, - wal_producer_connstr: None, - listen_addr: "127.0.0.1:5430".parse().unwrap(), - restore_from: String::new(), + listen_addr: "127.0.0.1:5430".parse().unwrap() }; - if let Some(dir) = arg_matches.value_of("datadir") { - conf.data_dir = PathBuf::from(dir); - } - if arg_matches.is_present("daemonize") { conf.daemonize = true; } @@ -83,14 +67,6 @@ fn main() -> Result<()> { exit(1); } - if let Some(restore_from) = arg_matches.value_of("restore_from") { - conf.restore_from = String::from(restore_from); - } - - if let Some(addr) = arg_matches.value_of("wal_producer") { - conf.wal_producer_connstr = Some(String::from(addr)); - } - if let Some(addr) = arg_matches.value_of("listen") { conf.listen_addr = addr.parse()?; } @@ -125,19 +101,25 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { if conf.daemonize { info!("daemonizing..."); - // There shouldn't be any logging to stdin/stdout. 
Redirect it to the main log so
+        let repodir = zenith_repo_dir();
+
+        // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
         // that we will see any accidental manual fprintf's or backtraces.
-        let log_filename = conf.data_dir.join("pageserver.log");
+        let log_filename = repodir.clone() + "/pageserver.log";
         let stdout = OpenOptions::new()
             .create(true)
             .append(true)
             .open(&log_filename)
-            .with_context(|| format!("failed to open {:?}", log_filename))?;
-        let stderr = stdout.try_clone()?;
+            .with_context(|| format!("failed to open {:?}", &log_filename))?;
+        let stderr = OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(&log_filename)
+            .with_context(|| format!("failed to open {:?}", &log_filename))?;

         let daemonize = Daemonize::new()
-            .pid_file(conf.data_dir.join("pageserver.pid"))
-            .working_directory(conf.data_dir.clone())
+            .pid_file(repodir.clone() + "/pageserver.pid")
+            .working_directory(repodir)
             .stdout(stdout)
             .stderr(stderr);
@@ -146,24 +128,21 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
             Err(e) => error!("Error, {}", e),
         }
     }
+    else
+    {
+        // change into the repository directory. In daemon mode, Daemonize
+        // does this for us.
+        let repodir = zenith_repo_dir();
+        std::env::set_current_dir(&repodir)?;
+        info!("Changed current directory to repository in {}", &repodir);
+    }

     let mut threads = Vec::new();

-    info!("starting... {}", conf.restore_from);
-
-    // Before opening up for connections, restore the latest base backup from S3.
-    // (We don't persist anything to local disk at the moment, so we need to do
-    // this at every startup)
-    if conf.restore_from.eq("s3") {
-        info!("restore-from s3...");
-        restore_s3::restore_main(&conf);
-    } else if conf.restore_from.eq("local") {
-        info!("restore-from local...");
-        restore_datadir::restore_main(&conf);
-    }
+    // TODO: Check that it looks like a valid repository before going further

     // Create directory for wal-redo datadirs
-    match fs::create_dir(conf.data_dir.join("wal-redo")) {
+    match fs::create_dir("wal-redo") {
         Ok(_) => {}
         Err(e) => match e.kind() {
             io::ErrorKind::AlreadyExists => {}
@@ -173,25 +152,6 @@
         },
     }

-    // Launch the WAL receiver thread if pageserver was started with --wal-producer
-    // option. It will try to connect to the WAL safekeeper, and stream the WAL. If
-    // the connection is lost, it will reconnect on its own. We just fire and forget
-    // it here.
-    //
-    // All other wal receivers are started on demand by "callmemaybe" command
-    // sent to pageserver.
-    if let Some(wal_producer) = &conf.wal_producer_connstr {
-        let conf_copy = conf.clone();
-        let wal_producer = wal_producer.clone();
-        let walreceiver_thread = thread::Builder::new()
-            .name("static WAL receiver thread".into())
-            .spawn(move || {
-                walreceiver::thread_main(&conf_copy, &wal_producer);
-            })
-            .unwrap();
-        threads.push(walreceiver_thread);
-    }
-
     // GetPage@LSN requests are served by another thread. (It uses async I/O,
     // but the code in page_service sets up its own thread pool for that)
     let conf_copy = conf.clone();
@@ -220,7 +180,7 @@ fn init_logging(conf: &PageServerConf) -> Result Result,
     pub listen_addr: SocketAddr,
-    pub restore_from: String,
+}
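
The `ZTimelineId` type introduced just below round-trips through a 32-character hex string, which is how timelines are named on disk and referenced on the wire. A hedged usage sketch (the literal ID is arbitrary):

fn timeline_id_roundtrip_example() {
    // 16 random bytes <-> 32 hex digits
    let id = ZTimelineId::from([0xA5u8; 16]);
    let s = id.to_str();
    assert_eq!(s.len(), 32);
    let parsed = ZTimelineId::from_str(&s).unwrap();
    assert_eq!(parsed, id);
}
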
+
+// Zenith Timeline ID is a 16-byte random ID, printed as 32 hex digits.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct ZTimelineId([u8; 16]);
+
+impl ZTimelineId {
+
+    pub fn from_str(s: &str) -> Result<ZTimelineId, hex::FromHexError> {
+        let timelineid = hex::decode(s)?;
+
+        // NB: copy_from_slice panics if the input is not exactly 16 bytes (32 hex digits)
+        let mut buf: [u8; 16] = [0u8; 16];
+        buf.copy_from_slice(timelineid.as_slice());
+        Ok(ZTimelineId(buf))
+    }
+
+    pub fn from(b: [u8; 16]) -> ZTimelineId {
+        ZTimelineId(b)
+    }
+
+    pub fn to_str(self: &ZTimelineId) -> String {
+        hex::encode(self.0)
+    }
+}
+
+impl std::fmt::Display for ZTimelineId {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", self.to_str())
+    }
 }
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index db0a33b55b..20b3460d8c 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -7,6 +7,8 @@
 //
 use crate::{walredo, PageServerConf};
+use crate::restore_local_repo::restore_timeline;
+use crate::ZTimelineId;
 use anyhow::bail;
 use bytes::Bytes;
 use core::ops::Bound::Included;
@@ -107,30 +109,49 @@ struct PageCacheShared {
 }

 lazy_static! {
-    pub static ref PAGECACHES: Mutex<HashMap<u64, Arc<PageCache>>> = Mutex::new(HashMap::new());
+    pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> = Mutex::new(HashMap::new());
 }

-pub fn get_pagecache(conf: &PageServerConf, sys_id: u64) -> Arc<PageCache> {
+// Get Page Cache for given timeline. It is assumed to already exist.
+pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
+    let pcaches = PAGECACHES.lock().unwrap();
+
+    match pcaches.get(&timelineid) {
+        Some(pcache) => Some(pcache.clone()),
+        None => None
+    }
+}
+
+pub fn get_or_restore_pagecache(conf: &PageServerConf, timelineid: ZTimelineId) -> anyhow::Result<Arc<PageCache>> {
     let mut pcaches = PAGECACHES.lock().unwrap();

-    if !pcaches.contains_key(&sys_id) {
-        pcaches.insert(sys_id, Arc::new(init_page_cache()));
+    match pcaches.get(&timelineid) {
+        Some(pcache) => Ok(pcache.clone()),
+        None => {
+            let pcache = init_page_cache();

-        // Initialize the WAL redo thread
-        //
-        // Now join_handle is not saved any where and we won'try restart tharead
-        // if it is dead. We may later stop that treads after some inactivity period
-        // and restart them on demand.
-        let conf = conf.clone();
-        let _walredo_thread = thread::Builder::new()
-            .name("WAL redo thread".into())
-            .spawn(move || {
-                walredo::wal_redo_main(&conf, sys_id);
-            })
-            .unwrap();
+            restore_timeline(conf, &pcache, timelineid)?;
+
+            let result = Arc::new(pcache);
+
+            pcaches.insert(timelineid, result.clone());
+
+            // Initialize the WAL redo thread
+            //
+            // The join handle is not saved anywhere, and we won't try to restart the
+            // thread if it is dead. We may later stop such threads after some inactivity
+            // period and restart them on demand.
+ let conf_copy = conf.clone(); + let _walredo_thread = thread::Builder::new() + .name("WAL redo thread".into()) + .spawn(move || { + walredo::wal_redo_main(&conf_copy, timelineid); + }) + .unwrap(); + + return Ok(result); + } } - - pcaches.get(&sys_id).unwrap().clone() } fn init_page_cache() -> PageCache { @@ -429,7 +450,8 @@ impl PageCache { // Adds a WAL record to the page cache // pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) { - let key = CacheKey { tag, lsn: rec.lsn }; + let lsn = rec.lsn; + let key = CacheKey { tag, lsn }; let entry = CacheEntry::new(key.clone()); entry.content.lock().unwrap().wal_record = Some(rec); @@ -447,13 +469,14 @@ impl PageCache { *rel_entry = tag.blknum + 1; } - trace!("put_wal_record lsn: {}", key.lsn); + //trace!("put_wal_record lsn: {}", lsn); let oldentry = shared.pagecache.insert(key, Arc::new(entry)); self.num_entries.fetch_add(1, Ordering::Relaxed); if !oldentry.is_none() { - error!("overwriting WAL record in page cache"); + error!("overwriting WAL record with LSN {:X}/{:X} in page cache", + lsn >> 32, lsn & 0xffffffff); } self.num_wal_records.fetch_add(1, Ordering::Relaxed); @@ -486,12 +509,17 @@ impl PageCache { let mut shared = self.shared.lock().unwrap(); // Can't move backwards. - assert!(lsn >= shared.last_valid_lsn); + let oldlsn = shared.last_valid_lsn; + if lsn >= oldlsn { - shared.last_valid_lsn = lsn; - self.valid_lsn_condvar.notify_all(); + shared.last_valid_lsn = lsn; + self.valid_lsn_condvar.notify_all(); - self.last_valid_lsn.store(lsn, Ordering::Relaxed); + self.last_valid_lsn.store(lsn, Ordering::Relaxed); + } else { + warn!("attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})", + oldlsn >> 32, oldlsn & 0xffffffff, lsn >> 32, lsn & 0xffffffff); + } } // diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9ff0b2cf46..cc972f713e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,29 +7,43 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. 
-// *callmemaybe $url* -- ask pageserver to start walreceiver on $url
+// *callmemaybe $url* -- ask pageserver to start walreceiver on $url
 //
 use byteorder::{BigEndian, ByteOrder};
-use bytes::{Buf, Bytes, BytesMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 use log::*;
 use std::io;
 use std::thread;
+use std::sync::Arc;
+use regex::Regex;
 use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
 use tokio::net::{TcpListener, TcpStream};
 use tokio::runtime;
+use tokio::runtime::Runtime;
 use tokio::task;
+use tokio::sync::mpsc;

 use crate::page_cache;
+use crate::restore_local_repo;
+use crate::basebackup;
 use crate::walreceiver;
 use crate::PageServerConf;
+use crate::ZTimelineId;
+
 type Result<T> = std::result::Result<T, io::Error>;

 #[derive(Debug)]
 enum FeMessage {
     StartupMessage(FeStartupMessage),
-    Query(FeQueryMessage),
+    Query(FeQueryMessage), // Simple query
+    Parse(FeParseMessage), // Extended query protocol
+    Describe(FeDescribeMessage),
+    Bind(FeBindMessage),
+    Execute(FeExecuteMessage),
+    Close(FeCloseMessage),
+    Sync,
     Terminate,
     //
@@ -49,6 +63,11 @@ enum BeMessage {
     AuthenticationOk,
     ReadyForQuery,
     RowDescription,
+    ParseComplete,
+    ParameterDescription,
+    NoData,
+    BindComplete,
+    CloseComplete,
     DataRow,
     CommandComplete,
     ControlFile,
@@ -145,6 +164,180 @@ struct FeQueryMessage {
     body: Bytes,
 }

+// We only support the simple case of Parse on unnamed prepared statement and
+// no params
+#[derive(Debug)]
+struct FeParseMessage {
+    query_string: Bytes,
+}
+
+fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes>
+{
+    let mut result = BytesMut::new();
+
+    loop {
+        if !buf.has_remaining() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "no null-terminator in string",
+            ));
+        }
+
+        let byte = buf.get_u8();
+
+        if byte == 0 {
+            break;
+        }
+        result.put_u8(byte);
+    }
+    return Ok(result.freeze());
+}
+
+impl FeParseMessage {
+    pub fn parse(body: Bytes) -> Result<FeMessage> {
+        let mut buf = body.clone();
+        let _pstmt_name = read_null_terminated(&mut buf)?;
+        let query_string = read_null_terminated(&mut buf)?;
+        let nparams = buf.get_i16();
+
+        // FIXME: the rust-postgres driver uses a named prepared statement
+        // for copy_out(). We're not prepared to handle that correctly. For
+        // now, just ignore the statement name, assuming that the client never
+        // uses more than one prepared statement at a time.
+        /*
+        if pstmt_name.len() != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "named prepared statements not implemented in Parse",
+            ));
+        }
+        */
+
+        if nparams != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "query params not implemented",
+            ));
+        }
+
+
+        Ok(FeMessage::Parse(FeParseMessage {query_string}))
+    }
+}
+
+#[derive(Debug)]
+struct FeDescribeMessage {
+    kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal.
+              // we only support unnamed prepared stmt or portal
+}
+
+impl FeDescribeMessage {
+    pub fn parse(body: Bytes) -> Result<FeMessage> {
+        let mut buf = body.clone();
+        let kind = buf.get_u8();
+        let _pstmt_name = read_null_terminated(&mut buf)?;
+
+        // FIXME: see FeParseMessage::parse
+        /*
+        if pstmt_name.len() != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "named prepared statements not implemented in Describe",
+            ));
+        }
+        */
+
+        if kind != 0x53 { // 'S'
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "only prepared statement Describe is implemented",
+            ));
+        }
+
+
+        Ok(FeMessage::Describe(FeDescribeMessage {kind}))
+    }
+}
+
+// we only support unnamed prepared stmt or portal
+#[derive(Debug)]
+struct FeExecuteMessage {
+    maxrows: i32, // max # of rows
+}
+
+impl FeExecuteMessage {
+    pub fn parse(body: Bytes) -> Result<FeMessage> {
+        let mut buf = body.clone();
+        let portal_name = read_null_terminated(&mut buf)?;
+        let maxrows = buf.get_i32();
+
+        if portal_name.len() != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "named portals not implemented",
+            ));
+        }
+
+        if maxrows != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "row limit in Execute message not supported",
+            ));
+        }
+
+        Ok(FeMessage::Execute(FeExecuteMessage {maxrows}))
+    }
+}
+
+// we only support unnamed prepared stmt and portal
+#[derive(Debug)]
+struct FeBindMessage {
+}
+
+impl FeBindMessage {
+    pub fn parse(body: Bytes) -> Result<FeMessage> {
+        let mut buf = body.clone();
+        let portal_name = read_null_terminated(&mut buf)?;
+        let _pstmt_name = read_null_terminated(&mut buf)?;
+
+        if portal_name.len() != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "named portals not implemented",
+            ));
+        }
+
+        // FIXME: see FeParseMessage::parse
+        /*
+        if pstmt_name.len() != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "named prepared statements not implemented",
+            ));
+        }
+        */
+
+        Ok(FeMessage::Bind(FeBindMessage {}))
+    }
+}
+
+// we only support unnamed prepared stmt and portal
+#[derive(Debug)]
+struct FeCloseMessage {
+}
+
+impl FeCloseMessage {
+    pub fn parse(body: Bytes) -> Result<FeMessage> {
+        let mut buf = body.clone();
+        let _kind = buf.get_u8();
+        let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
+
+        // FIXME: we do nothing with Close
+
+        Ok(FeMessage::Close(FeCloseMessage {}))
+    }
+}
+
 impl FeMessage {
     pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
         if buf.len() < 5 {
@@ -173,10 +366,18 @@ impl FeMessage {
         let mut body = buf.split_to(total_len);
         body.advance(5);

+        let mut body = body.freeze();
+
         match tag {
             b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
-                body: body.freeze(),
+                body,
             }))),
+            b'P' => Ok(Some(FeParseMessage::parse(body)?)),
+            b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
+            b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
+            b'B' => Ok(Some(FeBindMessage::parse(body)?)),
+            b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
+            b'S' => Ok(Some(FeMessage::Sync)),
             b'X' => Ok(Some(FeMessage::Terminate)),
             b'd' => {
                 let smgr_tag = body.get_u8();
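
The messages above implement just enough of PostgreSQL's extended query protocol for one unnamed statement at a time. For reference, a client-side sketch of how a Parse message is framed on the wire (the length field counts itself but not the tag byte); this mirrors what `FeParseMessage::parse` expects, but is not part of the patch:

use bytes::{BufMut, BytesMut};

// Frame a v3 Parse message for the unnamed prepared statement:
// tag 'P', i32 length, empty statement name, query string, zero params.
fn frame_parse_message(query: &str) -> BytesMut {
    let mut buf = BytesMut::new();
    buf.put_u8(b'P');
    let body_len = 4 + 1 + (query.len() + 1) + 2; // length field + "" + query\0 + i16 param count
    buf.put_i32(body_len as i32);
    buf.put_u8(0); // unnamed prepared statement: empty name
    buf.put_slice(query.as_bytes());
    buf.put_u8(0); // null terminator of the query string
    buf.put_i16(0); // no parameter types
    buf
}
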
@@ -228,13 +429,15 @@ pub fn thread_main(conf: &PageServerConf) {
     info!("Starting page server on {}", conf.listen_addr);

-    runtime.block_on(async {
+    let runtime_ref = Arc::new(runtime);
+
+    runtime_ref.clone().block_on(async {
         let listener = TcpListener::bind(conf.listen_addr).await.unwrap();

         loop {
             let (socket, peer_addr) = listener.accept().await.unwrap();
             debug!("accepted connection from {}", peer_addr);
-            let mut conn_handler = Connection::new(conf.clone(), socket);
+            let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref);

             task::spawn(async move {
                 if let Err(err) = conn_handler.run().await {
@@ -251,15 +454,17 @@ struct Connection {
     buffer: BytesMut,
     init_done: bool,
     conf: PageServerConf,
+    runtime: Arc<Runtime>,
 }

 impl Connection {
-    pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
+    pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc<Runtime>) -> Connection {
         Connection {
             stream: BufWriter::new(socket),
             buffer: BytesMut::with_capacity(10 * 1024),
             init_done: false,
             conf,
+            runtime: runtime.clone(),
         }
     }
@@ -307,6 +512,33 @@ impl Connection {
                 self.stream.write_u8(b'I').await?;
             }

+            BeMessage::ParseComplete => {
+                self.stream.write_u8(b'1').await?;
+                self.stream.write_i32(4).await?;
+            }
+
+            BeMessage::BindComplete => {
+                self.stream.write_u8(b'2').await?;
+                self.stream.write_i32(4).await?;
+            }
+
+            BeMessage::CloseComplete => {
+                self.stream.write_u8(b'3').await?;
+                self.stream.write_i32(4).await?;
+            }
+
+            BeMessage::NoData => {
+                self.stream.write_u8(b'n').await?;
+                self.stream.write_i32(4).await?;
+            }
+
+            BeMessage::ParameterDescription => {
+                self.stream.write_u8(b't').await?;
+                self.stream.write_i32(6).await?;
+                // we don't support params, so always 0
+                self.stream.write_i16(0).await?;
+            }
+
             BeMessage::RowDescription => {
                 // XXX
                 let mut b = Bytes::from("data\0");
@@ -396,8 +628,12 @@ impl Connection {
     }

     async fn run(&mut self) -> Result<()> {
+
+        let mut unnamed_query_string = Bytes::new();
         loop {
-            match self.read_message().await? {
+            let msg = self.read_message().await?;
+            info!("got message {:?}", msg);
+            match msg {
                 Some(FeMessage::StartupMessage(m)) => {
                     trace!("got message {:?}", m);
@@ -417,7 +653,27 @@ impl Connection {
                     }
                 }
                 Some(FeMessage::Query(m)) => {
-                    self.process_query(&m).await?;
+                    self.process_query(m.body).await?;
                 }
+                Some(FeMessage::Parse(m)) => {
+                    unnamed_query_string = m.query_string;
+                    self.write_message(&BeMessage::ParseComplete).await?;
+                }
+                Some(FeMessage::Describe(_)) => {
+                    self.write_message_noflush(&BeMessage::ParameterDescription).await?;
+                    self.write_message(&BeMessage::NoData).await?;
+                }
+                Some(FeMessage::Bind(_)) => {
+                    self.write_message(&BeMessage::BindComplete).await?;
+                }
+                Some(FeMessage::Close(_)) => {
+                    self.write_message(&BeMessage::CloseComplete).await?;
+                }
+                Some(FeMessage::Execute(_)) => {
+                    self.process_query(unnamed_query_string.clone()).await?;
+                }
+                Some(FeMessage::Sync) => {
+                    self.write_message(&BeMessage::ReadyForQuery).await?;
+                }
                 Some(FeMessage::Terminate) => {
                     break;
@@ -426,7 +682,8 @@ impl Connection {
                     info!("connection closed");
                     break;
                 }
-                _ => {
+                x => {
+                    error!("unexpected message type : {:?}", x);
                     return Err(io::Error::new(io::ErrorKind::Other, "unexpected message"));
                 }
             }
@@ -435,41 +692,59 @@ impl Connection {
         Ok(())
     }

-    async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> {
-        trace!("got query {:?}", q.body);
+    async fn process_query(&mut self, query_string: Bytes) -> Result<()> {
+        debug!("process query {:?}", query_string);

-        if q.body.starts_with(b"controlfile") {
+        // remove null terminator, if any
+        let mut query_string = query_string.clone();
+        if query_string.last() == Some(&0) {
+            query_string.truncate(query_string.len() - 1);
+        }
+
+        if query_string.starts_with(b"controlfile") {
             self.handle_controlfile().await
-        } else if q.body.starts_with(b"pagestream ") {
-            let (_l, r) = q.body.split_at("pagestream ".len());
-            let mut r = r.to_vec();
-            r.pop();
-            let sysid = String::from_utf8(r).unwrap().trim().to_string();
-            let sysid: u64 = sysid.parse().unwrap(); // XXX
+        } else if query_string.starts_with(b"pagestream ") {
+            let (_l, r) = query_string.split_at("pagestream ".len());
+            let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
+            let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();

-            self.handle_pagerequests(sysid).await
-        } else if q.body.starts_with(b"callmemaybe ") {
-            let (_l, r) = q.body.split_at("callmemaybe ".len());
-            let mut r = r.to_vec();
-            r.pop();
-            let connstr = String::from_utf8(r).unwrap().trim().to_string();
+            self.handle_pagerequests(timelineid).await
+        } else if query_string.starts_with(b"basebackup ") {
+            let (_l, r) = query_string.split_at("basebackup ".len());
+            let r = r.to_vec();
+            let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end());
+            info!("got basebackup command: \"{}\"", timelineid_str);
+            let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();

-            let conf_copy = self.conf.clone();
-            let _walreceiver_thread = thread::Builder::new()
-                .name("WAL receiver thread".into())
-                .spawn(move || {
-                    walreceiver::thread_main(&conf_copy, &connstr);
-                })
-                .unwrap();
+            // Check that the timeline exists
+            self.handle_basebackup_request(timelineid).await?;
+            self.write_message_noflush(&BeMessage::CommandComplete).await?;
+            self.write_message(&BeMessage::ReadyForQuery).await
+        } else if query_string.starts_with(b"callmemaybe ") {
+            let query_str = String::from_utf8(query_string.to_vec()).unwrap().to_string();
+
+            // callmemaybe <zenith timelineid as hex string> <connection string>
+            let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
+            let caps = re.captures(&query_str);
+            let caps = caps.unwrap();
+
+            let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
+            let connstr: String = String::from(caps.get(2).unwrap().as_str());
+
+            // Check that the timeline exists
+            let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
+            if pcache.is_err() {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid)));
+            }
+
+            walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);

-            // generic ack:
-            self.write_message_noflush(&BeMessage::RowDescription)
-                .await?;
-            self.write_message_noflush(&BeMessage::DataRow).await?;
             self.write_message_noflush(&BeMessage::CommandComplete)
                 .await?;
             self.write_message(&BeMessage::ReadyForQuery).await
-        } else if q.body.starts_with(b"status") {
+        } else if query_string.starts_with(b"status") {
             self.write_message_noflush(&BeMessage::RowDescription)
                 .await?;
             self.write_message_noflush(&BeMessage::DataRow).await?;
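
For context, a hedged sketch of the client side of the `callmemaybe` command parsed above, using the `postgres` crate's simple-query path (the connection parameters are placeholders; `no_db`/`no_user` match what the control plane uses elsewhere in this patch):

use postgres::{Client, NoTls};

// Ask the page server to start streaming WAL for a timeline from the
// given WAL producer; both arguments are passed through verbatim.
fn request_wal_streaming(timelineid_hex: &str, wal_producer_connstr: &str) -> Result<(), postgres::Error> {
    let mut client = Client::connect("host=127.0.0.1 port=5430 user=no_user dbname=no_db", NoTls)?;
    let cmd = format!("callmemaybe {} {}", timelineid_hex, wal_producer_connstr);
    client.simple_query(&cmd)?;
    Ok(())
}
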
@@ -495,7 +770,17 @@
         self.write_message(&BeMessage::ReadyForQuery).await
     }

-    async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> {
+    async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
+
+        // Check that the timeline exists
+        let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
+        if pcache.is_err() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("client requested pagestream on timeline {} which does not exist in page server", timelineid)));
+        }
+        let pcache = pcache.unwrap();
+
         /* switch client to COPYBOTH */
         self.stream.write_u8(b'W').await?;
         self.stream.write_i32(4 + 1 + 2).await?;
         self.stream.write_u8(0).await?; /* copy_is_binary */
         self.stream.write_i16(0).await?; /* numAttributes */
         self.stream.flush().await?;

-        let pcache = page_cache::get_pagecache(&self.conf, sysid);
-
         loop {
             let message = self.read_message().await?;

             if let Some(m) = &message {
-                info!("query({}): {:?}", sysid, m);
+                info!("query({:?}): {:?}", timelineid, m);
             };

             if message.is_none() {
@@ -628,4 +911,102 @@
         }
     }
+
+    async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
+        // check that the timeline exists
+        let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
+        if pcache.is_err() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("client requested basebackup on timeline {} which does not exist in page server", timelineid)));
+        }
+
+        /* switch client to COPYOUT */
+        let stream = &mut self.stream;
+        stream.write_u8(b'H').await?;
+        stream.write_i32(4 + 1 + 2).await?;
+        stream.write_u8(0).await?; /* copy_is_binary */
+        stream.write_i16(0).await?; /* numAttributes */
+        stream.flush().await?;
+        info!("sent CopyOut");
+
+        /* Send a tarball of the latest snapshot on the timeline */
+
+        // find latest snapshot
+        let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
+
+        // Stream it
+        let (s, mut r) = mpsc::channel(5);
+
+        let f_tar = task::spawn_blocking(move || {
+            basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?;
+            Ok(())
+        });
+        let f_tar2 = async {
+            let joinres = f_tar.await;
+
+            if joinres.is_err() {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    joinres.unwrap_err()
+                ));
+            }
+            return joinres.unwrap();
+        };
+
+        let f_pump = async move {
+            loop {
+                let buf = r.recv().await;
+                if buf.is_none() {
+                    break;
+                }
+                let buf = buf.unwrap();
+
+                // CopyData
+                stream.write_u8(b'd').await?;
+                stream.write_u32((4 + buf.len()) as u32).await?;
+                stream.write_all(&buf).await?;
+                trace!("CopyData sent for {} bytes!", buf.len());
+
+                // FIXME: flush isn't really required, but makes it easier
+                // to view in wireshark
+                stream.flush().await?;
+            }
+            Ok(())
+        };
+
+        tokio::try_join!(f_tar2, f_pump)?;
+
+        // CopyDone
+        self.stream.write_u8(b'c').await?;
+        self.stream.write_u32(4).await?;
+        self.stream.flush().await?;
+        debug!("CopyDone sent!");
+
+        // FIXME: I'm getting an error from the tokio copyout driver without this.
+        // I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
+        // are sent in the same TCP packet as the CopyDone. I don't understand why.
+        thread::sleep(std::time::Duration::from_secs(1));
+
+        Ok(())
+    }
+}
+
+struct CopyDataSink(mpsc::Sender<Bytes>);
+
+impl std::io::Write for CopyDataSink {
+    fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
+
+        let buf = Bytes::copy_from_slice(data);
+
+        if let Err(e) = self.0.blocking_send(buf) {
+            return Err(io::Error::new(io::ErrorKind::Other, e));
+        }
+
+        Ok(data.len())
+    }
+    fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
+        // no-op
+        Ok(())
+    }
+}
diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs
deleted file mode 100644
index 3b4f303bbc..0000000000
--- a/pageserver/src/restore_datadir.rs
+++ /dev/null
@@ -1,339 +0,0 @@
-//
-// Restore chunks from S3
-//
-// This runs once at Page Server startup. It loads all the "base images" from
-// S3 into the in-memory page cache. It also initializes the "last valid LSN"
-// in the page cache to the LSN of the base image, so that when the WAL receiver
-// is started, it starts streaming from that LSN.
-// - -use bytes::{Buf, BytesMut}; -use log::*; -use regex::Regex; -use std::env; -use std::fmt; - -use tokio::runtime; - -use futures::future; - -use crate::{page_cache, pg_constants, PageServerConf}; -use std::fs; -use walkdir::WalkDir; - -pub fn restore_main(conf: &PageServerConf) { - // Create a new thread pool - let runtime = runtime::Runtime::new().unwrap(); - - runtime.block_on(async { - let result = restore_chunk(conf).await; - - match result { - Ok(_) => { - return; - } - Err(err) => { - error!("error: {}", err); - return; - } - } - }); -} - -async fn restore_chunk(conf: &PageServerConf) -> Result<(), FilePathError> { - let pgdata_base_path = env::var("PGDATA_BASE_PATH").unwrap(); - info!("Restoring from local dir..."); - - let sys_id: u64 = 42; - let control_lsn = 0; //TODO get it from sysid - let mut slurp_futures: Vec<_> = Vec::new(); - - for e in WalkDir::new(pgdata_base_path.clone()) { - let entry = e.unwrap(); - - if !entry.path().is_dir() { - let path = entry.path().to_str().unwrap(); - - let relpath = path - .strip_prefix(&format!("{}/", pgdata_base_path)) - .unwrap(); - info!( - "Restoring file {} relpath {}", - entry.path().display(), - relpath - ); - - let parsed = parse_rel_file_path(&relpath); - - match parsed { - Ok(mut p) => { - p.lsn = control_lsn; - - let f = slurp_base_file(conf, sys_id, path.to_string(), p); - - slurp_futures.push(f); - } - Err(e) => { - warn!("unrecognized file: {} ({})", relpath, e); - } - }; - } - } - - let pcache = page_cache::get_pagecache(conf, sys_id); - pcache.init_valid_lsn(control_lsn); - - info!("{} files to restore...", slurp_futures.len()); - - future::join_all(slurp_futures).await; - info!("restored!"); - Ok(()) -} - -#[derive(Debug)] -struct FilePathError { - msg: String, -} - -impl FilePathError { - fn new(msg: &str) -> FilePathError { - FilePathError { - msg: msg.to_string(), - } - } -} - -impl From for FilePathError { - fn from(e: core::num::ParseIntError) -> Self { - return FilePathError { - msg: format!("invalid filename: {}", e), - }; - } -} - -impl fmt::Display for FilePathError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "invalid filename") - } -} - -fn forkname_to_forknum(forkname: Option<&str>) -> Result { - match forkname { - // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(0), - Some("fsm") => Ok(1), - Some("vm") => Ok(2), - Some("init") => Ok(3), - Some(_) => Err(FilePathError::new("invalid forkname")), - } -} - -#[derive(Debug)] -struct ParsedBaseImageFileName { - pub spcnode: u32, - pub dbnode: u32, - pub relnode: u32, - pub forknum: u32, - pub segno: u32, - - pub lsn: u64, -} - -// formats: -// -// _ -// . -// _. -fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> { - let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); - - let caps = re - .captures(fname) - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - - let relnode_str = caps.name("relnode").unwrap().as_str(); - let relnode = u32::from_str_radix(relnode_str, 10)?; - - let forkname_match = caps.name("forkname"); - let forkname = if forkname_match.is_none() { - None - } else { - Some(forkname_match.unwrap().as_str()) - }; - let forknum = forkname_to_forknum(forkname)?; - - let segno_match = caps.name("segno"); - let segno = if segno_match.is_none() { - 0 - } else { - u32::from_str_radix(segno_match.unwrap().as_str(), 10)? 
- };
- return Ok((relnode, forknum, segno, 0));
-}
-
-fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
-    /*
-     * Relation data files can be in one of the following directories:
-     *
-     * global/
-     *      shared relations
-     *
-     * base/<dboid>/
-     *      regular relations, default tablespace
-     *
-     * pg_tblspc/<spcnode>/<version>/
-     *      within a non-default tablespace (the name of the directory
-     *      depends on version)
-     *
-     * And the relation data files themselves have a filename like:
-     *
-     * <relnode>.<segno>
-     */
-    if let Some(fname) = path.strip_prefix("global/") {
-        if fname.contains("pg_control") {
-            return Ok(ParsedBaseImageFileName {
-                spcnode: pg_constants::GLOBALTABLESPACE_OID,
-                dbnode: 0,
-                relnode: 0,
-                forknum: pg_constants::PG_CONTROLFILE_FORKNUM,
-                segno: 0,
-                lsn: 0,
-            });
-        }
-
-        if fname.contains("pg_filenode") {
-            return Ok(ParsedBaseImageFileName {
-                spcnode: pg_constants::GLOBALTABLESPACE_OID,
-                dbnode: 0,
-                relnode: 0,
-                forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
-                segno: 0,
-                lsn: 0,
-            });
-        }
-
-        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
-
-        return Ok(ParsedBaseImageFileName {
-            spcnode: pg_constants::GLOBALTABLESPACE_OID,
-            dbnode: 0,
-            relnode,
-            forknum,
-            segno,
-            lsn,
-        });
-    } else if let Some(dbpath) = path.strip_prefix("base/") {
-        let mut s = dbpath.split("/");
-        let dbnode_str = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        let dbnode = u32::from_str_radix(dbnode_str, 10)?;
-        let fname = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        if s.next().is_some() {
-            return Err(FilePathError::new("invalid relation data file name"));
-        };
-
-        if fname.contains("pg_filenode") {
-            return Ok(ParsedBaseImageFileName {
-                spcnode: pg_constants::DEFAULTTABLESPACE_OID,
-                dbnode: dbnode,
-                relnode: 0,
-                forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
-                segno: 0,
-                lsn: 0,
-            });
-        }
-
-        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
-
-        return Ok(ParsedBaseImageFileName {
-            spcnode: pg_constants::DEFAULTTABLESPACE_OID,
-            dbnode,
-            relnode,
-            forknum,
-            segno,
-            lsn,
-        });
-    } else if let Some(fname) = path.strip_prefix("pg_xact/") {
-        return Ok(ParsedBaseImageFileName {
-            spcnode: 0,
-            dbnode: 0,
-            relnode: 0,
-            forknum: pg_constants::PG_XACT_FORKNUM,
-            segno: u32::from_str_radix(fname, 10).unwrap(),
-            lsn: 0,
-        });
-    } else if let Some(fname) = path.strip_prefix("pg_multixact/members/") {
-        return Ok(ParsedBaseImageFileName {
-            spcnode: 0,
-            dbnode: 0,
-            relnode: 0,
-            forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM,
-            segno: u32::from_str_radix(fname, 10).unwrap(),
-            lsn: 0,
-        });
-    } else if let Some(fname) = path.strip_prefix("pg_multixact/offsets/") {
-        return Ok(ParsedBaseImageFileName {
-            spcnode: 0,
-            dbnode: 0,
-            relnode: 0,
-            forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM,
-            segno: u32::from_str_radix(fname, 10).unwrap(),
-            lsn: 0,
-        });
-    } else if let Some(_) = path.strip_prefix("pg_tblspc/") {
-        // TODO
-        return Err(FilePathError::new("tablespaces not supported"));
-    } else {
-        return Err(FilePathError::new("invalid relation data file name"));
-    }
-}
-
-async fn slurp_base_file(
-    conf: &PageServerConf,
-    sys_id: u64,
-    file_path: String,
-    parsed: ParsedBaseImageFileName,
-) {
-    info!("slurp_base_file local path {}", file_path);
-
-    let mut data = fs::read(file_path).unwrap();
-
-    // pg_filenode.map has non-standard size - 512 bytes
-    // enlarge it to treat as a regular page
-    if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM {
-        data.resize(8192, 0);
-    }
-
-    let data_bytes: &[u8] = &data;
-    let mut bytes = BytesMut::from(data_bytes).freeze();
-
-    // FIXME: use constants (BLCKSZ)
-    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
-
-    let pcache = page_cache::get_pagecache(conf, sys_id);
-
-    let reltag = page_cache::RelTag {
-        spcnode: parsed.spcnode,
-        dbnode: parsed.dbnode,
-        relnode: parsed.relnode,
-        forknum: parsed.forknum as u8,
-    };
-
-    while bytes.remaining() >= 8192 {
-        let tag = page_cache::BufferTag {
-            spcnode: parsed.spcnode,
-            dbnode: parsed.dbnode,
-            relnode: parsed.relnode,
-            forknum: parsed.forknum as u8,
-            blknum: blknum,
-        };
-
-        pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
-
-        pcache.relsize_inc(&reltag, blknum + 1);
-        blknum += 1;
-    }
-}
diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs
new file mode 100644
index 0000000000..c53c04ef92
--- /dev/null
+++ b/pageserver/src/restore_local_repo.rs
@@ -0,0 +1,460 @@
+//
+// Restore chunks from local Zenith repository
+//
+// This runs once at Page Server startup. It loads all the "snapshots" and all
+// WAL from all timelines from the local zenith repository into the in-memory page
+// cache.
+//
+// This also initializes the "last valid LSN" in the page cache to the last LSN
+// seen in the WAL, so that when the WAL receiver is started, it starts
+// streaming from that LSN.
+//
+
+use log::*;
+use regex::Regex;
+use std::fmt;
+
+use std::error::Error;
+use std::fs;
+use std::fs::File;
+use std::io::Read;
+use std::io::Seek;
+use std::io::SeekFrom;
+use std::path::{Path, PathBuf};
+use std::cmp::max;
+
+use anyhow::Result;
+use bytes::Bytes;
+
+use crate::page_cache;
+use crate::page_cache::PageCache;
+use crate::PageServerConf;
+use crate::page_cache::BufferTag;
+use crate::waldecoder::WalStreamDecoder;
+use crate::ZTimelineId;
+
+
+// From pg_tablespace_d.h
+//
+// FIXME: we'll probably need these elsewhere too, move to some common location
+const DEFAULTTABLESPACE_OID: u32 = 1663;
+const GLOBALTABLESPACE_OID: u32 = 1664;
+
+//
+// Load it all into the page cache.
+//
+pub fn restore_timeline(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId) -> Result<()> {
+
+    let timelinepath = PathBuf::from("timelines").join(&timeline.to_str());
+
+    if !timelinepath.exists() {
+        anyhow::bail!("timeline {} does not exist in the page server's repository", timeline);
+    }
+
+    // Scan .zenith/timelines/<timelineid>/snapshots
+    let snapshotspath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots";
+
+    let mut last_snapshot_lsn: u64 = 0;
+
+    for direntry in fs::read_dir(&snapshotspath).unwrap() {
+        let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
+
+        let lsn = u64::from_str_radix(&filename, 16)?;
+        last_snapshot_lsn = max(lsn, last_snapshot_lsn);
+
+        restore_snapshot(conf, pcache, timeline, &filename)?;
+        info!("restored snapshot at {}", filename);
+    }
+
+    if last_snapshot_lsn == 0 {
+        error!("could not find valid snapshot in {}", &snapshotspath);
+        // TODO return error?
+    }
+    pcache.init_valid_lsn(last_snapshot_lsn);
+
+    restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
+
+    Ok(())
+}
+
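+// Snapshot directories are named after the LSN at which the snapshot was
+// taken, as a plain hex number. For example (illustrative value), a snapshot
+// taken at LSN 0/169AD58 lives under:
+//
+//     timelines/<timelineid>/snapshots/000000000169AD58/
+//
+// so locating the latest snapshot is just a base-16 parse of each directory
+// name, keeping the maximum:
+//
+//     let lsn = u64::from_str_radix("000000000169AD58", 16)?; // == 0x169AD58
+//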
+pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
+
+    let snapshotspath = format!("timelines/{}/snapshots", timeline);
+
+    let mut last_snapshot_lsn = 0;
+    for direntry in fs::read_dir(&snapshotspath).unwrap() {
+        let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
+
+        let lsn = u64::from_str_radix(&filename, 16)?;
+        last_snapshot_lsn = max(lsn, last_snapshot_lsn);
+    }
+
+    if last_snapshot_lsn == 0 {
+        error!("could not find valid snapshot in {}", &snapshotspath);
+        // TODO return error?
+    }
+    Ok(last_snapshot_lsn)
+}
+
+fn restore_snapshot(conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, snapshot: &str) -> Result<()> {
+
+    let snapshotpath = "timelines/".to_owned() + &timeline.to_str() + "/snapshots/" + snapshot;
+
+    // Scan 'global'
+    let paths = fs::read_dir(snapshotpath.clone() + "/global").unwrap();
+
+    for direntry in paths {
+        let path = direntry.unwrap().path();
+        let filename = path.file_name();
+        if filename.is_none() {
+            continue;
+        }
+        let filename = filename.unwrap().to_str();
+
+        if filename == Some("pg_control") {
+            continue;
+        }
+        if filename == Some("pg_filenode.map") {
+            continue;
+        }
+
+        restore_relfile(conf, pcache, timeline, snapshot, GLOBALTABLESPACE_OID, 0, &path)?;
+    }
+
+    // Scan 'base'
+    let paths = fs::read_dir(snapshotpath.clone() + "/base").unwrap();
+    for path in paths {
+        let path = path.unwrap();
+        let filename = path.file_name().to_str().unwrap().to_owned();
+
+        // Scan database dirs
+        let dboid = u32::from_str_radix(&filename, 10)?;
+
+        let paths = fs::read_dir(path.path()).unwrap();
+        for direntry in paths {
+            let path = direntry.unwrap().path();
+            let filename = path.file_name();
+            if filename.is_none() {
+                continue;
+            }
+            let filename = filename.unwrap().to_str();
+            if filename == Some("PG_VERSION") {
+                continue;
+            }
+            if filename == Some("pg_filenode.map") {
+                continue;
+            }
+
+            restore_relfile(conf, pcache, timeline, snapshot, DEFAULTTABLESPACE_OID, dboid, &path)?;
+        }
+    }
+
+    // TODO: Scan pg_tblspc
+
+    Ok(())
+}
+
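+// Relations larger than 1 GB are split into 1 GB segment files, so segment
+// N of a relation starts at block number N * (1 GB / 8 KB page size), i.e.
+// N * 131072. For example (illustrative), the blocks in segment file
+// "16384.2" start at:
+//
+//     let blknum = 2 * (1024 * 1024 * 1024 / 8192); // == 262144
+//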
+fn restore_relfile(_conf: &PageServerConf, pcache: &PageCache, _timeline: ZTimelineId, snapshot: &str, spcoid: u32, dboid: u32, path: &Path) -> Result<()> {
+
+    let lsn = u64::from_str_radix(snapshot, 16)?;
+
+    // Does it look like a relation file?
+
+    let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
+    if p.is_err() {
+        let e = p.unwrap_err();
+        warn!("unrecognized file in snapshot: {:?} ({})", path, e);
+        return Err(e)?;
+    }
+    let (relnode, forknum, segno) = p.unwrap();
+
+    let mut file = File::open(path)?;
+    let mut buf: [u8; 8192] = [0u8; 8192];
+
+    // FIXME: use constants (BLCKSZ)
+    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
+    loop {
+        let r = file.read_exact(&mut buf);
+        match r {
+            Ok(_) => {
+                let tag = page_cache::BufferTag {
+                    spcnode: spcoid,
+                    dbnode: dboid,
+                    relnode: relnode,
+                    forknum: forknum as u8,
+                    blknum: blknum,
+                };
+                pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
+                /*
+                if oldest_lsn == 0 || p.lsn < oldest_lsn {
+                    oldest_lsn = p.lsn;
+                }
+                */
+            }
+
+            // TODO: UnexpectedEof is expected
+            Err(e) => match e.kind() {
+                std::io::ErrorKind::UnexpectedEof => {
+                    // reached EOF. That's expected.
+                    // FIXME: maybe check that we read the full length of the file?
+                    break;
+                },
+                _ => {
+                    error!("error reading file: {:?} ({})", path, e);
+                    break;
+                }
+            }
+        };
+        blknum += 1;
+    }
+
+    let tag = page_cache::RelTag {
+        spcnode: spcoid,
+        dbnode: dboid,
+        relnode: relnode,
+        forknum: forknum as u8,
+    };
+    pcache.relsize_inc(&tag, Some(blknum));
+
+    Ok(())
+}
+
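+// A WAL position maps to a (segment, offset) pair: with the default 16 MB
+// segments, the segment number is LSN / 16 MB and the offset is LSN % 16 MB.
+// For example (illustrative LSN):
+//
+//     let lsn: u64 = 0x1_016D_3DD8;
+//     let segno = XLByteToSeg(lsn, 16 * 1024 * 1024);         // == 0x101
+//     let offset = XLogSegmentOffset(lsn, 16 * 1024 * 1024);  // == 0x6D3DD8
+//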
+// Scan WAL on a timeline, starting from given LSN, and load all the records
+// into the page cache.
+fn restore_wal(_conf: &PageServerConf, pcache: &PageCache, timeline: ZTimelineId, startpoint: u64) -> Result<()> {
+    let walpath = format!("timelines/{}/wal", timeline);
+
+    let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
+
+    let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
+    let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
+    let mut last_lsn = 0;
+    loop {
+        // FIXME: assume postgresql tli 1 for now
+        let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
+        let mut path = walpath.clone() + "/" + &filename;
+
+        // The last segment may still be incomplete, stored with a .partial suffix
+        if !PathBuf::from(&path).exists() {
+            path = path + ".partial";
+        }
+
+        // Slurp the WAL file
+        let open_result = File::open(&path);
+        if let Err(e) = open_result {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                break;
+            }
+            return Err(e)?;
+        }
+        let mut file = open_result.unwrap();
+
+        if offset > 0 {
+            file.seek(SeekFrom::Start(offset as u64))?;
+        }
+
+        let mut buf = Vec::new();
+        let nread = file.read_to_end(&mut buf)?;
+        if nread != 16 * 1024 * 1024 - offset as usize {
+            // Maybe allow this for .partial files?
+            error!("read only {} bytes from WAL file", nread);
+        }
+        waldecoder.feed_bytes(&buf);
+
+        let mut nrecords = 0;
+        loop {
+            let rec = waldecoder.poll_decode();
+            if rec.is_err() {
+                // Assume that an error means we've reached the end of
+                // a partial WAL record. So that's ok.
+                break;
+            }
+            if let Some((lsn, recdata)) = rec.unwrap() {
+                let decoded =
+                    crate::waldecoder::decode_wal_record(recdata.clone());
+
+                // Put the WAL record to the page cache. We make a separate copy of
+                // it for every block it modifies. (The actual WAL record is kept in
+                // a Bytes, which uses a reference counter for the underlying buffer,
+                // so having multiple copies of it doesn't cost that much)
+                for blk in decoded.blocks.iter() {
+                    let tag = BufferTag {
+                        spcnode: blk.rnode_spcnode,
+                        dbnode: blk.rnode_dbnode,
+                        relnode: blk.rnode_relnode,
+                        forknum: blk.forknum as u8,
+                        blknum: blk.blkno,
+                    };
+
+                    let rec = page_cache::WALRecord {
+                        lsn: lsn,
+                        will_init: blk.will_init || blk.apply_image,
+                        rec: recdata.clone(),
+                    };
+
+                    pcache.put_wal_record(tag, rec);
+                }
+
+                // Now that this record has been handled, let the page cache know that
+                // it is up-to-date to this LSN
+                pcache.advance_last_valid_lsn(lsn);
+                last_lsn = lsn;
+            } else {
+                break;
+            }
+            nrecords += 1;
+        }
+
+        info!("restored {} records from WAL file {}", nrecords, filename);
+
+        segno += 1;
+        offset = 0;
+    }
+    info!("reached end of WAL at {:X}/{:X}", last_lsn >> 32, last_lsn & 0xffffffff);
+
+    Ok(())
+}
+
+// FIXME: copied from xlog_utils.rs
+pub const XLOG_FNAME_LEN: usize = 24;
+pub type XLogRecPtr = u64;
+pub type XLogSegNo = u64;
+pub type TimeLineID = u32;
+
+#[allow(non_snake_case)]
+pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
+    return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
+}
+
+#[allow(non_snake_case)]
+pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
+    return xlogptr / wal_segsz_bytes as u64;
+}
+
+
+#[allow(non_snake_case)]
+pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
+    return format!(
+        "{:>08X}{:>08X}{:>08X}",
+        tli,
+        logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
+        logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
+    );
+}
+
+#[allow(non_snake_case)]
+pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
+    return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
+}
+
+#[allow(non_snake_case)]
+pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
+    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
+    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
+    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
+    return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
+}
+
+#[allow(non_snake_case)]
+pub fn IsXLogFileName(fname: &str) -> bool {
+    return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
+}
+
+#[allow(non_snake_case)]
+pub fn IsPartialXLogFileName(fname: &str) -> bool {
+    if let Some(basefname) = fname.strip_suffix(".partial") {
+        IsXLogFileName(basefname)
+    } else {
+        false
+    }
+}
+
+
+#[derive(Debug, Clone)]
+struct FilePathError {
+    msg: String,
+}
+
+impl Error for FilePathError {
+    fn description(&self) -> &str {
+        &self.msg
+    }
+}
+impl FilePathError {
+    fn new(msg: &str) -> FilePathError {
+        FilePathError {
+            msg: msg.to_string(),
+        }
+    }
+}
+
+impl From<core::num::ParseIntError> for FilePathError {
+    fn from(e: core::num::ParseIntError) -> Self {
+        return FilePathError {
+            msg: format!("invalid filename: {}", e),
+        };
+    }
+}
+
+impl fmt::Display for FilePathError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "invalid filename")
+    }
+}
+
+fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
+    match forkname {
+        // "main" is not in filenames, it's implicit if the fork name is not present
+        None => Ok(0),
+        Some("fsm") => Ok(1),
+        Some("vm") => Ok(2),
+        Some("init") => Ok(3),
+        Some(_) => Err(FilePathError::new("invalid forkname")),
+    }
+}
+
+#[derive(Debug)]
+struct ParsedBaseImageFileName {
+    pub spcnode: u32,
+    pub dbnode: u32,
+    pub relnode: u32,
+    pub forknum: u32,
+    pub segno: u32,
+
+    pub lsn: u64,
+}
+
+// formats:
+// <relnode>
+// <relnode>_<forkname>
+// <relnode>.<segno>
+// <relnode>_<forkname>.<segno>
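+//
+// For example (illustrative file names), these all parse successfully:
+//
+//     "16384"       -> (16384, 0 /* main */, 0)
+//     "16384_fsm"   -> (16384, 1 /* fsm */,  0)
+//     "16384.2"     -> (16384, 0 /* main */, 2)
+//     "16384_vm.1"  -> (16384, 2 /* vm */,   1)
+//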
+fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
+    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
+
+    let caps = re
+        .captures(fname)
+        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
+
+    let relnode_str = caps.name("relnode").unwrap().as_str();
+    let relnode = u32::from_str_radix(relnode_str, 10)?;
+
+    let forkname_match = caps.name("forkname");
+    let forkname = if forkname_match.is_none() {
+        None
+    } else {
+        Some(forkname_match.unwrap().as_str())
+    };
+    let forknum = forkname_to_forknum(forkname)?;
+
+    let segno_match = caps.name("segno");
+    let segno = if segno_match.is_none() {
+        0
+    } else {
+        u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
+    };
+
+    return Ok((relnode, forknum, segno));
+}
+
diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs
index 957a103f4d..d8ec810f36 100644
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -1,12 +1,8 @@
-//#![allow(non_upper_case_globals)]
-//#![allow(non_camel_case_types)]
-//#![allow(non_snake_case)]
-//#![allow(dead_code)]
-//include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
-
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use std::cmp::min;
+use std::error::Error;
+use std::fmt;
 
 use log::*;
 
@@ -19,7 +15,7 @@ const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
 
 #[repr(C)]
 #[derive(Debug)]
-struct XLogPageHeaderData {
+pub struct XLogPageHeaderData {
     xlp_magic: u16,    /* magic value for correctness checks */
     xlp_info: u16,     /* flag bits, see below */
     xlp_tli: u32,      /* TimeLineID of first record on page */
@@ -33,7 +29,7 @@ const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4;
 
 #[repr(C)]
 #[derive(Debug)]
-struct XLogLongPageHeaderData {
+pub struct XLogLongPageHeaderData {
     std: XLogPageHeaderData, /* standard header fields */
     xlp_sysid: u64,          /* system identifier from pg_control */
     xlp_seg_size: u32,       /* just as a cross-check */
@@ -57,6 +53,31 @@ pub struct WalStreamDecoder {
     recordbuf: BytesMut,
 }
 
+
+#[derive(Debug, Clone)]
+pub struct WalDecodeError {
+    msg: String,
+}
+
+impl Error for WalDecodeError {
+    fn description(&self) -> &str {
+        &self.msg
+    }
+}
+impl WalDecodeError {
+    fn new(msg: &str) -> WalDecodeError {
+        WalDecodeError {
+            msg: msg.to_string(),
+        }
+    }
+}
+
+impl fmt::Display for WalDecodeError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "WAL decoding error: {}", self.msg)
+    }
+}
+
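+// The {:X}/{:X} notation used in the error messages below is PostgreSQL's
+// usual way of printing an LSN: the high and low 32 bits of the 64-bit
+// position, both in hex. For example (illustrative value):
+//
+//     let lsn: u64 = 0x1_016D_3DD8;
+//     format!("{:X}/{:X}", lsn >> 32, lsn & 0xffffffff); // "1/16D3DD8"
+//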
 //
 // WalRecordStream is a Stream that returns a stream of WAL records
 // FIXME: This isn't a proper rust stream
@@ -81,38 +102,46 @@ impl WalStreamDecoder {
     // Returns a tuple:
     // (end LSN, record)
-    pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> {
+    pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
         loop {
             // parse and verify page boundaries as we go
             if self.lsn % WAL_SEGMENT_SIZE == 0 {
                 // parse long header
 
                 if self.inputbuf.remaining() < SizeOfXLogLongPHD {
-                    return None;
+                    return Ok(None);
                 }
 
-                self.decode_XLogLongPageHeaderData();
+                let hdr = self.decode_XLogLongPageHeaderData();
+                if hdr.std.xlp_pageaddr != self.lsn {
+                    return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}",
+                        self.lsn >> 32,
+                        self.lsn & 0xffffffff)));
+                }
+                // TODO: verify the remaining fields in the header
+
                 self.lsn += SizeOfXLogLongPHD as u64;
-
-                // TODO: verify the fields in the header
-
                 continue;
             } else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
                 // parse page header
 
                 if self.inputbuf.remaining() < SizeOfXLogShortPHD {
-                    return None;
+                    return Ok(None);
                 }
 
-                self.decode_XLogPageHeaderData();
+                let hdr = self.decode_XLogPageHeaderData();
+                if hdr.xlp_pageaddr != self.lsn {
+                    return Err(WalDecodeError::new(&format!("invalid xlog page header at {:X}/{:X}",
+                        self.lsn >> 32,
+                        self.lsn & 0xffffffff)));
+                }
+                // TODO: verify the remaining fields in the header
+
                 self.lsn += SizeOfXLogShortPHD as u64;
-
-                // TODO: verify the fields in the header
-
                 continue;
             } else if self.padlen > 0 {
                 if self.inputbuf.remaining() < self.padlen as usize {
-                    return None;
+                    return Ok(None);
                 }
 
                 // skip padding
@@ -123,20 +152,17 @@ impl WalStreamDecoder {
             // need to have at least the xl_tot_len field
 
             if self.inputbuf.remaining() < 4 {
-                return None;
+                return Ok(None);
             }
 
             // read xl_tot_len FIXME: assumes little-endian
             self.startlsn = self.lsn;
             let xl_tot_len = self.inputbuf.get_u32_le();
             if xl_tot_len < SizeOfXLogRecord {
-                error!(
-                    "invalid xl_tot_len {} at {:X}/{:X}",
-                    xl_tot_len,
-                    self.lsn >> 32,
-                    self.lsn & 0xffffffff
-                );
-                panic!();
+                return Err(WalDecodeError::new(&format!("invalid xl_tot_len {} at {:X}/{:X}",
+                    xl_tot_len,
+                    self.lsn >> 32,
+                    self.lsn & 0xffffffff)));
             }
             self.lsn += 4;
@@ -154,7 +180,7 @@ impl WalStreamDecoder {
             let n = min(self.contlen, pageleft) as usize;
 
             if self.inputbuf.remaining() < n {
-                return None;
+                return Ok(None);
             }
 
             self.recordbuf.put(self.inputbuf.split_to(n));
@@ -182,7 +208,7 @@ impl WalStreamDecoder {
                 }
 
                 let result = (self.lsn, recordbuf);
-                return Some(result);
+                return Ok(Some(result));
             }
             continue;
         }
@@ -289,7 +315,6 @@ pub struct DecodedBkpBlock {
 const SizeOfXLogRecord: u32 = 24;
 
 pub struct DecodedWALRecord {
-    pub lsn: u64, // LSN at the *end* of the record
     pub record: Bytes, // raw XLogRecord
 
     pub blocks: Vec<DecodedBkpBlock>,
@@ -321,14 +346,7 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool {
 //
 // Routines to decode a WAL record and figure out which blocks are modified
 //
-pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
-    trace!(
-        "decoding record with LSN {:08X}/{:08X} ({} bytes)",
-        lsn >> 32,
-        lsn & 0xffff_ffff,
-        rec.remaining()
-    );
-
+pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
     let mut buf = rec.clone();
 
     // FIXME: assume little-endian here
@@ -584,7 +602,6 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
     // Since we don't care about the data payloads here, we're done.
     return DecodedWALRecord {
-        lsn,
         record: rec,
         blocks,
     };
diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index 5ca5ffa199..3f8fcd8722 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -10,22 +10,72 @@ use crate::page_cache;
 use crate::page_cache::BufferTag;
 use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
 use crate::PageServerConf;
+use crate::ZTimelineId;
 use anyhow::Error;
+use lazy_static::lazy_static;
 use log::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
+use std::collections::HashMap;
+use std::fs;
+use std::fs::{File, OpenOptions};
+use std::io::{Write, Seek, SeekFrom};
+use std::path::PathBuf;
 use std::str::FromStr;
+use std::sync::Mutex;
+use std::thread;
 use tokio::runtime;
 use tokio::time::{sleep, Duration};
 use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
 use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
 use tokio_stream::StreamExt;
 
+//
+// We keep one WAL Receiver active per timeline.
+//
+struct WalReceiverEntry {
+    wal_producer_connstr: String,
+}
+
+lazy_static! {
+    static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> =
+        Mutex::new(HashMap::new());
+}
+
+// Launch a new WAL receiver, or tell one that's running about change in connection string
+pub fn launch_wal_receiver(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) {
+    let mut receivers = WAL_RECEIVERS.lock().unwrap();
+
+    match receivers.get_mut(&timelineid) {
+        Some(receiver) => {
+            receiver.wal_producer_connstr = wal_producer_connstr.into();
+        }
+        None => {
+            let receiver = WalReceiverEntry { wal_producer_connstr: wal_producer_connstr.into() };
+            receivers.insert(timelineid, receiver);
+
+            // Also launch a new thread to handle this connection
+            let conf_copy = conf.clone();
+            let _walreceiver_thread = thread::Builder::new()
+                .name("WAL receiver thread".into())
+                .spawn(move || {
+                    thread_main(&conf_copy, timelineid);
+                }).unwrap();
+        }
+    };
+}
+
+// Look up current WAL producer connection string in the hash table
+fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
+    let receivers = WAL_RECEIVERS.lock().unwrap();
+
+    receivers.get(&timelineid).unwrap().wal_producer_connstr.clone()
+}
+
 //
 // This is the entry point for the WAL receiver thread.
 //
-pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) {
-    info!("WAL receiver thread started: '{}'", wal_producer_connstr);
+fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
+    info!("WAL receiver thread started for timeline '{}'", timelineid);
 
     let runtime = runtime::Builder::new_current_thread()
         .enable_all()
@@ -34,7 +84,10 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) {
 
     runtime.block_on(async {
         loop {
-            let res = walreceiver_main(conf, wal_producer_connstr).await;
+            // Look up the current WAL producer address
+            let wal_producer_connstr = get_wal_producer_connstr(timelineid);
+
+            let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
 
             if let Err(e) = res {
                 info!(
@@ -47,7 +100,7 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) {
     });
 }
 
-async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> Result<(), Error> {
+async fn walreceiver_main(conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str) -> Result<(), Error> {
     // Connect to the database in replication mode.
     info!("connecting to {:?}", wal_producer_connstr);
     let connect_cfg = format!("{} replication=true", wal_producer_connstr);
@@ -67,7 +120,7 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) ->
     let end_of_wal = u64::from(identify.xlogpos);
     let mut caught_up = false;
 
-    let pcache = page_cache::get_pagecache(conf, identify.systemid);
+    let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
 
     //
     // Start streaming the WAL, from where we left off previously.
@@ -95,9 +148,10 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) ->
         }
     }
     debug!(
-        "starting replication from {:X}/{:X}, server is at {:X}/{:X}...",
+        "starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
         (startpoint >> 32),
         (startpoint & 0xffffffff),
+        timelineid,
         (end_of_wal >> 32),
         (end_of_wal & 0xffffffff)
     );
@@ -120,6 +174,11 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) ->
                 let startlsn = xlog_data.wal_start();
                 let endlsn = startlsn + data.len() as u64;
 
+                write_wal_file(startlsn,
+                    timelineid,
+                    16 * 1024 * 1024, // FIXME
+                    data)?;
+
                 trace!(
                     "received XLogData between {:X}/{:X} and {:X}/{:X}",
                     (startlsn >> 32),
@@ -131,8 +190,8 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) ->
                 waldecoder.feed_bytes(data);
 
                 loop {
-                    if let Some((lsn, recdata)) = waldecoder.poll_decode() {
-                        let decoded = decode_wal_record(startlsn, recdata.clone());
+                    if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                        let decoded = decode_wal_record(recdata.clone());
 
                         // Put the WAL record to the page cache. We make a separate copy of
                         // it for every block it modifies. (The actual WAL record is kept in
@@ -260,3 +319,153 @@ pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
+
+// FIXME: copied from xlog_utils.rs
+pub const XLOG_BLCKSZ: usize = 8192;
+pub type XLogRecPtr = u64;
+pub type XLogSegNo = u64;
+pub type TimeLineID = u32;
+
+#[allow(non_snake_case)]
+pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
+    return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
+}
+
+#[allow(non_snake_case)]
+pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
+    return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
+}
+
+#[allow(non_snake_case)]
+pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
+    return xlogptr / wal_segsz_bytes as u64;
+}
+
+#[allow(non_snake_case)]
+pub fn XLogSegNoOffsetToRecPtr(
+    segno: XLogSegNo,
+    offset: u32,
+    wal_segsz_bytes: usize,
+) -> XLogRecPtr {
+    return segno * (wal_segsz_bytes as u64) + (offset as u64);
+}
+
+#[allow(non_snake_case)]
+pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
+    return format!(
+        "{:>08X}{:>08X}{:>08X}",
+        tli,
+        logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
+        logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
+    );
+}
+
+#[allow(non_snake_case)]
+pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
+    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
+    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
+    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
+    return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
+}
+
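+// WAL file names encode (timeline, segment) in three 8-digit hex fields.
+// With 16 MB segments there are 0x100 segments per logical xlog ID, so, for
+// example (illustrative values), segment 0x101 on PostgreSQL timeline 1 is:
+//
+//     XLogFileName(1, 0x101, 16 * 1024 * 1024); // "000000010000000100000001"
+//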
+
+fn write_wal_file(
+    startpos: XLogRecPtr,
+    timeline: ZTimelineId,
+    wal_seg_size: usize,
+    buf: &[u8],
+) -> anyhow::Result<()> {
+    let mut bytes_left: usize = buf.len();
+    let mut bytes_written: usize = 0;
+    let mut partial;
+    let mut start_pos = startpos;
+    const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
+
+    let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
+
+    /* Extract WAL location for this block */
+    let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
+
+    while bytes_left != 0 {
+        let bytes_to_write;
+
+        /*
+         * If crossing a WAL boundary, only write up until we reach wal
+         * segment size.
+         */
+        if xlogoff + bytes_left > wal_seg_size {
+            bytes_to_write = wal_seg_size - xlogoff;
+        } else {
+            bytes_to_write = bytes_left;
+        }
+
+        /* Open file */
+        let segno = XLByteToSeg(start_pos, wal_seg_size);
+        let wal_file_name = XLogFileName(1, // FIXME: always use Postgres timeline 1
+            segno, wal_seg_size);
+        let wal_file_path = wal_dir
+            .join(wal_file_name.clone());
+        let wal_file_partial_path = wal_dir
+            .join(wal_file_name.clone() + ".partial");
+
+        {
+            let mut wal_file: File;
+            /* Try to open already completed segment */
+            if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
+                wal_file = file;
+                partial = false;
+            } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path)
+            {
+                /* Try to open existing partial file */
+                wal_file = file;
+                partial = true;
+            } else {
+                /* Create and fill new partial file */
+                partial = true;
+                match OpenOptions::new()
+                    .create(true)
+                    .write(true)
+                    .open(&wal_file_partial_path)
+                {
+                    Ok(mut file) => {
+                        for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
+                            file.write_all(&ZERO_BLOCK)?;
+                        }
+                        wal_file = file;
+                    }
+                    Err(e) => {
+                        error!("Failed to open log file {:?}: {}", &wal_file_path, e);
+                        return Err(e.into());
+                    }
+                }
+            }
+            wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
+            wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
+
+            // FIXME: Flush the file
+            //wal_file.sync_all()?;
+        }
+        /* Write was successful, advance our position */
+        bytes_written += bytes_to_write;
+        bytes_left -= bytes_to_write;
+        start_pos += bytes_to_write as u64;
+        xlogoff += bytes_to_write;
+
+        /* Did we reach the end of a WAL segment? */
+        if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
+            xlogoff = 0;
+            if partial {
+                fs::rename(&wal_file_partial_path, &wal_file_path)?;
+            }
+        }
+    }
+    Ok(())
+}
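+// For example (illustrative numbers): a write of 8192 bytes starting 4096
+// bytes before a 16 MB segment boundary is split in two. The first 4096
+// bytes complete the current segment, whose ".partial" file is then renamed
+// to its final name, and the remaining 4096 bytes go to the start of the
+// next segment's ".partial" file.
+//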
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index e3a0510080..9b0010a1be 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,7 @@ use std::io::Error;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
-use std::{path::PathBuf, process::Stdio};
+use std::process::Stdio;
 use tokio::io::AsyncBufReadExt;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::process::{Child, ChildStdin, ChildStdout, Command};
@@ -35,14 +35,15 @@ use crate::page_cache;
 use crate::page_cache::CacheEntry;
 use crate::page_cache::WALRecord;
 use crate::{page_cache::BufferTag, PageServerConf};
+use crate::ZTimelineId;
 
 static TIMEOUT: Duration = Duration::from_secs(20);
 
 //
 // Main entry point for the WAL applicator thread.
 //
-pub fn wal_redo_main(conf: &PageServerConf, sys_id: u64) {
-    info!("WAL redo thread started {}", sys_id);
+pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
+    info!("WAL redo thread started {}", timelineid);
 
     // We block on waiting for requests on the walredo request channel, but
     // use async I/O to communicate with the child process. Initialize the
@@ -52,15 +53,15 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
         .build()
         .unwrap();
 
-    let pcache = page_cache::get_pagecache(conf, sys_id);
+    let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
 
     // Loop forever, handling requests as they come.
     let walredo_channel_receiver = &pcache.walredo_receiver;
     loop {
         let mut process: WalRedoProcess;
-        let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id));
+        let datadir = format!("wal-redo/{}", timelineid);
 
-        info!("launching WAL redo postgres process {}", sys_id);
+        info!("launching WAL redo postgres process {}", timelineid);
         {
             let _guard = runtime.enter();
             process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
@@ -147,13 +148,13 @@ impl WalRedoProcess {
     // Tests who run pageserver binary are setting proper PG_BIN_DIR
     // and PG_LIB_DIR so that WalRedo would start right postgres. We may later
    // switch to setting same things in pageserver config file.
-    fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
+    fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
         // Create empty data directory for wal-redo postgres deleting old one.
-        fs::remove_dir_all(datadir.to_str().unwrap()).ok();
+        fs::remove_dir_all(datadir).ok();
         let initdb = runtime
             .block_on(
                 Command::new("initdb")
-                    .args(&["-D", datadir.to_str().unwrap()])
+                    .args(&["-D", datadir])
                     .arg("-N")
                     .output(),
             )
@@ -173,14 +174,11 @@ impl WalRedoProcess {
             .stdin(Stdio::piped())
             .stderr(Stdio::piped())
             .stdout(Stdio::piped())
-            .env("PGDATA", datadir.to_str().unwrap())
+            .env("PGDATA", datadir)
             .spawn()
             .expect("postgres --wal-redo command failed to start");
 
-        info!(
-            "launched WAL redo postgres process on {}",
-            datadir.to_str().unwrap()
-        );
+        info!("launched WAL redo postgres process on {}", datadir);
 
         let stdin = child.stdin.take().expect("failed to open child's stdin");
         let stderr = child.stderr.take().expect("failed to open child's stderr");
diff --git a/vendor/postgres b/vendor/postgres
index d143241a16..5eaf718d3f 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit d143241a1653d3825d94d645801c62c7755b1015
+Subproject commit 5eaf718d3f2fae700fb4902326a4c1d2cee87b51
diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml
index 98c63c434f..27498ee293 100644
--- a/walkeeper/Cargo.toml
+++ b/walkeeper/Cargo.toml
@@ -34,3 +34,6 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
 anyhow = "1.0"
 crc32c = "0.6.0"
+
+# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
+pageserver = { path = "../pageserver" }
diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs
index d50467ba49..00576f055e 100644
--- a/walkeeper/src/bin/wal_acceptor.rs
+++ b/walkeeper/src/bin/wal_acceptor.rs
@@ -13,6 +13,8 @@ use clap::{App, Arg};
 
 use slog::Drain;
 
+use pageserver::ZTimelineId;
+
 use walkeeper::wal_service;
 use walkeeper::WalAcceptorConf;
 
@@ -26,6 +28,12 @@ fn main() -> Result<(), io::Error> {
             .takes_value(true)
             .help("Path to the WAL acceptor data directory"),
         )
+        .arg(
+            Arg::with_name("timelineid")
+                .long("timelineid")
+                .takes_value(true)
+                .help("zenith timeline id"),
+        )
         .arg(
             Arg::with_name("listen")
                 .short("l")
@@ -58,6 +66,7 @@ fn main() -> Result<(), io::Error> {
 
     let mut conf = WalAcceptorConf {
         data_dir: PathBuf::from("./"),
+        timelineid: ZTimelineId::from([0u8; 16]),
         daemonize: false,
         no_sync: false,
         pageserver_addr: None,
@@ -68,6 +77,10 @@ fn main() -> Result<(), io::Error> {
         conf.data_dir = PathBuf::from(dir);
     }
 
+    if let Some(timelineid_str) = arg_matches.value_of("timelineid") {
+        conf.timelineid = ZTimelineId::from_str(timelineid_str).unwrap();
+    }
+
     if arg_matches.is_present("no-sync") {
         conf.no_sync = true;
    }
@@ -98,7 +111,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> {
         info!("daemonizing...");
 
         // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
-        // that we will see any accidental manual fpritf's or backtraces.
+        // that we will see any accidental manual fprintf's or backtraces.
         let stdout = OpenOptions::new()
             .create(true)
             .append(true)
diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs
index 7e890cf98a..5f2f557b49 100644
--- a/walkeeper/src/lib.rs
+++ b/walkeeper/src/lib.rs
@@ -6,9 +6,12 @@ mod pq_protocol;
 pub mod wal_service;
 pub mod xlog_utils;
 
+use pageserver::ZTimelineId;
+
 #[derive(Debug, Clone)]
 pub struct WalAcceptorConf {
     pub data_dir: PathBuf,
+    pub timelineid: ZTimelineId,
     pub daemonize: bool,
     pub no_sync: bool,
     pub listen_addr: SocketAddr,
diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs
index 6e17f41f06..1a8f764598 100644
--- a/walkeeper/src/wal_service.rs
+++ b/walkeeper/src/wal_service.rs
@@ -563,7 +563,8 @@ impl Connection {
             "no_user",
         );
         let callme = format!(
-            "callmemaybe host={} port={} replication=1 options='-c system.id={}'",
+            "callmemaybe {} host={} port={} replication=1 options='-c system.id={}'",
+            self.conf.timelineid,
             self.conf.listen_addr.ip(),
             self.conf.listen_addr.port(),
             self.system().get_info().server.system_id,
diff --git a/walkeeper/src/xlog_utils.rs b/walkeeper/src/xlog_utils.rs
index 51db9681a6..7c18131186 100644
--- a/walkeeper/src/xlog_utils.rs
+++ b/walkeeper/src/xlog_utils.rs
@@ -4,7 +4,7 @@ use log::*;
 use std::cmp::min;
 use std::fs::{self, File};
 use std::io::prelude::*;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 
 pub const XLOG_FNAME_LEN: usize = 24;
@@ -89,7 +89,7 @@ pub fn get_current_timestamp() -> TimestampTz {
 }
 
 fn find_end_of_wal_segment(
-    data_dir: &PathBuf,
+    data_dir: &Path,
     segno: XLogSegNo,
     tli: TimeLineID,
     wal_seg_size: usize,
@@ -185,7 +185,7 @@ fn find_end_of_wal_segment(
 }
 
 pub fn find_end_of_wal(
-    data_dir: &PathBuf,
+    data_dir: &Path,
     wal_seg_size: usize,
     precise: bool,
 ) -> (XLogRecPtr, TimeLineID) {
diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml
index 2d1f7c922c..035fcc9d94 100644
--- a/zenith/Cargo.toml
+++ b/zenith/Cargo.toml
@@ -8,4 +8,10 @@
 edition = "2018"
 
 [dependencies]
 clap = "2.33.0"
+anyhow = "1.0"
+
+# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
+pageserver = { path = "../pageserver" }
+walkeeper = { path = "../walkeeper" }
 control_plane = { path = "../control_plane" }
+postgres_ffi = { path = "../postgres_ffi" }
diff --git a/zenith/src/main.rs b/zenith/src/main.rs
index 5e34c655b9..de29f386a0 100644
--- a/zenith/src/main.rs
+++ b/zenith/src/main.rs
@@ -1,29 +1,66 @@
-use clap::{App, Arg, ArgMatches, SubCommand};
-use std::error;
+use std::fs;
+use std::path::{Path, PathBuf};
 use std::process::exit;
 
+use clap::{App, Arg, ArgMatches, SubCommand};
+use anyhow::Result;
+use anyhow::*;
+
 use control_plane::{compute::ComputeControlPlane, local_env, storage};
+use control_plane::local_env::LocalEnv;
+use control_plane::storage::PageServerNode;
 
-type Result<T> = std::result::Result<T, Box<dyn error::Error>>;
+use pageserver::ZTimelineId;
 
-fn main() {
+fn zenith_repo_dir() -> String {
+    // Find repository path
+    match std::env::var_os("ZENITH_REPO_DIR") {
+        Some(val) => String::from(val.to_str().unwrap()),
+        None => ".zenith".into(),
+    }
+}
+
+// Main entry point for the 'zenith' CLI utility
+//
+// This utility can be used to work with a local zenith repository.
+// In order to run queries in it, you need to launch the page server,
+// and a compute node against the page server
+fn main() -> Result<()> {
     let name_arg = Arg::with_name("NAME")
         .short("n")
         .index(1)
         .help("name of this postgres instance")
         .required(true);
     let matches = App::new("zenith")
-        .subcommand(SubCommand::with_name("init"))
-        .subcommand(SubCommand::with_name("start"))
-        .subcommand(SubCommand::with_name("stop"))
-        .subcommand(SubCommand::with_name("status"))
+        .about("Zenith CLI")
+        .subcommand(SubCommand::with_name("init")
+            .about("Initialize a new Zenith repository in current directory"))
+        .subcommand(SubCommand::with_name("branch")
+            .about("Create a new branch")
+            .arg(Arg::with_name("branchname")
+                .required(false)
+                .index(1))
+            .arg(Arg::with_name("start-point")
+                .required(false)
+                .index(2)))
+        .subcommand(
+            SubCommand::with_name("pageserver")
+                .about("Manage pageserver instance")
+                .subcommand(SubCommand::with_name("status"))
+                .subcommand(SubCommand::with_name("start"))
+                .subcommand(SubCommand::with_name("stop"))
+        )
         .subcommand(
             SubCommand::with_name("pg")
                 .about("Manage postgres instances")
                 .subcommand(
-                    SubCommand::with_name("create"), // .arg(name_arg.clone()
-                                                     // .required(false)
-                                                     // .help("name of this postgres instance (will be pgN if omitted)"))
+                    SubCommand::with_name("create")
+                        // .arg(name_arg.clone()
+                        //     .required(false)
+                        //     .help("name of this postgres instance (will be pgN if omitted)"))
+                        .arg(Arg::with_name("timeline")
+                            .required(false)
+                            .index(1))
                 )
                 .subcommand(SubCommand::with_name("list"))
                 .subcommand(SubCommand::with_name("start").arg(name_arg.clone()))
@@ -33,24 +70,24 @@ fn main() {
         .get_matches();
 
     // handle init separately and exit
-    if let Some("init") = matches.subcommand_name() {
-        match local_env::init() {
-            Ok(_) => {
-                println!("Initialization complete! You may start zenith with 'zenith start' now.");
-                exit(0);
-            }
-            Err(e) => {
-                eprintln!("Error during init: {}", e);
-                exit(1);
-            }
-        }
+    if let ("init", Some(sub_args)) = matches.subcommand() {
+        run_init_cmd(sub_args.clone())?;
+        exit(0);
     }
 
     // all other commands would need config
-    let env = match local_env::load_config() {
+
+    let repopath = PathBuf::from(zenith_repo_dir());
+    if !repopath.exists() {
+        bail!("Zenith repository does not exist in {}.\n\
+               Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'",
+              repopath.display());
+    }
+    // TODO: check that it looks like a zenith repository
+    let env = match local_env::load_config(&repopath) {
         Ok(conf) => conf,
         Err(e) => {
-            eprintln!("Error loading config from ~/.zenith: {}", e);
+            eprintln!("Error loading config from {}: {}", repopath.display(), e);
             exit(1);
         }
     };
@@ -60,6 +97,9 @@ fn main() {
             panic!() /* Should not happen. Init was handled before */
         }
 
+        ("branch", Some(sub_args)) => run_branch_cmd(&env, sub_args.clone())?,
+        ("pageserver", Some(sub_args)) => run_pageserver_cmd(&env, sub_args.clone())?,
+
         ("start", Some(_sub_m)) => {
             let pageserver = storage::PageServerNode::from_env(&env);
 
@@ -86,15 +126,53 @@ fn main() {
             }
         }
         _ => {}
-    }
+    };
+
+    Ok(())
+}
+
+fn run_pageserver_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
+    match args.subcommand() {
+        ("status", Some(_sub_m)) => {
+            todo!();
+        }
+        ("start", Some(_sub_m)) => {
+            let psnode = PageServerNode::from_env(local_env);
+            psnode.start()?;
+            println!("Page server started");
+        }
+        ("stop", Some(_sub_m)) => {
+            todo!();
+        }
+        _ => unreachable!(),
+    };
+
+    Ok(())
+}
+
+// Peek into the repository, to grab the timeline ID of given branch
+pub fn get_branch_timeline(repopath: &Path, branchname: &str) -> ZTimelineId {
+    let branchpath = repopath.join("refs/branches/".to_owned() + branchname);
+
+    ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
 }
 
 fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let mut cplane = ComputeControlPlane::load(env.clone())?;
 
     match pg_match.subcommand() {
-        ("create", Some(_sub_m)) => {
-            cplane.new_node()?;
+        ("create", Some(sub_m)) => {
+            // FIXME: cheat and resolve the timeline by peeking into the
+            // repository. In reality, when you're launching a compute node
+            // against a possibly-remote page server, we wouldn't know what
+            // branches exist in the remote repository. Or would we require
+            // that you "zenith fetch" them into a local repository first?
+            let timeline_arg = sub_m.value_of("timeline").unwrap_or("main");
+            let timeline = get_branch_timeline(&env.repo_path, timeline_arg);
+
+            println!("Initializing Postgres on timeline {}...", timeline);
+
+            cplane.new_node(timeline)?;
         }
         ("list", Some(_sub_m)) => {
             println!("NODE\tADDRESS\tSTATUS");
@@ -107,7 +185,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
             let node = cplane
                 .nodes
                 .get(name)
-                .ok_or(format!("postgres {} is not found", name))?;
+                .ok_or(anyhow!("postgres {} is not found", name))?;
 
             node.start()?;
         }
@@ -115,7 +193,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
             let node = cplane
                 .nodes
                 .get(name)
-                .ok_or(format!("postgres {} is not found", name))?;
+                .ok_or(anyhow!("postgres {} is not found", name))?;
 
             node.stop()?;
         }
@@ -124,3 +202,128 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
 
     Ok(())
 }
+
+
+// "zenith init" - Initialize a new Zenith repository in current dir
+fn run_init_cmd(_args: ArgMatches) -> Result<()> {
+    local_env::init()?;
+    Ok(())
+}
+
+// handle "zenith branch" subcommand
+fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
+    let repopath = local_env.repo_path.to_str().unwrap();
+
+    if let Some(branchname) = args.value_of("branchname") {
+        if PathBuf::from(format!("{}/refs/branches/{}", repopath, branchname)).exists() {
+            anyhow::bail!("branch {} already exists", branchname);
+        }
+
+        if let Some(startpoint_str) = args.value_of("start-point") {
+
+            let mut startpoint = parse_point_in_time(startpoint_str)?;
+
+            if startpoint.lsn == 0 {
+                // Find end of WAL on the old timeline
+                let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?;
+
+                println!("branching at end of WAL: {:X}/{:X}", end_of_wal >> 32, end_of_wal & 0xffffffff);
+
+                startpoint.lsn = end_of_wal;
+            }
+
+            return local_env::create_branch(local_env, branchname, startpoint);
+
+        } else {
+            panic!("Missing start-point");
+        }
+    } else {
+        // No arguments, list branches
+        list_branches();
+    }
+    Ok(())
+}
+
+fn list_branches() {
+    // list branches
+    let paths = fs::read_dir(zenith_repo_dir() + "/refs/branches").unwrap();
+
+    for path in paths {
+        let filename = path.unwrap().file_name().to_str().unwrap().to_owned();
+        println!(" {}", filename);
+    }
+}
+
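+// A branch ref is simply a file under refs/branches/ whose content is the
+// timeline ID the branch points at. For example (illustrative contents):
+//
+//     $ cat .zenith/refs/branches/main
+//     bc62e7d612d0e6fe8f99a6dd2f281f9d
+//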
+//
+// Parse user-given string that represents a point-in-time.
+//
+// We support multiple variants:
+//
+// Raw timeline id in hex, meaning the end of that timeline:
+//    bc62e7d612d0e6fe8f99a6dd2f281f9d
+//
+// A specific LSN on a timeline:
+//    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
+//
+// Same, with a human-friendly branch name:
+//    main
+//    main@2/15D3DD8
+//
+// Human-friendly tag name:
+//    mytag
+//
+//
+fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
+
+    let mut strings = s.split("@");
+    let name = strings.next().unwrap();
+
+    let lsn: Option<u64>;
+    if let Some(lsnstr) = strings.next() {
+        let mut s = lsnstr.split("/");
+        // The LSN is given in hex, e.g. "2/15D3DD8"
+        let lsn_hi = u64::from_str_radix(s.next().unwrap(), 16)?;
+        let lsn_lo = u64::from_str_radix(s.next().unwrap(), 16)?;
+        lsn = Some(lsn_hi << 32 | lsn_lo);
+    }
+    else {
+        lsn = None
+    }
+
+    // Check if it's a tag
+    if lsn.is_none() {
+        let tagpath: PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/tags/" + name);
+        if tagpath.exists() {
+            let pointstr = fs::read_to_string(tagpath)?;
+
+            return parse_point_in_time(&pointstr);
+        }
+    }
+
+    // Check if it's a branch
+    // Check if it's branch @ LSN
+    let branchpath: PathBuf = PathBuf::from(zenith_repo_dir() + "/refs/branches/" + name);
+    if branchpath.exists() {
+        let pointstr = fs::read_to_string(branchpath)?;
+
+        let mut result = parse_point_in_time(&pointstr)?;
+        if lsn.is_some() {
+            result.lsn = lsn.unwrap();
+        } else {
+            result.lsn = 0;
+        }
+        return Ok(result);
+    }
+
+    // Check if it's a timelineid
+    // Check if it's timelineid @ LSN
+    let tlipath: PathBuf = PathBuf::from(zenith_repo_dir() + "/timelines/" + name);
+    if tlipath.exists() {
+        let result = local_env::PointInTime {
+            timelineid: ZTimelineId::from_str(name)?,
+            lsn: lsn.unwrap_or(0)
+        };
+
+        return Ok(result);
+    }
+
+    panic!("could not parse point-in-time {}", s);
+}