diff --git a/Cargo.lock b/Cargo.lock index be86750a77..a16bd155c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -91,19 +91,19 @@ dependencies = [ [[package]] name = "async-io" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9315f8f07556761c3e48fec2e6b276004acf426e6dc068b2c2251854d65ee0fd" +checksum = "fcb9af4888a70ad78ecb5efcb0ba95d66a3cf54a88b62ae81559954c7588c7a2" dependencies = [ "concurrent-queue", "fastrand", "futures-lite", "libc", "log", - "nb-connect", "once_cell", "parking", "polling", + "socket2", "vec-arena", "waker-fn", "winapi", @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "async-lock" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1996609732bde4a9988bc42125f55f2af5f3c36370e27c778d5191a4a1b63bfb" +checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b" dependencies = [ "event-listener", ] @@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0" [[package]] name = "async-trait" -version = "0.1.48" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea56748e10732c49404c153638a15ec3d6211ec5ff35d9bb20e13b93576adf" +checksum = "0b98e84bbb4cbcdd97da190ba0c58a1bb0de2c1fdf67d159e192ed766aeca722" dependencies = [ "proc-macro2", "quote", @@ -250,14 +250,18 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", + "clap", + "env_logger", "lazy_static", "lazycell", + "log", "peeking_take_while", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", + "which", ] [[package]] @@ -424,15 +428,22 @@ checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" name = "control_plane" version = "0.1.0" dependencies = [ - "home", + "anyhow", + "bytes", + "fs_extra", + "hex", "lazy_static", + "pageserver", "postgres", + "postgres_ffi", "rand 0.8.3", "regex", "serde", "serde_derive", + "tar", 
"tokio-postgres", "toml", + "walkeeper", ] [[package]] @@ -468,9 +479,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -585,6 +596,19 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "env_logger" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + [[package]] name = "event-listener" version = "2.5.1" @@ -606,6 +630,18 @@ dependencies = [ "instant", ] +[[package]] +name = "filetime" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall 0.2.6", + "winapi", +] + [[package]] name = "fnv" version = "1.0.7" @@ -648,10 +684,16 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.13" +name = "fs_extra" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" dependencies = [ "futures-channel", "futures-core", @@ -664,9 +706,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" +checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" dependencies = [ "futures-core", "futures-sink", @@ -674,15 +716,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" +checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" [[package]] name = "futures-executor" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" +checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" dependencies = [ "futures-core", "futures-task", @@ -691,9 +733,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" +checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" [[package]] name = "futures-lite" @@ -712,9 +754,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" +checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -724,21 +766,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" +checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" [[package]] name = 
"futures-task" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" +checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" [[package]] name = "futures-util" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" +checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" dependencies = [ "futures-channel", "futures-core", @@ -858,20 +900,11 @@ dependencies = [ "digest", ] -[[package]] -name = "home" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2456aef2e6b6a9784192ae780c0f15bc57df0e918585282325e8c8ac27737654" -dependencies = [ - "winapi", -] - [[package]] name = "http" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" dependencies = [ "bytes", "fnv", @@ -891,9 +924,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.3.5" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" +checksum = "4a1ce40d6fc9764887c2fdc7305c3dcc429ba11ff981c1509416afd5697e4437" [[package]] name = "httpdate" @@ -901,6 +934,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version 
= "0.14.5" @@ -940,9 +979,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89829a5d69c23d348314a7ac337fe39173b61149a9864deabd260983aed48c21" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" dependencies = [ "matches", "unicode-bidi", @@ -974,9 +1013,11 @@ version = "0.1.0" dependencies = [ "control_plane", "lazy_static", + "pageserver", "postgres", "rand 0.8.3", "tokio-postgres", + "walkeeper", ] [[package]] @@ -993,9 +1034,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "jobserver" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" +checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd" dependencies = [ "libc", ] @@ -1032,9 +1073,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56d855069fafbb9b344c0f962150cd2c1187975cb1c22c1522c240d8c4986714" +checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" [[package]] name = "libloading" @@ -1049,7 +1090,7 @@ dependencies = [ [[package]] name = "librocksdb-sys" version = "6.17.3" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#0b700fe70da8ee30483fde79f44df549f8fe11ec" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#7dd6258b07861b9332f827b416e50e5aee69aea1" dependencies = [ "bindgen", "cc", @@ -1162,16 +1203,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "nb-connect" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a19900e7eee95eb2b3c2e26d12a874cc80aaf750e31be6fcbe743ead369fa45d" 
-dependencies = [ - "libc", - "socket2", -] - [[package]] name = "nom" version = "5.1.2" @@ -1294,12 +1325,15 @@ dependencies = [ "crossbeam-channel", "daemonize", "fs2", + "fs_extra", "futures", + "hex", "lazy_static", "log", "postgres", "postgres-protocol", "postgres-types", + "postgres_ffi", "rand 0.8.3", "regex", "rocksdb", @@ -1309,6 +1343,7 @@ dependencies = [ "slog-scope", "slog-stdlog", "slog-term", + "tar", "termion", "thiserror", "tokio", @@ -1344,7 +1379,7 @@ dependencies = [ "cfg-if 1.0.0", "instant", "libc", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "smallvec", "winapi", ] @@ -1381,18 +1416,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc174859768806e91ae575187ada95c91a29e96a98dc5d2cd9a1fed039501ba6" +checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a490329918e856ed1b083f244e3bfe2d8c4f336407e4ea9e1a9f479ff09049e5" +checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f" dependencies = [ "proc-macro2", "quote", @@ -1470,6 +1505,20 @@ dependencies = [ "postgres-protocol", ] +[[package]] +name = "postgres_ffi" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "byteorder", + "bytes", + "chrono", + "crc32c", + "hex", + "rand 0.8.3", +] + [[package]] name = "ppv-lite86" version = "0.2.10" @@ -1595,9 +1644,9 @@ checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = 
"8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" dependencies = [ "bitflags", ] @@ -1608,7 +1657,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8440d8acb4fd3d277125b4bd01a6f38aee8d814b3b5fc09b3f2b825d37d3fe8f" dependencies = [ - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1629,7 +1678,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ "getrandom 0.2.2", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", ] [[package]] @@ -1660,9 +1709,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" dependencies = [ "base64", "bytes", @@ -1695,8 +1744,8 @@ dependencies = [ [[package]] name = "rocksdb" -version = "0.15.0" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#0b700fe70da8ee30483fde79f44df549f8fe11ec" +version = "0.16.0" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#7dd6258b07861b9332f827b416e50e5aee69aea1" dependencies = [ "libc", "librocksdb-sys", @@ -1939,9 +1988,9 @@ checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27" [[package]] name = "slab" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" [[package]] name = "slog" @@ -2036,9 +2085,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3ce15dd3ed8aa2f8eeac4716d6ef5ab58b6b9256db41d7e1a0224c2788e8fd87" +checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb" dependencies = [ "proc-macro2", "quote", @@ -2051,6 +2100,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" +[[package]] +name = "tar" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0bcfbd6a598361fda270d82469fff3d65089dc33e175c9a131f7b4cd395f228" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.2.0" @@ -2060,7 +2120,7 @@ dependencies = [ "cfg-if 1.0.0", "libc", "rand 0.8.3", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "remove_dir_all", "winapi", ] @@ -2076,6 +2136,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + [[package]] name = "termion" version = "1.5.6" @@ -2084,7 +2153,7 @@ checksum = "077185e2eac69c3f8379a4298e1e07cd36beb962290d4a51199acf0fdc10607e" dependencies = [ "libc", "numtoa", - "redox_syscall 0.2.5", + "redox_syscall 0.2.6", "redox_termios", ] @@ -2154,9 +2223,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134af885d758d645f0f0505c9a8b3f9bf8a348fd822e112ab5248138348f1722" +checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" dependencies = [ "autocfg", "bytes", @@ -2228,9 +2297,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f" +checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" dependencies = [ "bytes", "futures-core", @@ -2302,9 +2371,9 @@ checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" [[package]] name = "unicode-bidi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" dependencies = [ "matches", ] @@ -2359,9 +2428,9 @@ dependencies = [ [[package]] name = "vcpkg" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b00bca6106a5e23f3eee943593759b7fcddb00554332e856d990c893966879fb" +checksum = "cbdbff6266a24120518560b5dc983096efb98462e51d0d68169895b237be3e5d" [[package]] name = "vec-arena" @@ -2414,6 +2483,7 @@ dependencies = [ "futures", "lazy_static", "log", + "pageserver", "postgres", "postgres-protocol", "rand 0.8.3", @@ -2540,6 +2610,15 @@ dependencies = [ "cc", ] +[[package]] +name = "which" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +dependencies = [ + "libc", +] + [[package]] name = "wildmatch" version = "1.1.0" @@ -2586,6 +2665,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "xattr" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +dependencies = [ + "libc", +] + [[package]] name = "xml-rs" version = "0.8.3" @@ -2596,6 +2684,14 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" name = "zenith" version = "0.1.0" dependencies = [ + "anyhow", "clap", "control_plane", + "pageserver", + "postgres_ffi", + "walkeeper", ] + 
+[[package]] +name = "zenith_utils" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index f4d6314283..d242faaaee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,6 @@ members = [ "walkeeper", "zenith", "control_plane", + "postgres_ffi", + "zenith_utils", ] diff --git a/README.md b/README.md index b7c745bcb8..2836a71604 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ cargo build >./target/debug/zenith init # start pageserver -> ./target/debug/zenith start +> ./target/debug/zenith pageserver start Starting pageserver at '127.0.0.1:64000' # create and configure postgres data dir diff --git a/cli-v2-story.md b/cli-v2-story.md new file mode 100644 index 0000000000..1f213c903b --- /dev/null +++ b/cli-v2-story.md @@ -0,0 +1,188 @@ +Create a new Zenith repository in the current directory: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init + The files belonging to this database system will be owned by user "heikki". + This user must also own the server process. + + The database cluster will be initialized with locale "en_GB.UTF-8". + The default database encoding has accordingly been set to "UTF8". + The default text search configuration will be set to "english". + + Data page checksums are disabled. + + creating directory tmp ... ok + creating subdirectories ... ok + selecting dynamic shared memory implementation ... posix + selecting default max_connections ... 100 + selecting default shared_buffers ... 128MB + selecting default time zone ... Europe/Helsinki + creating configuration files ... ok + running bootstrap script ... ok + performing post-bootstrap initialization ... ok + syncing data to disk ... ok + + initdb: warning: enabling "trust" authentication for local connections + You can change this by editing pg_hba.conf or using the option -A, or + --auth-local and --auth-host, the next time you run initdb. 
+ new zenith repository was created in .zenith + +Initially, there is only one branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + main + +Start a local Postgres instance on the branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432 + 2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432 + 2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432" + 2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status" + 2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0 + 2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required + 2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections + done + server started + +Run some commands against it: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);" + CREATE TABLE + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + +Create a new branch called 'experimental'. We create it from the +current end of the 'main' branch, but you could specify a different +LSN as the start point instead. 
+ + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main + branching at end of WAL: 0/161F478 + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch + experimental + main + +Start another Postgres instance off the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... + waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0 + 2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections + done + server started + +Insert some a row on the 'experimental' branch: + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')" + INSERT 0 1 + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + 
inserted on experimental + (2 rows) + +See that the other Postgres instance is still running on 'main' branch on port 5432: + + + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + (1 row) + + + + +Everything is stored in the .zenith directory: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/ + total 12 + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines + +The 'datadirs' directory contains the datadirs of the running instances: + + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/ + total 8 + drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e + drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76 + ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/ + total 124 + drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem + -rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf + -rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical + drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase + -rw------- 1 heikki heikki 3 Apr 13 
09:27 PG_VERSION + lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal + drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact + -rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf + -rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf + -rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts + -rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid + +Note how 'pg_wal' is just a symlink to the 'timelines' directory. The +datadir is ephemeral, you can delete it at any time, and it can be reconstructed +from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull +the repository, the 'datadirs' are not included. (They are like git working trees) + + ~/git-sandbox/zenith (cli-v2)$ killall -9 postgres + ~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/* + ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433 + Creating data directory from snapshot at 0/15FFB08... 
+ waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433 + 2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433 + 2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433" + 2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST + 2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress + 2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80 + 2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0 + 2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s + 2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections + done + server started + ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" + t + ----------------------------- + inserted on the main branch + inserted on experimental + (2 rows) + diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7281595c18..0d49488bd7 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,12 +8,20 @@ edition = "2018" [dependencies] rand = "0.8.3" +tar = "0.4.33" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } serde = "" serde_derive = "" toml = "" -home = "0.5.3" lazy_static = "" regex = "1" +anyhow = "1.0" +hex = "0.4.3" +bytes = "1.0.1" +fs_extra = "1.2.0" + +pageserver = { path = "../pageserver" } +walkeeper = { path = 
"../walkeeper" } +postgres_ffi = { path = "../postgres_ffi" } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index bfe38ef528..c336792b83 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,22 +1,24 @@ -use std::error; -use std::fs::File; use std::fs::{self, OpenOptions}; +use std::io::{Read, Write}; +use std::net::SocketAddr; use std::net::TcpStream; -use std::process::{Command, Stdio}; +use std::os::unix::fs::PermissionsExt; +use std::process::Command; use std::sync::Arc; use std::time::Duration; use std::{collections::BTreeMap, path::PathBuf}; -use std::{io::Write, net::SocketAddr}; use std::path::Path; +use anyhow::{Context, Result}; use lazy_static::lazy_static; -use postgres::{Client, NoTls}; use regex::Regex; +use tar; -use crate::local_env::{self, LocalEnv}; +use postgres::{Client, NoTls}; + +use crate::local_env::LocalEnv; use crate::storage::{PageServerNode, WalProposerNode}; - -type Result = std::result::Result>; +use pageserver::ZTimelineId; // // ComputeControlPlane @@ -35,14 +37,9 @@ impl ComputeControlPlane { // it is running on default port. Change that when pageserver will have config. let pageserver = Arc::new(PageServerNode::from_env(&env)); - let nodes: Result> = fs::read_dir(env.compute_dir()) - .map_err(|e| { - format!( - "failed to list {}: {}", - env.compute_dir().to_str().unwrap(), - e - ) - })? + let pgdatadirspath = env.repo_path.join("pgdatadirs"); + let nodes: Result> = fs::read_dir(&pgdatadirspath) + .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? 
.into_iter() .map(|f| { PostgresNode::from_dir_entry(f?, &env, &pageserver) @@ -68,43 +65,50 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - pub fn local(pageserver: &Arc) -> ComputeControlPlane { - let env = local_env::test_env(); + pub fn local(local_env: &LocalEnv, pageserver: &Arc) -> ComputeControlPlane { ComputeControlPlane { base_port: 65431, pageserver: Arc::clone(pageserver), nodes: BTreeMap::new(), - env, + env: local_env.clone(), } } - fn new_vanilla_node(&mut self, is_test: bool) -> Result> { - // allocate new node entry with generated port + /// Connect to a page server, get base backup, and untar it to initialize a + /// new data directory + pub fn new_from_page_server( + &mut self, + is_test: bool, + timelineid: ZTimelineId, + ) -> Result> { let node_id = self.nodes.len() as u32 + 1; + let node = Arc::new(PostgresNode { name: format!("pg{}", node_id), address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test, + timelineid, }); - node.init_vanilla()?; + + node.init_from_page_server()?; self.nodes.insert(node.name.clone(), Arc::clone(&node)); Ok(node) } - pub fn new_test_node(&mut self) -> Arc { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(true).unwrap(); + pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid); + assert!(node.is_ok()); + let node = node.unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -112,9 +116,9 @@ impl ComputeControlPlane { node } - pub fn new_test_master_node(&mut self) -> Arc { - let node = self.new_vanilla_node(true).unwrap(); - 
println!("Create vanilla node at {:?}", node.address); + pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc { + let node = self.new_from_page_server(true, timelineid).unwrap(); + node.append_conf( "postgresql.conf", "synchronous_standby_names = 'safekeeper_proxy'\n", @@ -123,17 +127,15 @@ impl ComputeControlPlane { node } - pub fn new_node(&mut self) -> Result> { - let addr = self.pageserver.address().clone(); - let node = self.new_vanilla_node(false)?; + pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result> { + let node = self.new_from_page_server(false, timelineid).unwrap(); - // Configure that node to take pages from pageserver + // Configure the node to stream WAL directly to the pageserver node.append_conf( "postgresql.conf", format!( - "page_server_connstring = 'host={} port={}'\n", - addr.ip(), - addr.port() + "callmemaybe_connstring = '{}'\n", // FIXME escaping + node.connstr() ) .as_str(), ); @@ -150,6 +152,7 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, + timelineid: ZTimelineId, } impl PostgresNode { @@ -159,11 +162,10 @@ impl PostgresNode { pageserver: &Arc, ) -> Result { if !entry.file_type()?.is_dir() { - let err_msg = format!( + anyhow::bail!( "PostgresNode::from_dir_entry failed: '{}' is not a directory", - entry.path().to_str().unwrap() + entry.path().display() ); - return Err(err_msg.into()); } lazy_static! { @@ -176,11 +178,10 @@ impl PostgresNode { // find out tcp port in config file let cfg_path = entry.path().join("postgresql.conf"); - let config = fs::read_to_string(cfg_path.clone()).map_err(|e| { + let config = fs::read_to_string(cfg_path.clone()).with_context(|| { format!( - "failed to read config file in {}: {}", - cfg_path.to_str().unwrap(), - e + "failed to read config file in {}", + cfg_path.to_str().unwrap() ) })?; @@ -190,14 +191,20 @@ impl PostgresNode { ); let port: u16 = CONF_PORT_RE .captures(config.as_str()) - .ok_or(err_msg.clone() + " 1")? 
+ .ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))? .iter() .last() - .ok_or(err_msg.clone() + " 3")? - .ok_or(err_msg.clone() + " 3")? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))? + .ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))? .as_str() .parse() - .map_err(|e| format!("{}: {}", err_msg, e))?; + .with_context(|| err_msg)?; + + // FIXME: What timeline is this server on? Would have to parse the postgresql.conf + // file for that, too. It's currently not needed for anything, but it would be + // nice to list the timeline in "zenith pg list" + let timelineid_buf = [0u8; 16]; + let timelineid = ZTimelineId::from(timelineid_buf); // ok now Ok(PostgresNode { @@ -206,65 +213,107 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, + timelineid, }) } - fn init_vanilla(&self) -> Result<()> { + // Connect to a page server, get base backup, and untar it to initialize a + // new data directory + pub fn init_from_page_server(&self) -> Result<()> { + let pgdata = self.pgdata(); + println!( - "Creating new postgres: path={} port={}", - self.pgdata().to_str().unwrap(), + "Extracting base backup to create postgres instance: path={} port={}", + pgdata.display(), self.address.port() ); // initialize data directory - if self.is_test { - fs::remove_dir_all(self.pgdata().to_str().unwrap()).ok(); + fs::remove_dir_all(&pgdata).ok(); } - fs::create_dir_all(self.pgdata().to_str().unwrap())?; + let sql = format!("basebackup {}", self.timelineid); + let mut client = self + .pageserver + .page_server_psql_client() + .with_context(|| "connecting to page server failed")?; - let initdb_path = self.env.pg_bin_dir().join("initdb"); - let initdb = Command::new(initdb_path) - .args(&["-D", self.pgdata().to_str().unwrap()]) - .arg("-N") - .arg("-A trust") - .arg("--no-instructions") - .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) - .status()?; + fs::create_dir_all(&pgdata) 
.with_context(|| format!("could not create data directory {}", pgdata.display()))?; + fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context( + || { + format!( + "could not set permissions in data directory {}", + pgdata.display() + ) + }, + )?; - if !initdb.success() { - return Err("initdb failed".into()); - } + // FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive. + // But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that + // we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here. + //fs::create_dir_all(pgdata.join("pg_wal"))?; + + let mut copyreader = client + .copy_out(sql.as_str()) + .with_context(|| "page server 'basebackup' command failed")?; + + // FIXME: Currently, we slurp the whole tarball into memory, and then extract it, + // but we really should do this: + //let mut ar = tar::Archive::new(copyreader); + let mut buf = vec![]; + copyreader + .read_to_end(&mut buf) + .with_context(|| "reading base backup from page server failed")?; + let mut ar = tar::Archive::new(buf.as_slice()); + ar.unpack(&pgdata) + .with_context(|| "extracting page backup failed")?; // listen for selected port self.append_conf( "postgresql.conf", - format!( + &format!( "max_wal_senders = 10\n\ - max_replication_slots = 10\n\ - hot_standby = on\n\ - shared_buffers = 1MB\n\ - fsync = off\n\ - max_connections = 100\n\ - wal_sender_timeout = 0\n\ - wal_level = replica\n\ - listen_addresses = '{address}'\n\ - port = {port}\n", + max_replication_slots = 10\n\ + hot_standby = on\n\ + shared_buffers = 1MB\n\ + fsync = off\n\ + max_connections = 100\n\ + wal_sender_timeout = 0\n\ + wal_level = replica\n\ + listen_addresses = '{address}'\n\ + port = {port}\n", address = self.address.ip(), port = self.address.port() - ) - .as_str(), + ), + ); + + // Never clean up old WAL. 
TODO: We should use a replication + // slot or something proper, to prevent the compute node + // from removing WAL that hasn't been streamed to the safekeeper or + // page server yet. But this will do for now. + self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n")); + + // Connect it to the page server. + + // Configure that node to take pages from pageserver + self.append_conf( + "postgresql.conf", + &format!( + "page_server_connstring = 'host={} port={}'\n\ + zenith_timeline='{}'\n", + self.pageserver.address().ip(), + self.pageserver.address().port(), + self.timelineid + ), ); - println!("Database initialized"); Ok(()) } - pub fn pgdata(&self) -> PathBuf { - self.env.compute_dir().join(self.name.clone()) + fn pgdata(&self) -> PathBuf { + self.env.repo_path.join("pgdatadirs").join(&self.name) } pub fn status(&self) -> &str { @@ -291,6 +340,7 @@ impl PostgresNode { fn pg_ctl(&self, args: &[&str]) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl = Command::new(pg_ctl_path) .args( [ @@ -306,19 +356,15 @@ impl PostgresNode { ) .env_clear() .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status()?; - + .status() + .with_context(|| "pg_ctl failed")?; if !pg_ctl.success() { - Err("pg_ctl failed".into()) - } else { - Ok(()) + anyhow::bail!("pg_ctl failed"); } + Ok(()) } pub fn start(&self) -> Result<()> { - let _res = self - .pageserver - .page_server_psql(format!("callmemaybe {}", self.connstr()).as_str()); println!("Starting postgres node at '{}'", self.connstr()); self.pg_ctl(&["start"]) } @@ -378,39 +424,21 @@ impl PostgresNode { Client::connect(connstring.as_str(), NoTls).unwrap() } - /* Create stub controlfile and respective xlog to start computenode */ - pub fn setup_controlfile(&self) { - let filepath = format!("{}/global/pg_control", self.pgdata().to_str().unwrap()); - - { - File::create(filepath).unwrap(); - } - - let pg_resetwal_path = self.env.pg_bin_dir().join("pg_resetwal"); - - let
pg_resetwal = Command::new(pg_resetwal_path) - .args(&["-D", self.pgdata().to_str().unwrap()]) - .arg("-f") - // TODO probably we will have to modify pg_resetwal - // .arg("--compute-node") - .status() - .expect("failed to execute pg_resetwal"); - - if !pg_resetwal.success() { - panic!("pg_resetwal failed"); - } - } - - pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode { + pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode { let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy"); match Command::new(proxy_path.as_path()) - .args(&["-s", &wal_acceptors]) + .args(&["--ztimelineid", &self.timelineid.to_string()]) + .args(&["-s", wal_acceptors]) .args(&["-h", &self.address.ip().to_string()]) .args(&["-p", &self.address.port().to_string()]) .arg("-v") - .stderr(OpenOptions::new() - .append(true) - .open(self.env.data_dir.join("safepkeeper_proxy.log")).unwrap()) + .stderr( + OpenOptions::new() + .create(true) + .append(true) + .open(self.pgdata().join("safekeeper_proxy.log")) + .unwrap(), + ) .spawn() { Ok(child) => WalProposerNode { pid: child.id() }, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 241fba2f62..adf5d6164c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,14 +4,19 @@ // Now it also provides init method which acts like a stub for proper installation // script which will use local paths. 
// +use anyhow::Context; +use bytes::Bytes; +use rand::Rng; use std::env; -use std::error; use std::fs; use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use anyhow::Result; use serde_derive::{Deserialize, Serialize}; -type Result = std::result::Result>; +use pageserver::ZTimelineId; +use walkeeper::xlog_utils; // // This data structure represents deserialized zenith config, which should be @@ -21,11 +26,14 @@ type Result = std::result::Result>; // #[derive(Serialize, Deserialize, Clone)] pub struct LocalEnv { - // Here page server and compute nodes will create and store their data. - pub data_dir: PathBuf, + // Path to the Repository. Here page server and compute nodes will create and store their data. + pub repo_path: PathBuf, - // Path to postgres distribution. It expected that "bin", "include", - // "lib", "share" from postgres distribution will be there. If at some point + // System identifier, from the PostgreSQL control file + pub systemid: u64, + + // Path to postgres distribution. It's expected that "bin", "include", + // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. 
pub pg_distrib_dir: PathBuf, @@ -42,53 +50,37 @@ impl LocalEnv { pub fn pg_lib_dir(&self) -> PathBuf { self.pg_distrib_dir.join("lib") } +} - // pageserver - pub fn pageserver_data_dir(&self) -> PathBuf { - self.data_dir.join("pageserver") - } - pub fn pageserver_log(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.log") - } - pub fn pageserver_pidfile(&self) -> PathBuf { - self.pageserver_data_dir().join("pageserver.pid") - } - - // compute nodes - pub fn compute_dir(&self) -> PathBuf { - self.data_dir.join("compute") +fn zenith_repo_dir() -> PathBuf { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => PathBuf::from(val.to_str().unwrap()), + None => ".zenith".into(), } } // -// Issues in rust-lang repo has several discussions about proper library to check -// home directory in a cross-platform way. Seems that current consensus is around -// home crate and cargo uses it. +// Initialize a new Zenith repository // -fn get_home() -> Result { - home::home_dir().ok_or("can not determine home directory path".into()) -} - pub fn init() -> Result<()> { - let home_dir = get_home()?; - // check if config already exists - let cfg_path = home_dir.join(".zenith"); - if cfg_path.exists() { - let err_msg = format!( + let repo_path = zenith_repo_dir(); + if repo_path.exists() { + anyhow::bail!( "{} already exists. Perhaps already initialized?", - cfg_path.to_str().unwrap() + repo_path.to_str().unwrap() ); - return Err(err_msg.into()); } // Now we can run init only from crate directory, so check that current dir is our crate. // Use 'pageserver/Cargo.toml' existence as evidendce. let cargo_path = env::current_dir()?; if !cargo_path.join("pageserver/Cargo.toml").exists() { - let err_msg = "Current dirrectory does not look like a zenith repo. \ - Please, run 'init' from zenith repo root."; - return Err(err_msg.into()); + anyhow::bail!( + "Current dirrectory does not look like a zenith repo. 
\ + Please, run 'init' from zenith repo root." + ); } // ok, now check that expected binaries are present @@ -97,81 +89,172 @@ pub fn init() -> Result<()> { let pg_distrib_dir = cargo_path.join("tmp_install"); let pg_path = pg_distrib_dir.join("bin/postgres"); if !pg_path.exists() { - let err_msg = format!( + anyhow::bail!( "Can't find postres binary at {}. \ - Perhaps './pgbuild.sh' is needed to build it first.", + Perhaps './pgbuild.sh' is needed to build it first.", pg_path.to_str().unwrap() ); - return Err(err_msg.into()); } // check pageserver let zenith_distrib_dir = cargo_path.join("target/debug/"); let pageserver_path = zenith_distrib_dir.join("pageserver"); if !pageserver_path.exists() { - let err_msg = format!( + anyhow::bail!( "Can't find pageserver binary at {}. Please build it.", pageserver_path.to_str().unwrap() ); - return Err(err_msg.into()); } // ok, we are good to go - - // create dirs - let data_dir = cargo_path.join("tmp_check_cli"); - - for &dir in &["compute", "pageserver"] { - fs::create_dir_all(data_dir.join(dir)).map_err(|e| { - format!( - "Failed to create directory in '{}': {}", - data_dir.to_str().unwrap(), - e - ) - })?; - } - - // write config - let conf = LocalEnv { - data_dir, + let mut conf = LocalEnv { + repo_path: repo_path.clone(), pg_distrib_dir, zenith_distrib_dir, + systemid: 0, }; - let toml = toml::to_string(&conf)?; - fs::write(cfg_path, toml)?; + init_repo(&mut conf)?; + + Ok(()) +} + +pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> { + let repopath = &local_env.repo_path; + fs::create_dir(&repopath) + .with_context(|| format!("could not create directory {}", repopath.display()))?; + fs::create_dir(repopath.join("pgdatadirs"))?; + fs::create_dir(repopath.join("timelines"))?; + fs::create_dir(repopath.join("refs"))?; + fs::create_dir(repopath.join("refs").join("branches"))?; + fs::create_dir(repopath.join("refs").join("tags"))?; + println!("created directory structure in {}", repopath.display()); + + // Create 
initial timeline + let tli = create_timeline(&local_env, None)?; + let timelinedir = repopath.join("timelines").join(tli.to_string()); + println!("created initial timeline {}", timelinedir.display()); + + // Run initdb + // + // FIXME: we create it temporarily in "tmp" directory, and move it into + // the repository. Use "tempdir()" or something? Or just create it directly + // in the repo? + let initdb_path = local_env.pg_bin_dir().join("initdb"); + let _initdb = Command::new(initdb_path) + .args(&["-D", "tmp"]) + .arg("--no-instructions") + .env_clear() + .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) + .stdout(Stdio::null()) + .status() + .with_context(|| "failed to execute initdb")?; + println!("initdb succeeded"); + + // Read control file to extract the LSN and system id + let controlfile = + postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?; + let systemid = controlfile.system_identifier; + let lsn = controlfile.checkPoint; + let lsnstr = format!("{:016X}", lsn); + + // Move the initial WAL file + fs::rename( + "tmp/pg_wal/000000010000000000000001", + timelinedir + .join("wal") + .join("000000010000000000000001.partial"), + )?; + println!("moved initial WAL file"); + + // Remove pg_wal + fs::remove_dir_all("tmp/pg_wal")?; + println!("removed tmp/pg_wal"); + + force_crash_recovery(&PathBuf::from("tmp"))?; + println!("updated pg_control"); + + let target = timelinedir.join("snapshots").join(&lsnstr); + fs::rename("tmp", &target)?; + println!("moved 'tmp' to {}", target.display()); + + // Create 'main' branch to refer to the initial timeline + let data = tli.to_string(); + fs::write(repopath.join("refs").join("branches").join("main"), data)?; + println!("created main branch"); + + // Also update the system id in the LocalEnv + local_env.systemid = systemid; + + // write config + let toml = toml::to_string(&local_env)?; + fs::write(repopath.join("config"), toml)?; + + println!( + "new zenith repository was 
created in {}", + repopath.display() + ); + + Ok(()) +} + +// If control file says the cluster was shut down cleanly, modify it, to mark +// it as crashed. That forces crash recovery when you start the cluster. +// +// FIXME: +// We currently do this to the initial snapshot in "zenith init". It would +// be more natural to do this when the snapshot is restored instead, but we +// currently don't have any code to create new snapshots, so it doesn't matter +// Or better yet, use a less hacky way of putting the cluster into recovery. +// Perhaps create a backup label file in the data directory when it's restored. +fn force_crash_recovery(datadir: &Path) -> Result<()> { + // Read in the control file + let controlfilepath = datadir.to_path_buf().join("global").join("pg_control"); + let mut controlfile = + postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?; + + controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION; + + fs::write( + controlfilepath.as_path(), + postgres_ffi::encode_pg_control(controlfile), + )?; Ok(()) } // check that config file is present -pub fn load_config() -> Result { - // home - let home_dir = get_home()?; - - // check file exists - let cfg_path = home_dir.join(".zenith"); - if !cfg_path.exists() { - let err_msg = format!( +pub fn load_config(repopath: &Path) -> Result { + if !repopath.exists() { + anyhow::bail!( "Zenith config is not found in {}. 
You need to run 'zenith init' first", - cfg_path.to_str().unwrap() + repopath.to_str().unwrap() ); - return Err(err_msg.into()); } // load and parse file - let config = fs::read_to_string(cfg_path)?; + let config = fs::read_to_string(repopath.join("config"))?; toml::from_str(config.as_str()).map_err(|e| e.into()) } // local env for tests -pub fn test_env() -> LocalEnv { - let data_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_check"); - fs::create_dir_all(data_dir.clone()).unwrap(); - LocalEnv { - data_dir, +pub fn test_env(testname: &str) -> LocalEnv { + fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check"); + + let repo_path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../tmp_check/") + .join(testname); + + // Remove remnants of old test repo + let _ = fs::remove_dir_all(&repo_path); + + let mut local_env = LocalEnv { + repo_path, pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"), zenith_distrib_dir: cargo_bin_dir(), - } + systemid: 0, + }; + init_repo(&mut local_env).expect("could not initialize zenith repository"); + return local_env; } // Find the directory where the binaries were put (i.e. 
target/debug/) @@ -185,3 +268,122 @@ pub fn cargo_bin_dir() -> PathBuf { return pathbuf; } + +#[derive(Debug, Clone, Copy)] +pub struct PointInTime { + pub timelineid: ZTimelineId, + pub lsn: u64, +} + +fn create_timeline(local_env: &LocalEnv, ancestor: Option) -> Result { + let repopath = &local_env.repo_path; + + // Create initial timeline + let mut tli_buf = [0u8; 16]; + rand::thread_rng().fill(&mut tli_buf); + let timelineid = ZTimelineId::from(tli_buf); + + let timelinedir = repopath.join("timelines").join(timelineid.to_string()); + + fs::create_dir(&timelinedir)?; + fs::create_dir(&timelinedir.join("snapshots"))?; + fs::create_dir(&timelinedir.join("wal"))?; + + if let Some(ancestor) = ancestor { + let data = format!( + "{}@{:X}/{:X}", + ancestor.timelineid, + ancestor.lsn >> 32, + ancestor.lsn & 0xffffffff + ); + fs::write(timelinedir.join("ancestor"), data)?; + } + + Ok(timelineid) +} + +// Parse an LSN in the format used in filenames +// +// For example: 00000000015D3DD8 +// +fn parse_lsn(s: &str) -> std::result::Result { + u64::from_str_radix(s, 16) +} + +// Create a new branch in the repository (for the "zenith branch" subcommand) +pub fn create_branch( + local_env: &LocalEnv, + branchname: &str, + startpoint: PointInTime, +) -> Result<()> { + let repopath = &local_env.repo_path; + + // create a new timeline for it + let newtli = create_timeline(local_env, Some(startpoint))?; + let newtimelinedir = repopath.join("timelines").join(newtli.to_string()); + + let data = newtli.to_string(); + fs::write( + repopath.join("refs").join("branches").join(branchname), + data, + )?; + + // Copy the latest snapshot (TODO: before the startpoint) and all WAL + // TODO: be smarter and avoid the copying... 
+ let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?; + let copy_opts = fs_extra::dir::CopyOptions::new(); + fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?; + + let oldtimelinedir = repopath + .join("timelines") + .join(startpoint.timelineid.to_string()); + let mut copy_opts = fs_extra::dir::CopyOptions::new(); + copy_opts.content_only = true; + fs_extra::dir::copy( + oldtimelinedir.join("wal"), + newtimelinedir.join("wal"), + ©_opts, + )?; + + Ok(()) +} + +// Find the end of valid WAL in a wal directory +pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result { + let repopath = &local_env.repo_path; + let waldir = repopath + .join("timelines") + .join(timeline.to_string()) + .join("wal"); + + let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true); + + return Ok(lsn); +} + +// Find the latest snapshot for a timeline +fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> { + let repopath = &local_env.repo_path; + + let snapshotsdir = repopath + .join("timelines") + .join(timeline.to_string()) + .join("snapshots"); + let paths = fs::read_dir(&snapshotsdir)?; + let mut maxsnapshot: u64 = 0; + let mut snapshotdir: Option = None; + for path in paths { + let path = path?; + let filename = path.file_name().to_str().unwrap().to_owned(); + if let Ok(lsn) = parse_lsn(&filename) { + maxsnapshot = std::cmp::max(lsn, maxsnapshot); + snapshotdir = Some(path.path()); + } + } + if maxsnapshot == 0 { + // TODO: check ancestor timeline + anyhow::bail!("no snapshot found in {}", snapshotsdir.display()); + } + + Ok((maxsnapshot, snapshotdir.unwrap())) +} diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3175998f9e..914cbbf578 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,10 +1,11 @@ -use std::error; +use anyhow::Result; use std::fs; use std::io; use 
std::net::SocketAddr; use std::net::TcpStream; use std::path::{Path, PathBuf}; use std::process::Command; +use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; @@ -12,9 +13,9 @@ use std::time::Duration; use postgres::{Client, NoTls}; -use crate::local_env::{self, LocalEnv}; - -type Result = std::result::Result>; +use crate::compute::PostgresNode; +use crate::local_env::LocalEnv; +use pageserver::ZTimelineId; // // Collection of several example deployments useful for tests. @@ -26,63 +27,70 @@ pub struct TestStorageControlPlane { pub wal_acceptors: Vec, pub pageserver: Arc, pub test_done: AtomicBool, + pub repopath: PathBuf, } impl TestStorageControlPlane { + // Peek into the repository, to grab the timeline ID of given branch + pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId { + let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname); + + ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() + } + // postgres <-> page_server - pub fn one_page_server(pgdata_base_path: String) -> TestStorageControlPlane { - let env = local_env::test_env(); + // + // Initialize a new repository and configure a page server to run in it + // + pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); - - if pgdata_base_path.is_empty() { - pserver.start().unwrap(); - } else { - pserver.start_fromdatadir(pgdata_base_path).unwrap(); - } + pserver.start().unwrap(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } - pub fn one_page_server_no_start() -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn one_page_server_no_start(local_env: &LocalEnv) -> 
TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); let pserver = Arc::new(PageServerNode { - env, + env: local_env.clone(), kill_on_exit: true, listen_address: None, }); - pserver.init(); TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: pserver, test_done: AtomicBool::new(false), + repopath: repopath, } } // postgres <-> {wal_acceptor1, wal_acceptor2, ...} - pub fn fault_tolerant(redundancy: usize) -> TestStorageControlPlane { - let env = local_env::test_env(); + pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane { + let repopath = local_env.repo_path.clone(); + let mut cplane = TestStorageControlPlane { wal_acceptors: Vec::new(), pageserver: Arc::new(PageServerNode { - env: env.clone(), + env: local_env.clone(), kill_on_exit: true, listen_address: None, }), test_done: AtomicBool::new(false), + repopath: repopath, }; - cplane.pageserver.init(); cplane.pageserver.start().unwrap(); const WAL_ACCEPTOR_PORT: usize = 54321; @@ -92,8 +100,8 @@ impl TestStorageControlPlane { listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i) .parse() .unwrap(), - data_dir: env.data_dir.join(format!("wal_acceptor_{}", i)), - env: env.clone(), + data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)), + env: local_env.clone(), }; wal_acceptor.init(); wal_acceptor.start(); @@ -155,58 +163,53 @@ impl PageServerNode { } } - pub fn init(&self) { - fs::create_dir_all(self.env.pageserver_data_dir()).unwrap(); + pub fn repo_path(&self) -> PathBuf { + self.env.repo_path.clone() + } + + pub fn pid_file(&self) -> PathBuf { + self.env.repo_path.join("pageserver.pid") } pub fn start(&self) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); + println!( + "Starting pageserver at '{}' in {}", + self.address(), + self.repo_path().display() + ); - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", 
self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) + let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver")); + cmd.args(&["-l", self.address().to_string().as_str()]) .arg("-d") .env_clear() + .env("RUST_BACKTRACE", "1") + .env("ZENITH_REPO_DIR", self.repo_path()) .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .status()?; + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); - if !status.success() { - return Err(Box::::from(format!( + if !cmd.status()?.success() { + anyhow::bail!( "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + self.repo_path().join("pageserver.log").display() + ); } - } - pub fn start_fromdatadir(&self, pgdata_base_path: String) -> Result<()> { - println!("Starting pageserver at '{}'", self.address()); - - let status = Command::new(self.env.zenith_distrib_dir.join("pageserver")) // XXX -> method - .args(&["-D", self.env.pageserver_data_dir().to_str().unwrap()]) - .args(&["-l", self.address().to_string().as_str()]) - .arg("-d") - .args(&["--restore-from", "local"]) - .env_clear() - .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("PGDATA_BASE_PATH", pgdata_base_path) - .status()?; - - if !status.success() { - return Err(Box::::from(format!( - "Pageserver failed to start. See '{}' for details.", - self.env.pageserver_log().to_str().unwrap() - ))); - } else { - return Ok(()); + // It takes a while for the page server to start up. Wait until it is + // open for business. 
+ for retries in 1..15 { + let client = self.page_server_psql_client(); + if client.is_ok() { + break; + } else { + println!("page server not responding yet, retrying ({})...", retries); + thread::sleep(Duration::from_secs(1)); + } } + Ok(()) } pub fn stop(&self) -> Result<()> { - let pidfile = self.env.pageserver_pidfile(); + let pidfile = self.pid_file(); let pid = read_pidfile(&pidfile)?; let status = Command::new("kill") @@ -216,10 +219,7 @@ impl PageServerNode { .expect("failed to execute kill"); if !status.success() { - return Err(Box::::from(format!( - "Failed to kill pageserver with pid {}", - pid - ))); + anyhow::bail!("Failed to kill pageserver with pid {}", pid); } // await for pageserver stop @@ -234,10 +234,7 @@ impl PageServerNode { // ok, we failed to stop pageserver, let's panic if !status.success() { - return Err(Box::::from(format!( - "Failed to stop pageserver with pid {}", - pid - ))); + anyhow::bail!("Failed to stop pageserver with pid {}", pid); } else { return Ok(()); } @@ -256,6 +253,19 @@ impl PageServerNode { println!("Pageserver query: '{}'", sql); client.simple_query(sql).unwrap() } + + pub fn page_server_psql_client( + &self, + ) -> std::result::Result { + let connstring = format!( + "host={} port={} dbname={} user={}", + self.address().ip(), + self.address().port(), + "no_db", + "no_user", + ); + Client::connect(connstring.as_str(), NoTls) + } } impl Drop for PageServerNode { @@ -295,6 +305,12 @@ impl WalAcceptorNode { let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor")) .args(&["-D", self.data_dir.to_str().unwrap()]) .args(&["-l", self.listen.to_string().as_str()]) + .args(&["--systemid", &self.env.systemid.to_string()]) + // Tell page server it can receive WAL from this WAL safekeeper + // FIXME: If there are multiple safekeepers, they will all inform + // the page server. Only the last "notification" will stay in effect. 
+ // So it's pretty random which safekeeper the page server will connect to + .args(&["--pageserver", "127.0.0.1:64000"]) .arg("-d") .arg("-n") .status() diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml index b201b1849e..51f9d0c773 100644 --- a/integration_tests/Cargo.toml +++ b/integration_tests/Cargo.toml @@ -12,4 +12,6 @@ rand = "0.8.3" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } control_plane = { path = "../control_plane" } diff --git a/integration_tests/tests/test_compute.rs b/integration_tests/tests/test_compute.rs index 955b7ffa5e..f4cf38432e 100644 --- a/integration_tests/tests/test_compute.rs +++ b/integration_tests/tests/test_compute.rs @@ -1,7 +1,11 @@ // test node resettlement to an empty datadir + +// TODO +/* #[test] fn test_resettlement() {} // test seq scan of everythin after restart #[test] fn test_cold_seqscan() {} +*/ diff --git a/integration_tests/tests/test_control_plane.rs b/integration_tests/tests/test_control_plane.rs index 481cd3d8b3..8724d5fda1 100644 --- a/integration_tests/tests/test_control_plane.rs +++ b/integration_tests/tests/test_control_plane.rs @@ -1,5 +1,8 @@ +// TODO +/* #[test] fn test_actions() {} #[test] fn test_regress() {} +*/ diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 8af066ae90..67df31ef65 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -1,23 +1,24 @@ // mod control_plane; use control_plane::compute::ComputeControlPlane; +use control_plane::local_env; +use control_plane::local_env::PointInTime; use control_plane::storage::TestStorageControlPlane; -use std::thread::sleep; -use std::time::Duration; - // XXX: 
force all redo at the end // -- restart + seqscan won't read deleted stuff // -- pageserver api endpoint to check all rels - -// Handcrafted cases with wal records that are (were) problematic for redo. +/* #[test] fn test_redo_cases() { + let local_env = local_env::test_env("test_redo_cases"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); // check basic work with table @@ -47,16 +48,19 @@ fn test_redo_cases() { println!("sum = {}", count); assert_eq!(count, 5000050000); } - +*/ // Runs pg_regress on a compute node #[test] fn test_regress() { + let local_env = local_env::test_env("test_regress"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); node.pg_regress(); @@ -76,16 +80,33 @@ fn pgbench() { node.pg_bench(10, 100); } -// Run two postgres instances on one pageserver +// Run two postgres instances on one pageserver, on different timelines #[test] -fn test_pageserver_multitenancy() { 
- // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); +fn test_pageserver_two_timelines() { + let local_env = local_env::test_env("test_pageserver_two_timelines"); - // Allocate postgres instance, but don't start - let node1 = compute_cplane.new_test_node(); - let node2 = compute_cplane.new_test_node(); + // Start pageserver that reads WAL directly from that postgres + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); + + let maintli = storage_cplane.get_branch_timeline("main"); + + // Create new branch at the end of 'main' + let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); + local_env::create_branch( + &local_env, + "experimental", + PointInTime { + timelineid: maintli, + lsn: startpoint, + }, + ) + .unwrap(); + let experimentaltli = storage_cplane.get_branch_timeline("experimental"); + + // Launch postgres instances on both branches + let node1 = compute_cplane.new_test_node(maintli); + let node2 = compute_cplane.new_test_node(experimentaltli); node1.start().unwrap(); node2.start().unwrap(); @@ -123,36 +144,3 @@ fn test_pageserver_multitenancy() { println!("sum = {}", count); assert_eq!(count, 15000150000); } - -#[test] -fn test_upload_pageserver_local() { - // Init pageserver that reads WAL directly from that postgres - // Don't start yet - - let storage_cplane = TestStorageControlPlane::one_page_server_no_start(); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); - - // init postgres node - let node = compute_cplane.new_test_node(); - - //upload data to pageserver & start it - &storage_cplane - .pageserver - .start_fromdatadir(node.pgdata().to_str().unwrap().to_string()) - .unwrap(); - - 
sleep(Duration::from_secs(10)); - - // start postgres node - node.start().unwrap(); - - // check basic work with table - node.safe_psql( - "postgres", - "CREATE TABLE t(key int primary key, value text)", - ); - node.safe_psql( - "postgres", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ); -} diff --git a/integration_tests/tests/test_wal_acceptor.rs b/integration_tests/tests/test_wal_acceptor.rs index f4f7675b07..939648b2ea 100644 --- a/integration_tests/tests/test_wal_acceptor.rs +++ b/integration_tests/tests/test_wal_acceptor.rs @@ -1,6 +1,9 @@ // Restart acceptors one by one while compute is under the load. use control_plane::compute::ComputeControlPlane; +use control_plane::local_env; +use control_plane::local_env::PointInTime; use control_plane::storage::TestStorageControlPlane; +use pageserver::ZTimelineId; use rand::Rng; use std::sync::Arc; @@ -9,18 +12,20 @@ use std::{thread, time}; #[test] fn test_acceptors_normal_work() { - // Start pageserver that reads WAL directly from that postgres + let local_env = local_env::test_env("test_acceptors_normal_work"); + const REDUNDANCY: usize = 3; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( @@ -41,71 +46,97 @@ fn test_acceptors_normal_work() { // check wal files equality } +// Run 
page server and multiple safekeepers, and multiple compute nodes running +// against different timelines. #[test] -fn test_multitenancy() { - // Start pageserver that reads WAL directly from that postgres +fn test_many_timelines() { + // Initialize a new repository, and set up WAL safekeepers and page server. const REDUNDANCY: usize = 3; - const N_NODES: usize = 5; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + const N_TIMELINES: usize = 5; + let local_env = local_env::test_env("test_many_timelines"); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); - // start postgres - let mut nodes = Vec::new(); - let mut proxies = Vec::new(); - for _ in 0..N_NODES { - let node = compute_cplane.new_test_master_node(); - nodes.push(node); - nodes.last().unwrap().start().unwrap(); - proxies.push(nodes.last().unwrap().start_proxy(wal_acceptors.clone())); - } + // Create branches + let mut timelines: Vec = Vec::new(); + let maintli = storage_cplane.get_branch_timeline("main"); // main branch + timelines.push(maintli); + let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap(); + for i in 1..N_TIMELINES { + // additional branches + let branchname = format!("experimental{}", i); + local_env::create_branch( + &local_env, + &branchname, + PointInTime { + timelineid: maintli, + lsn: startpoint, + }, + ) + .unwrap(); + let tli = storage_cplane.get_branch_timeline(&branchname); + timelines.push(tli); + } + + // start postgres on each timeline + let mut nodes = Vec::new(); + for tli in timelines { + let node = compute_cplane.new_test_node(tli); + nodes.push(node.clone()); + node.start().unwrap(); + node.start_proxy(&wal_acceptors); + } // create schema - for node in 
&nodes { - node.safe_psql( - "postgres", - "CREATE TABLE t(key int primary key, value text)", - ); - } + for node in &nodes { + node.safe_psql( + "postgres", + "CREATE TABLE t(key int primary key, value text)", + ); + } - // Populate data - for node in &nodes { - node.safe_psql( - "postgres", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ); - } + // Populate data + for node in &nodes { + node.safe_psql( + "postgres", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ); + } - // Check data - for node in &nodes { - let count: i64 = node - .safe_psql("postgres", "SELECT sum(key) FROM t") - .first() - .unwrap() - .get(0); - println!("sum = {}", count); - assert_eq!(count, 5000050000); - } + // Check data + for node in &nodes { + let count: i64 = node + .safe_psql("postgres", "SELECT sum(key) FROM t") + .first() + .unwrap() + .get(0); + println!("sum = {}", count); + assert_eq!(count, 5000050000); + } } // Majority is always alive #[test] fn test_acceptors_restarts() { + let local_env = local_env::test_env("test_acceptors_restarts"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; const FAULT_PROBABILITY: f32 = 0.01; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); let mut rng = rand::thread_rng(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); let mut failed_node: 
Option = None; // check basic work with table @@ -150,20 +181,23 @@ fn start_acceptor(cplane: &Arc, no: usize) { // them again and check that nothing was losed. Repeat. // N_CRASHES env var #[test] -fn test_acceptors_unavalability() { +fn test_acceptors_unavailability() { + let local_env = local_env::test_env("test_acceptors_unavailability"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 2; - let storage_cplane = TestStorageControlPlane::fault_tolerant(REDUNDANCY); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( @@ -226,19 +260,24 @@ fn simulate_failures(cplane: Arc) { // Race condition test #[test] fn test_race_conditions() { + let local_env = local_env::test_env("test_race_conditions"); + // Start pageserver that reads WAL directly from that postgres const REDUNDANCY: usize = 3; - let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(REDUNDANCY)); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant( + &local_env, REDUNDANCY, + )); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info(); // start postgres - let node = compute_cplane.new_test_master_node(); + 
let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_master_node(maintli); node.start().unwrap(); // start proxy - let _proxy = node.start_proxy(wal_acceptors); + let _proxy = node.start_proxy(&wal_acceptors); // check basic work with table node.safe_psql( diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d5b3481073..41e0a548fb 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -14,6 +14,7 @@ regex = "1.4.5" bytes = "1.0.1" byteorder = "1.4.3" fs2 = "0.4.3" +fs_extra = "1.2.0" futures = "0.3.13" lazy_static = "1.4.0" slog-stdlog = "4.1.0" @@ -38,3 +39,7 @@ anyhow = "1.0" crc32c = "0.6.0" walkdir = "2" thiserror = "1.0" +hex = "0.4.3" +tar = "0.4.33" + +postgres_ffi = { path = "../postgres_ffi" } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs new file mode 100644 index 0000000000..d8ed5183a8 --- /dev/null +++ b/pageserver/src/basebackup.rs @@ -0,0 +1,202 @@ +use log::*; +use regex::Regex; +use std::fmt; +use std::io::Write; +use tar::Builder; +use walkdir::WalkDir; + +use crate::ZTimelineId; + +pub fn send_snapshot_tarball( + write: &mut dyn Write, + timelineid: ZTimelineId, + snapshotlsn: u64, +) -> Result<(), std::io::Error> { + let mut ar = Builder::new(write); + + let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn); + let walpath = format!("timelines/{}/wal", timelineid); + + debug!("sending tarball of snapshot in {}", snappath); + //ar.append_dir_all("", &snappath)?; + + for entry in WalkDir::new(&snappath) { + let entry = entry?; + let fullpath = entry.path(); + let relpath = entry.path().strip_prefix(&snappath).unwrap(); + + if relpath.to_str().unwrap() == "" { + continue; + } + + if entry.file_type().is_dir() { + trace!( + "sending dir {} as {}", + fullpath.display(), + relpath.display() + ); + ar.append_dir(relpath, fullpath)?; + } else if entry.file_type().is_symlink() { + error!("ignoring symlink in snapshot dir"); + } else if 
entry.file_type().is_file() { + // Shared catalogs are exempt + if relpath.starts_with("global/") { + trace!("sending shared catalog {}", relpath.display()); + ar.append_path_with_name(fullpath, relpath)?; + } else if !is_rel_file_path(relpath.to_str().unwrap()) { + trace!("sending {}", relpath.display()); + ar.append_path_with_name(fullpath, relpath)?; + } else { + trace!("not sending {}", relpath.display()); + // FIXME: send all files for now + ar.append_path_with_name(fullpath, relpath)?; + } + } else { + error!("unknown file type: {}", fullpath.display()); + } + } + + // FIXME: also send all the WAL + for entry in std::fs::read_dir(&walpath)? { + let entry = entry?; + let fullpath = &entry.path(); + let relpath = fullpath.strip_prefix(&walpath).unwrap(); + + if !entry.path().is_file() { + continue; + } + + let archive_fname = relpath.to_str().unwrap().clone(); + let archive_fname = archive_fname + .strip_suffix(".partial") + .unwrap_or(&archive_fname); + let archive_path = "pg_wal/".to_owned() + archive_fname; + ar.append_path_with_name(fullpath, archive_path)?; + } + + ar.finish()?; + debug!("all tarred up!"); + Ok(()) +} + +// formats: +// +// _ +// . +// _. 
+ +#[derive(Debug)] +struct FilePathError { + msg: String, +} + +impl FilePathError { + fn new(msg: &str) -> FilePathError { + FilePathError { + msg: msg.to_string(), + } + } +} + +impl From for FilePathError { + fn from(e: core::num::ParseIntError) -> Self { + return FilePathError { + msg: format!("invalid filename: {}", e), + }; + } +} + +impl fmt::Display for FilePathError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "invalid filename") + } +} + +fn forkname_to_forknum(forkname: Option<&str>) -> Result { + match forkname { + // "main" is not in filenames, it's implicit if the fork name is not present + None => Ok(0), + Some("fsm") => Ok(1), + Some("vm") => Ok(2), + Some("init") => Ok(3), + Some(_) => Err(FilePathError::new("invalid forkname")), + } +} + +fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { + let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); + + let caps = re + .captures(fname) + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + + let relnode_str = caps.name("relnode").unwrap().as_str(); + let relnode = u32::from_str_radix(relnode_str, 10)?; + + let forkname_match = caps.name("forkname"); + let forkname = if forkname_match.is_none() { + None + } else { + Some(forkname_match.unwrap().as_str()) + }; + let forknum = forkname_to_forknum(forkname)?; + + let segno_match = caps.name("segno"); + let segno = if segno_match.is_none() { + 0 + } else { + u32::from_str_radix(segno_match.unwrap().as_str(), 10)? 
+ }; + + return Ok((relnode, forknum, segno)); +} + +fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> { + /* + * Relation data files can be in one of the following directories: + * + * global/ + * shared relations + * + * base// + * regular relations, default tablespace + * + * pg_tblspc/// + * within a non-default tablespace (the name of the directory + * depends on version) + * + * And the relation data files themselves have a filename like: + * + * . + */ + if let Some(fname) = path.strip_prefix("global/") { + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(dbpath) = path.strip_prefix("base/") { + let mut s = dbpath.split("/"); + let dbnode_str = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + let _dbnode = u32::from_str_radix(dbnode_str, 10)?; + let fname = s + .next() + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + if s.next().is_some() { + return Err(FilePathError::new("invalid relation data file name")); + }; + + let (_relnode, _forknum, _segno) = parse_filename(fname)?; + + return Ok(()); + } else if let Some(_) = path.strip_prefix("pg_tblspc/") { + // TODO + return Err(FilePathError::new("tablespaces not supported")); + } else { + return Err(FilePathError::new("invalid relation data file name")); + } +} + +fn is_rel_file_path(path: &str) -> bool { + return parse_rel_file_path(path).is_ok(); +} diff --git a/pageserver/src/bin/cli/main.rs b/pageserver/src/bin/cli/main.rs deleted file mode 100644 index 4aa3269c09..0000000000 --- a/pageserver/src/bin/cli/main.rs +++ /dev/null @@ -1,43 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -pub mod pg; -pub mod snapshot; -pub mod storage; -mod subcommand; - -fn main() -> Result<()> { - let cli_commands = subcommand::ClapCommands { - commands: vec![ - Box::new(pg::PgCmd { - clap_cmd: clap::SubCommand::with_name("pg"), - }), - Box::new(storage::StorageCmd { - clap_cmd: 
clap::SubCommand::with_name("storage"), - }), - Box::new(snapshot::SnapshotCmd { - clap_cmd: clap::SubCommand::with_name("snapshot"), - }), - ], - }; - - let matches = App::new("zenith") - .about("Zenith CLI") - .version("1.0") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommands(cli_commands.generate()) - .get_matches(); - - if let Some(subcommand) = matches.subcommand_name() { - println!("'git {}' was used", subcommand); - } - - match matches.subcommand() { - ("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?, - ("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?, - ("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?, - ("", None) => println!("No subcommand"), - _ => unreachable!(), - } - Ok(()) -} diff --git a/pageserver/src/bin/cli/pg.rs b/pageserver/src/bin/cli/pg.rs deleted file mode 100644 index 7fe2f86d6c..0000000000 --- a/pageserver/src/bin/cli/pg.rs +++ /dev/null @@ -1,105 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct PgCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for PgCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith compute nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list").about("List existing compute nodes")) - .subcommand( - App::new("create") - .about( - "Create (init) new data directory using given storage and start postgres", - ) - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("storage") - .short("s") - .long("storage") - .takes_value(true) - .help("Name of the storage node to use"), - ) - //TODO should it be just name of uploaded snapshot or some path? 
- .arg( - Arg::with_name("snapshot") - .long("snapshot") - .takes_value(true) - .help("Name of the snapshot to use"), - ) - .arg( - Arg::with_name("nostart") - .long("no-start") - .takes_value(false) - .help("Don't start postgres on the created node"), - ), - ) - .subcommand( - App::new("destroy") - .about("Stop postgres and destroy node's data directory") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("start") - .about("Start postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ) - .arg( - Arg::with_name("replica") - .long("replica") - .takes_value(false) - .help("Start the compute node as replica"), - ), - ) - .subcommand( - App::new("stop") - .about("Stop postgres on the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - .subcommand( - App::new("show") - .about("Show info about the given node") - .arg( - Arg::with_name("name") - .short("n") - .long("name") - .takes_value(true) - .help("Name of the compute node"), - ), - ) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run PgCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/snapshot.rs b/pageserver/src/bin/cli/snapshot.rs deleted file mode 100644 index 47e608b8e2..0000000000 --- a/pageserver/src/bin/cli/snapshot.rs +++ /dev/null @@ -1,27 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings, Arg}; - -use crate::subcommand; - -pub struct SnapshotCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for SnapshotCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith snapshots") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - 
.subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true))) - .subcommand(App::new("destroy")) - .subcommand(App::new("start")) - .subcommand(App::new("stop")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run SnapshotCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/storage.rs b/pageserver/src/bin/cli/storage.rs deleted file mode 100644 index 71ca61e905..0000000000 --- a/pageserver/src/bin/cli/storage.rs +++ /dev/null @@ -1,25 +0,0 @@ -use anyhow::Result; -use clap::{App, AppSettings}; - -use crate::subcommand; - -pub struct StorageCmd<'a> { - pub clap_cmd: clap::App<'a, 'a>, -} - -impl subcommand::SubCommand for StorageCmd<'_> { - fn gen_clap_command(&self) -> clap::App { - let c = self.clap_cmd.clone(); - c.about("Operations with zenith storage nodes") - .setting(AppSettings::SubcommandRequiredElseHelp) - .subcommand(App::new("list")) - .subcommand(App::new("attach")) - .subcommand(App::new("detach")) - .subcommand(App::new("show")) - } - - fn run(&self, args: clap::ArgMatches) -> Result<()> { - println!("Run StorageCmd with args {:?}", args); - Ok(()) - } -} diff --git a/pageserver/src/bin/cli/subcommand.rs b/pageserver/src/bin/cli/subcommand.rs deleted file mode 100644 index 6a9e7363b9..0000000000 --- a/pageserver/src/bin/cli/subcommand.rs +++ /dev/null @@ -1,29 +0,0 @@ -use anyhow::Result; - -/// All subcommands need to implement this interface. -pub trait SubCommand { - /// Generates the cli-config that Clap requires for the subcommand. - fn gen_clap_command(&self) -> clap::App; - - /// Runs the body of the subcommand. - fn run(&self, args: clap::ArgMatches) -> Result<()>; -} - -/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must -/// implement the `SubCommand` trait, but other than that, can be of any type. 
-pub struct ClapCommands { - pub commands: Vec>, -} - -impl ClapCommands { - /// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in - /// order to generate the full CLI. - pub fn generate(&self) -> Vec { - let mut v: Vec = Vec::new(); - - for command in self.commands.iter() { - v.push(command.gen_clap_command()); - } - v - } -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2ba51e83a2..12db5180af 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,6 +4,7 @@ use log::*; use std::fs; +use std::fs::{File, OpenOptions}; use std::io; use std::path::PathBuf; use std::process::exit; @@ -17,59 +18,50 @@ use daemonize::Daemonize; use slog::Drain; use pageserver::page_service; -use pageserver::restore_datadir; -use pageserver::restore_s3; use pageserver::tui; -use pageserver::walreceiver; +//use pageserver::walreceiver; use pageserver::PageServerConf; +fn zenith_repo_dir() -> String { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => String::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} + fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") - .arg(Arg::with_name("datadir") - .short("D") - .long("dir") - .takes_value(true) - .help("Path to the page server data directory")) - .arg(Arg::with_name("wal_producer") - .short("w") - .long("wal-producer") - .takes_value(true) - .help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')")) - .arg(Arg::with_name("listen") - .short("l") - .long("listen") - .takes_value(true) - .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)")) - .arg(Arg::with_name("interactive") - .short("i") - .long("interactive") - .takes_value(false) - .help("Interactive mode")) - 
.arg(Arg::with_name("daemonize") - .short("d") - .long("daemonize") - .takes_value(false) - .help("Run in the background")) - .arg(Arg::with_name("restore_from") - .long("restore-from") - .takes_value(true) - .help("Upload data from s3 or datadir")) + .arg( + Arg::with_name("listen") + .short("l") + .long("listen") + .takes_value(true) + .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"), + ) + .arg( + Arg::with_name("interactive") + .short("i") + .long("interactive") + .takes_value(false) + .help("Interactive mode"), + ) + .arg( + Arg::with_name("daemonize") + .short("d") + .long("daemonize") + .takes_value(false) + .help("Run in the background"), + ) .get_matches(); let mut conf = PageServerConf { - data_dir: PathBuf::from("./"), daemonize: false, interactive: false, - wal_producer_connstr: None, listen_addr: "127.0.0.1:5430".parse().unwrap(), - restore_from: String::new(), }; - if let Some(dir) = arg_matches.value_of("datadir") { - conf.data_dir = PathBuf::from(dir); - } - if arg_matches.is_present("daemonize") { conf.daemonize = true; } @@ -83,14 +75,6 @@ fn main() -> Result<()> { exit(1); } - if let Some(restore_from) = arg_matches.value_of("restore_from") { - conf.restore_from = String::from(restore_from); - } - - if let Some(addr) = arg_matches.value_of("wal_producer") { - conf.wal_producer_connstr = Some(String::from(addr)); - } - if let Some(addr) = arg_matches.value_of("listen") { conf.listen_addr = addr.parse()?; } @@ -125,19 +109,25 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { if conf.daemonize { info!("daemonizing..."); - // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so + let repodir = PathBuf::from(zenith_repo_dir()); + + // There should'n be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fprintf's or backtraces. 
- let log_filename = conf.data_dir.join("pageserver.log"); + let log_filename = repodir.join("pageserver.log"); let stdout = OpenOptions::new() .create(true) .append(true) .open(&log_filename) - .with_context(|| format!("failed to open {:?}", log_filename))?; - let stderr = stdout.try_clone()?; + .with_context(|| format!("failed to open {:?}", &log_filename))?; + let stderr = OpenOptions::new() + .create(true) + .append(true) + .open(&log_filename) + .with_context(|| format!("failed to open {:?}", &log_filename))?; let daemonize = Daemonize::new() - .pid_file(conf.data_dir.join("pageserver.pid")) - .working_directory(conf.data_dir.clone()) + .pid_file(repodir.clone().join("pageserver.pid")) + .working_directory(repodir) .stdout(stdout) .stderr(stderr); @@ -145,25 +135,20 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { Ok(_) => info!("Success, daemonized"), Err(e) => error!("Error, {}", e), } + } else { + // change into the repository directory. In daemon mode, Daemonize + // does this for us. + let repodir = zenith_repo_dir(); + std::env::set_current_dir(&repodir)?; + info!("Changed current directory to repository in {}", &repodir); } let mut threads = Vec::new(); - info!("starting... {}", conf.restore_from); - - // Before opening up for connections, restore the latest base backup from S3. 
- // (We don't persist anything to local disk at the moment, so we need to do - // this at every startup) - if conf.restore_from.eq("s3") { - info!("restore-from s3..."); - restore_s3::restore_main(&conf); - } else if conf.restore_from.eq("local") { - info!("restore-from local..."); - restore_datadir::restore_main(&conf); - } + // TODO: Check that it looks like a valid repository before going further // Create directory for wal-redo datadirs - match fs::create_dir(conf.data_dir.join("wal-redo")) { + match fs::create_dir("wal-redo") { Ok(_) => {} Err(e) => match e.kind() { io::ErrorKind::AlreadyExists => {} @@ -173,25 +158,6 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { }, } - // Launch the WAL receiver thread if pageserver was started with --wal-producer - // option. It will try to connect to the WAL safekeeper, and stream the WAL. If - // the connection is lost, it will reconnect on its own. We just fire and forget - // it here. - // - // All other wal receivers are started on demand by "callmemaybe" command - // sent to pageserver. - if let Some(wal_producer) = &conf.wal_producer_connstr { - let conf_copy = conf.clone(); - let wal_producer = wal_producer.clone(); - let walreceiver_thread = thread::Builder::new() - .name("static WAL receiver thread".into()) - .spawn(move || { - walreceiver::thread_main(&conf_copy, &wal_producer); - }) - .unwrap(); - threads.push(walreceiver_thread); - } - // GetPage@LSN requests are served by another thread. (It uses async I/O, // but the code in page_service sets up it own thread pool for that) let conf_copy = conf.clone(); @@ -220,20 +186,19 @@ fn init_logging(conf: &PageServerConf) -> Result, pub listen_addr: SocketAddr, - pub restore_from: String, +} + +// Zenith Timeline ID is a 32-byte random ID. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct ZTimelineId([u8; 16]); + +impl FromStr for ZTimelineId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + let timelineid = hex::decode(s)?; + + let mut buf: [u8; 16] = [0u8; 16]; + buf.copy_from_slice(timelineid.as_slice()); + Ok(ZTimelineId(buf)) + } +} + +impl ZTimelineId { + pub fn from(b: [u8; 16]) -> ZTimelineId { + ZTimelineId(b) + } + + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId { + let mut arr = [0u8; 16]; + buf.copy_to_slice(&mut arr); + ZTimelineId::from(arr) + } + + pub fn as_arr(&self) -> [u8; 16] { + self.0 + } +} + +impl fmt::Display for ZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&hex::encode(self.0)) + } } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 336eba1f55..9ed0a422c5 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -6,6 +6,8 @@ // per-entry mutex. // +use crate::restore_local_repo::restore_timeline; +use crate::ZTimelineId; use crate::{walredo, PageServerConf}; use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -99,34 +101,57 @@ struct PageCacheShared { } lazy_static! { - pub static ref PAGECACHES: Mutex>> = Mutex::new(HashMap::new()); + pub static ref PAGECACHES: Mutex>> = + Mutex::new(HashMap::new()); } -pub fn get_pagecache(conf: &PageServerConf, sys_id: u64) -> Arc { +// Get Page Cache for given timeline. It is assumed to already exist. 
+pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option> { + let pcaches = PAGECACHES.lock().unwrap(); + + match pcaches.get(&timelineid) { + Some(pcache) => Some(pcache.clone()), + None => None, + } +} + +pub fn get_or_restore_pagecache( + conf: &PageServerConf, + timelineid: ZTimelineId, +) -> anyhow::Result> { let mut pcaches = PAGECACHES.lock().unwrap(); - if !pcaches.contains_key(&sys_id) { - pcaches.insert(sys_id, Arc::new(init_page_cache(&conf, sys_id))); + match pcaches.get(&timelineid) { + Some(pcache) => Ok(pcache.clone()), + None => { + let pcache = init_page_cache(&conf, timelineid); - // Initialize the WAL redo thread - // - // Now join_handle is not saved any where and we won'try restart tharead - // if it is dead. We may later stop that treads after some inactivity period - // and restart them on demand. - let conf = conf.clone(); - let _walredo_thread = thread::Builder::new() - .name("WAL redo thread".into()) - .spawn(move || { - walredo::wal_redo_main(&conf, sys_id); - }) - .unwrap(); + restore_timeline(conf, &pcache, timelineid)?; + + let result = Arc::new(pcache); + + pcaches.insert(timelineid, result.clone()); + + // Initialize the WAL redo thread + // + // Now join_handle is not saved any where and we won'try restart tharead + // if it is dead. We may later stop that treads after some inactivity period + // and restart them on demand. 
+ let conf_copy = conf.clone(); + let _walredo_thread = thread::Builder::new() + .name("WAL redo thread".into()) + .spawn(move || { + walredo::wal_redo_main(&conf_copy, timelineid); + }) + .unwrap(); + + return Ok(result); + } } - - pcaches.get(&sys_id).unwrap().clone() } -fn open_rocksdb(conf: &PageServerConf, sys_id: u64) -> DB { - let path = conf.data_dir.join(sys_id.to_string()); +fn open_rocksdb(conf: &PageServerConf, timelineid: u64) -> DB { + let path = conf.data_dir.join(timelineid.to_string()); let mut opts = Options::default(); opts.create_if_missing(true); opts.set_use_fsync(true); @@ -134,12 +159,12 @@ fn open_rocksdb(conf: &PageServerConf, sys_id: u64) -> DB { DB::open(&opts, &path).unwrap() } -fn init_page_cache(conf: &PageServerConf, sys_id: u64) -> PageCache { +fn init_page_cache(conf: &PageServerConf, timelineid: u64) -> PageCache { // Initialize the channel between the page cache and the WAL applicator let (s, r) = unbounded(); PageCache { - db: open_rocksdb(&conf, sys_id), + db: open_rocksdb(&conf, timelineid), shared: Mutex::new(PageCacheShared { first_valid_lsn: 0, last_valid_lsn: 0, @@ -520,7 +545,8 @@ impl PageCache { // Adds a WAL record to the page cache // pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) { - let key = CacheKey { tag, lsn: rec.lsn }; + let lsn = rec.lsn; + let key = CacheKey { tag, lsn }; let content = CacheEntryContent { page_image: None, @@ -533,8 +559,8 @@ impl PageCache { let mut val_buf = BytesMut::new(); content.pack(&mut val_buf); - trace!("put_wal_record lsn: {}", key.lsn); let _res = self.db.put(&key_buf[..], &val_buf[..]); + //trace!("put_wal_record lsn: {}", lsn); self.num_entries.fetch_add(1, Ordering::Relaxed); self.num_wal_records.fetch_add(1, Ordering::Relaxed); @@ -599,17 +625,19 @@ impl PageCache { let mut shared = self.shared.lock().unwrap(); // Can't move backwards. 
- //assert!(lsn >= shared.last_valid_lsn); - if lsn > shared.last_valid_lsn { + let oldlsn = shared.last_valid_lsn; + if lsn >= oldlsn { shared.last_valid_lsn = lsn; self.valid_lsn_condvar.notify_all(); self.last_valid_lsn.store(lsn, Ordering::Relaxed); } else { - trace!( - "lsn={}, shared.last_valid_lsn={}", - lsn, - shared.last_valid_lsn + warn!( + "attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})", + oldlsn >> 32, + oldlsn & 0xffffffff, + lsn >> 32, + lsn & 0xffffffff ); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2335cc3bce..1ab7ee4eb4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,29 +7,43 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. -// *callmemaybe $url* -- ask pageserver to start walreceiver on $url +// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use byteorder::{BigEndian, ByteOrder}; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; use log::*; +use regex::Regex; use std::io; +use std::str::FromStr; +use std::sync::Arc; use std::thread; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter}; use tokio::net::{TcpListener, TcpStream}; use tokio::runtime; +use tokio::runtime::Runtime; +use tokio::sync::mpsc; use tokio::task; +use crate::basebackup; use crate::page_cache; +use crate::restore_local_repo; use crate::walreceiver; use crate::PageServerConf; +use crate::ZTimelineId; type Result = std::result::Result; #[derive(Debug)] enum FeMessage { StartupMessage(FeStartupMessage), - Query(FeQueryMessage), + Query(FeQueryMessage), // Simple query + Parse(FeParseMessage), // Extended query protocol + Describe(FeDescribeMessage), + Bind(FeBindMessage), + Execute(FeExecuteMessage), + Close(FeCloseMessage), + Sync, Terminate, // @@ -45,6 +59,11 @@ enum BeMessage { AuthenticationOk, ReadyForQuery, 
RowDescription, + ParseComplete, + ParameterDescription, + NoData, + BindComplete, + CloseComplete, DataRow, CommandComplete, ControlFile, @@ -141,6 +160,176 @@ struct FeQueryMessage { body: Bytes, } +// We only support the simple case of Parse on unnamed prepared statement and +// no params +#[derive(Debug)] +struct FeParseMessage { + query_string: Bytes, +} + +fn read_null_terminated(buf: &mut Bytes) -> Result { + let mut result = BytesMut::new(); + + loop { + if !buf.has_remaining() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "no null-terminator in string", + )); + } + + let byte = buf.get_u8(); + + if byte == 0 { + break; + } + result.put_u8(byte); + } + return Ok(result.freeze()); +} + +impl FeParseMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let _pstmt_name = read_null_terminated(&mut buf)?; + let query_string = read_null_terminated(&mut buf)?; + let nparams = buf.get_i16(); + + // FIXME: the rust-postgres driver uses a named prepared statement + // for copy_out(). We're not prepared to handle that correctly. For + // now, just ignore the statement name, assuming that the client never + // uses more than one prepared statement at a time. + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented in Parse", + )); + } + */ + + if nparams != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "query params not implemented", + )); + } + + Ok(FeMessage::Parse(FeParseMessage { query_string })) + } +} + +#[derive(Debug)] +struct FeDescribeMessage { + kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal. 
+ // we only support unnamed prepared stmt or portal +} + +impl FeDescribeMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let kind = buf.get_u8(); + let _pstmt_name = read_null_terminated(&mut buf)?; + + // FIXME: see FeParseMessage::parse + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented in Describe", + )); + } + */ + + if kind != b'S' { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "only prepared statmement Describe is implemented", + )); + } + + Ok(FeMessage::Describe(FeDescribeMessage { kind })) + } +} + +// we only support unnamed prepared stmt or portal +#[derive(Debug)] +struct FeExecuteMessage { + /// max # of rows + maxrows: i32, +} + +impl FeExecuteMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let portal_name = read_null_terminated(&mut buf)?; + let maxrows = buf.get_i32(); + + if portal_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named portals not implemented", + )); + } + + if maxrows != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "row limit in Execute message not supported", + )); + } + + Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) + } +} + +// we only support unnamed prepared stmt and portal +#[derive(Debug)] +struct FeBindMessage {} + +impl FeBindMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let portal_name = read_null_terminated(&mut buf)?; + let _pstmt_name = read_null_terminated(&mut buf)?; + + if portal_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named portals not implemented", + )); + } + + // FIXME: see FeParseMessage::parse + /* + if pstmt_name.len() != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "named prepared statements not implemented", + )); + } + */ + + Ok(FeMessage::Bind(FeBindMessage {})) + } +} + +// we 
only support unnamed prepared stmt and portal +#[derive(Debug)] +struct FeCloseMessage {} + +impl FeCloseMessage { + pub fn parse(body: Bytes) -> Result { + let mut buf = body.clone(); + let _kind = buf.get_u8(); + let _pstmt_or_portal_name = read_null_terminated(&mut buf)?; + + // FIXME: we do nothing with Close + + Ok(FeMessage::Close(FeCloseMessage {})) + } +} + impl FeMessage { pub fn parse(buf: &mut BytesMut) -> Result> { if buf.len() < 5 { @@ -169,10 +358,16 @@ impl FeMessage { let mut body = buf.split_to(total_len); body.advance(5); + let mut body = body.freeze(); + match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { - body: body.freeze(), - }))), + b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body: body }))), + b'P' => Ok(Some(FeParseMessage::parse(body)?)), + b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), + b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), + b'B' => Ok(Some(FeBindMessage::parse(body)?)), + b'C' => Ok(Some(FeCloseMessage::parse(body)?)), + b'S' => Ok(Some(FeMessage::Sync)), b'X' => Ok(Some(FeMessage::Terminate)), b'd' => { let smgr_tag = body.get_u8(); @@ -210,24 +405,35 @@ impl FeMessage { pub fn thread_main(conf: &PageServerConf) { // Create a new thread pool // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. - //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); + // FIXME: It would be nice to keep this single-threaded for debugging purposes, + // but that currently leads to a deadlock: if a GetPage@LSN request arrives + // for an LSN that hasn't been received yet, the thread gets stuck waiting for + // the WAL to arrive. 
If the WAL receiver hasn't been launched yet, i.e + // we haven't received a "callmemaybe" request yet to tell us where to get the + // WAL, we will not have a thread available to process the "callmemaybe" + // request when it does arrive. Using a thread pool alleviates the problem so + // that it doesn't happen in the tests anymore, but in principle it could still + // happen if we receive enough GetPage@LSN requests to consume all of the + // available threads. + //let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap(); + let runtime = runtime::Runtime::new().unwrap(); info!("Starting page server on {}", conf.listen_addr); - runtime.block_on(async { + let runtime_ref = Arc::new(runtime); + + runtime_ref.clone().block_on(async { let listener = TcpListener::bind(conf.listen_addr).await.unwrap(); loop { let (socket, peer_addr) = listener.accept().await.unwrap(); debug!("accepted connection from {}", peer_addr); +<<<<<<< HEAD socket.set_nodelay(true).unwrap(); let mut conn_handler = Connection::new(conf.clone(), socket); +======= + let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref); +>>>>>>> main task::spawn(async move { if let Err(err) = conn_handler.run().await { @@ -244,15 +450,17 @@ struct Connection { buffer: BytesMut, init_done: bool, conf: PageServerConf, + runtime: Arc, } impl Connection { - pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection { + pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc) -> Connection { Connection { stream: BufWriter::new(socket), buffer: BytesMut::with_capacity(10 * 1024), init_done: false, conf, + runtime: Arc::clone(runtime), } } @@ -300,6 +508,33 @@ impl Connection { self.stream.write_u8(b'I').await?; } + BeMessage::ParseComplete => { + self.stream.write_u8(b'1').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::BindComplete => { + self.stream.write_u8(b'2').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::CloseComplete => { 
+ self.stream.write_u8(b'3').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::NoData => { + self.stream.write_u8(b'n').await?; + self.stream.write_i32(4).await?; + } + + BeMessage::ParameterDescription => { + self.stream.write_u8(b't').await?; + self.stream.write_i32(6).await?; + // we don't support params, so always 0 + self.stream.write_i16(0).await?; + } + BeMessage::RowDescription => { // XXX let mut b = Bytes::from("data\0"); @@ -389,8 +624,11 @@ impl Connection { } async fn run(&mut self) -> Result<()> { + let mut unnamed_query_string = Bytes::new(); loop { - match self.read_message().await? { + let msg = self.read_message().await?; + info!("got message {:?}", msg); + match msg { Some(FeMessage::StartupMessage(m)) => { trace!("got message {:?}", m); @@ -410,7 +648,28 @@ impl Connection { } } Some(FeMessage::Query(m)) => { - self.process_query(&m).await?; + self.process_query(m.body).await?; + } + Some(FeMessage::Parse(m)) => { + unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete).await?; + } + Some(FeMessage::Describe(_)) => { + self.write_message_noflush(&BeMessage::ParameterDescription) + .await?; + self.write_message(&BeMessage::NoData).await?; + } + Some(FeMessage::Bind(_)) => { + self.write_message(&BeMessage::BindComplete).await?; + } + Some(FeMessage::Close(_)) => { + self.write_message(&BeMessage::CloseComplete).await?; + } + Some(FeMessage::Execute(_)) => { + self.process_query(unnamed_query_string.clone()).await?; + } + Some(FeMessage::Sync) => { + self.write_message(&BeMessage::ReadyForQuery).await?; } Some(FeMessage::Terminate) => { break; @@ -419,7 +678,8 @@ impl Connection { info!("connection closed"); break; } - _ => { + x => { + error!("unexpected message type : {:?}", x); return Err(io::Error::new(io::ErrorKind::Other, "unexpected message")); } } @@ -428,41 +688,62 @@ impl Connection { Ok(()) } - async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> { - trace!("got query {:?}", 
q.body); + async fn process_query(&mut self, query_string: Bytes) -> Result<()> { + debug!("process query {:?}", query_string); - if q.body.starts_with(b"controlfile") { + // remove null terminator, if any + let mut query_string = query_string.clone(); + if query_string.last() == Some(&0) { + query_string.truncate(query_string.len() - 1); + } + + if query_string.starts_with(b"controlfile") { self.handle_controlfile().await - } else if q.body.starts_with(b"pagestream ") { - let (_l, r) = q.body.split_at("pagestream ".len()); - let mut r = r.to_vec(); - r.pop(); - let sysid = String::from_utf8(r).unwrap().trim().to_string(); - let sysid: u64 = sysid.parse().unwrap(); // XXX + } else if query_string.starts_with(b"pagestream ") { + let (_l, r) = query_string.split_at("pagestream ".len()); + let timelineid_str = String::from_utf8(r.to_vec()).unwrap(); + let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap(); - self.handle_pagerequests(sysid).await - } else if q.body.starts_with(b"callmemaybe ") { - let (_l, r) = q.body.split_at("callmemaybe ".len()); - let mut r = r.to_vec(); - r.pop(); - let connstr = String::from_utf8(r).unwrap().trim().to_string(); + self.handle_pagerequests(timelineid).await + } else if query_string.starts_with(b"basebackup ") { + let (_l, r) = query_string.split_at("basebackup ".len()); + let r = r.to_vec(); + let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end()); + info!("got basebackup command: \"{}\"", timelineid_str); + let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap(); - let conf_copy = self.conf.clone(); - let _walreceiver_thread = thread::Builder::new() - .name("WAL receiver thread".into()) - .spawn(move || { - walreceiver::thread_main(&conf_copy, &connstr); - }) - .unwrap(); - - // generic ack: - self.write_message_noflush(&BeMessage::RowDescription) - .await?; - self.write_message_noflush(&BeMessage::DataRow).await?; + // Check that the timeline exists + 
self.handle_basebackup_request(timelineid).await?; self.write_message_noflush(&BeMessage::CommandComplete) .await?; self.write_message(&BeMessage::ReadyForQuery).await - } else if q.body.starts_with(b"status") { + } else if query_string.starts_with(b"callmemaybe ") { + let query_str = String::from_utf8(query_string.to_vec()) + .unwrap() + .to_string(); + + // callmemaybe + let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap(); + let caps = re.captures(&query_str); + let caps = caps.unwrap(); + + let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str().clone()).unwrap(); + let connstr: String = String::from(caps.get(2).unwrap().as_str()); + + // Check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid))); + } + + walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr); + + self.write_message_noflush(&BeMessage::CommandComplete) + .await?; + self.write_message(&BeMessage::ReadyForQuery).await + } else if query_string.starts_with(b"status") { self.write_message_noflush(&BeMessage::RowDescription) .await?; self.write_message_noflush(&BeMessage::DataRow).await?; @@ -488,7 +769,16 @@ impl Connection { self.write_message(&BeMessage::ReadyForQuery).await } - async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> { + async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> { + // Check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested pagestream on timeline {} which does not exist in page server", timelineid))); + } + let pcache = pcache.unwrap(); + /* switch client to COPYBOTH */ 
self.stream.write_u8(b'W').await?; self.stream.write_i32(4 + 1 + 2).await?; @@ -496,15 +786,21 @@ impl Connection { self.stream.write_i16(0).await?; /* numAttributes */ self.stream.flush().await?; - let pcache = page_cache::get_pagecache(&self.conf, sysid); - loop { let message = self.read_message().await?; +<<<<<<< HEAD /* if let Some(m) = &message { trace!("query({}): {:?}", sysid, m); }; */ +======= + + if let Some(m) = &message { + info!("query({:?}): {:?}", timelineid, m); + }; + +>>>>>>> main if message.is_none() { // connection was closed return Ok(()); @@ -573,8 +869,140 @@ impl Connection { self.write_message(&msg).await? } +<<<<<<< HEAD +======= + Some(FeMessage::ZenithCreateRequest(req)) => { + let tag = page_cache::RelTag { + spcnode: req.spcnode, + dbnode: req.dbnode, + relnode: req.relnode, + forknum: req.forknum, + }; + + pcache.relsize_inc(&tag, 0); + + self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { + ok: true, + n_blocks: 0, + })) + .await? + } + Some(FeMessage::ZenithExtendRequest(req)) => { + let tag = page_cache::RelTag { + spcnode: req.spcnode, + dbnode: req.dbnode, + relnode: req.relnode, + forknum: req.forknum, + }; + + pcache.relsize_inc(&tag, req.blkno + 1); + + self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { + ok: true, + n_blocks: 0, + })) + .await? 
+ } +>>>>>>> main _ => {} } } } + + async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> { + // check that the timeline exists + let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid); + if pcache.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("client requested basebackup on timeline {} which does not exist in page server", timelineid))); + } + + /* switch client to COPYOUT */ + let stream = &mut self.stream; + stream.write_u8(b'H').await?; + stream.write_i32(4 + 1 + 2).await?; + stream.write_u8(0).await?; /* copy_is_binary */ + stream.write_i16(0).await?; /* numAttributes */ + stream.flush().await?; + info!("sent CopyOut"); + + /* Send a tarball of the latest snapshot on the timeline */ + + // find latest snapshot + let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap(); + + // Stream it + let (s, mut r) = mpsc::channel(5); + + let f_tar = task::spawn_blocking(move || { + basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?; + Ok(()) + }); + let f_tar2 = async { + let joinres = f_tar.await; + + if joinres.is_err() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + joinres.unwrap_err(), + )); + } + return joinres.unwrap(); + }; + + let f_pump = async move { + loop { + let buf = r.recv().await; + if buf.is_none() { + break; + } + let mut buf = buf.unwrap(); + + // CopyData + stream.write_u8(b'd').await?; + stream.write_u32((4 + buf.len()) as u32).await?; + stream.write_all(&mut buf).await?; + trace!("CopyData sent for {} bytes!", buf.len()); + + // FIXME: flush isn't really required, but makes it easier + // to view in wireshark + stream.flush().await?; + } + Ok(()) + }; + + tokio::try_join!(f_tar2, f_pump)?; + + // CopyDone + self.stream.write_u8(b'c').await?; + self.stream.write_u32(4).await?; + self.stream.flush().await?; + debug!("CopyDone sent!"); + + // FIXME: I'm getting an error from the tokio 
copyout driver without this. + // I think it happens when the CommandComplete, CloseComplete and ReadyForQuery + // are sent in the same TCP packet as the CopyDone. I don't understand why. + thread::sleep(std::time::Duration::from_secs(1)); + + Ok(()) + } +} + +struct CopyDataSink(mpsc::Sender); + +impl std::io::Write for CopyDataSink { + fn write(&mut self, data: &[u8]) -> std::result::Result { + let buf = Bytes::copy_from_slice(data); + + if let Err(e) = self.0.blocking_send(buf) { + return Err(io::Error::new(io::ErrorKind::Other, e)); + } + + Ok(data.len()) + } + fn flush(&mut self) -> std::result::Result<(), std::io::Error> { + // no-op + Ok(()) + } } diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs deleted file mode 100644 index b16276e7ab..0000000000 --- a/pageserver/src/restore_datadir.rs +++ /dev/null @@ -1,333 +0,0 @@ -// -// Restore chunks from S3 -// -// This runs once at Page Server startup. It loads all the "base images" from -// S3 into the in-memory page cache. It also initializes the "last valid LSN" -// in the page cache to the LSN of the base image, so that when the WAL receiver -// is started, it starts streaming from that LSN. 
-// - -use bytes::{Buf, BytesMut}; -use log::*; -use regex::Regex; -use std::env; -use std::fmt; - -use tokio::runtime; - -use futures::future; - -use crate::{page_cache, pg_constants, PageServerConf}; -use std::fs; -use walkdir::WalkDir; - -pub fn restore_main(conf: &PageServerConf) { - // Create a new thread pool - let runtime = runtime::Runtime::new().unwrap(); - - runtime.block_on(async { - let result = restore_chunk(conf).await; - - match result { - Ok(_) => { - return; - } - Err(err) => { - error!("error: {}", err); - return; - } - } - }); -} - -async fn restore_chunk(conf: &PageServerConf) -> Result<(), FilePathError> { - let pgdata_base_path = env::var("PGDATA_BASE_PATH").unwrap(); - info!("Restoring from local dir..."); - - let sys_id: u64 = 42; - let control_lsn = 0; //TODO get it from sysid - let mut slurp_futures: Vec<_> = Vec::new(); - - for e in WalkDir::new(pgdata_base_path.clone()) { - let entry = e.unwrap(); - - if !entry.path().is_dir() { - let path = entry.path().to_str().unwrap(); - - let relpath = path - .strip_prefix(&format!("{}/", pgdata_base_path)) - .unwrap(); - info!( - "Restoring file {} relpath {}", - entry.path().display(), - relpath - ); - - let parsed = parse_rel_file_path(&relpath); - - match parsed { - Ok(mut p) => { - p.lsn = control_lsn; - - let f = slurp_base_file(conf, sys_id, path.to_string(), p); - - slurp_futures.push(f); - } - Err(e) => { - warn!("unrecognized file: {} ({})", relpath, e); - } - }; - } - } - - let pcache = page_cache::get_pagecache(conf, sys_id); - pcache.init_valid_lsn(control_lsn); - - info!("{} files to restore...", slurp_futures.len()); - - future::join_all(slurp_futures).await; - info!("restored!"); - Ok(()) -} - -#[derive(Debug)] -struct FilePathError { - msg: String, -} - -impl FilePathError { - fn new(msg: &str) -> FilePathError { - FilePathError { - msg: msg.to_string(), - } - } -} - -impl From for FilePathError { - fn from(e: core::num::ParseIntError) -> Self { - return FilePathError { - msg: 
format!("invalid filename: {}", e), - }; - } -} - -impl fmt::Display for FilePathError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "invalid filename") - } -} - -fn forkname_to_forknum(forkname: Option<&str>) -> Result { - match forkname { - // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(0), - Some("fsm") => Ok(1), - Some("vm") => Ok(2), - Some("init") => Ok(3), - Some(_) => Err(FilePathError::new("invalid forkname")), - } -} - -#[derive(Debug)] -struct ParsedBaseImageFileName { - pub spcnode: u32, - pub dbnode: u32, - pub relnode: u32, - pub forknum: u32, - pub segno: u32, - - pub lsn: u64, -} - -// formats: -// -// _ -// . -// _. -fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> { - let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); - - let caps = re - .captures(fname) - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - - let relnode_str = caps.name("relnode").unwrap().as_str(); - let relnode = u32::from_str_radix(relnode_str, 10)?; - - let forkname_match = caps.name("forkname"); - let forkname = if forkname_match.is_none() { - None - } else { - Some(forkname_match.unwrap().as_str()) - }; - let forknum = forkname_to_forknum(forkname)?; - - let segno_match = caps.name("segno"); - let segno = if segno_match.is_none() { - 0 - } else { - u32::from_str_radix(segno_match.unwrap().as_str(), 10)? - }; - return Ok((relnode, forknum, segno, 0)); -} - -fn parse_rel_file_path(path: &str) -> Result { - /* - * Relation data files can be in one of the following directories: - * - * global/ - * shared relations - * - * base// - * regular relations, default tablespace - * - * pg_tblspc/// - * within a non-default tablespace (the name of the directory - * depends on version) - * - * And the relation data files themselves have a filename like: - * - * . 
- */ - if let Some(fname) = path.strip_prefix("global/") { - if fname.contains("pg_control") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_CONTROLFILE_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - if fname.contains("pg_filenode") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_FILENODEMAP_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - let (relnode, forknum, segno, lsn) = parse_filename(fname)?; - - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - relnode, - forknum, - segno, - lsn, - }); - } else if let Some(dbpath) = path.strip_prefix("base/") { - let mut s = dbpath.split("/"); - let dbnode_str = s - .next() - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - let dbnode = u32::from_str_radix(dbnode_str, 10)?; - let fname = s - .next() - .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; - if s.next().is_some() { - return Err(FilePathError::new("invalid relation data file name")); - }; - - if fname.contains("pg_filenode") { - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode: dbnode, - relnode: 0, - forknum: pg_constants::PG_FILENODEMAP_FORKNUM, - segno: 0, - lsn: 0, - }); - } - - let (relnode, forknum, segno, lsn) = parse_filename(fname)?; - - return Ok(ParsedBaseImageFileName { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode, - relnode, - forknum, - segno, - lsn, - }); - } else if let Some(fname) = path.strip_prefix("pg_xact/") { - return Ok(ParsedBaseImageFileName { - spcnode: 0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_XACT_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(fname) = path.strip_prefix("pg_multixact/members/") { - return Ok(ParsedBaseImageFileName { - spcnode: 
0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(fname) = path.strip_prefix("pg_multixact/offsets/") { - return Ok(ParsedBaseImageFileName { - spcnode: 0, - dbnode: 0, - relnode: 0, - forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM, - segno: u32::from_str_radix(fname, 10).unwrap(), - lsn: 0, - }); - } else if let Some(_) = path.strip_prefix("pg_tblspc/") { - // TODO - return Err(FilePathError::new("tablespaces not supported")); - } else { - return Err(FilePathError::new("invalid relation data file name")); - } -} - -async fn slurp_base_file( - conf: &PageServerConf, - sys_id: u64, - file_path: String, - parsed: ParsedBaseImageFileName, -) { - info!("slurp_base_file local path {}", file_path); - - let mut data = fs::read(file_path).unwrap(); - - // pg_filenode.map has non-standard size - 512 bytes - // enlarge it to treat as a regular page - if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM { - data.resize(8192, 0); - } - - let data_bytes: &[u8] = &data; - let mut bytes = BytesMut::from(data_bytes).freeze(); - - // FIXME: use constants (BLCKSZ) - let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192); - - let pcache = page_cache::get_pagecache(conf, sys_id); - - while bytes.remaining() >= 8192 { - let tag = page_cache::BufferTag { - rel: page_cache::RelTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, - }, - blknum: blknum, - }; - - pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192)); - - blknum += 1; - } -} diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs new file mode 100644 index 0000000000..262479a556 --- /dev/null +++ b/pageserver/src/restore_local_repo.rs @@ -0,0 +1,490 @@ +// +// Restore chunks from local Zenith repository +// +// This runs once at Page Server startup. 
It loads all the "snapshots" and all +// WAL from all timelines from the local zenith repository into the in-memory page +// cache. +// +// This also initializes the "last valid LSN" in the page cache to the last LSN +// seen in the WAL, so that when the WAL receiver is started, it starts +// streaming from that LSN. +// + +use log::*; +use regex::Regex; +use std::fmt; + +use std::cmp::max; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::Read; +use std::io::Seek; +use std::io::SeekFrom; +use std::path::{Path, PathBuf}; + +use anyhow::Result; +use bytes::Bytes; + +use crate::page_cache; +use crate::page_cache::BufferTag; +use crate::page_cache::PageCache; +use crate::waldecoder::WalStreamDecoder; +use crate::PageServerConf; +use crate::ZTimelineId; + +// From pg_tablespace_d.h +// +// FIXME: we'll probably need these elsewhere too, move to some common location +const DEFAULTTABLESPACE_OID: u32 = 1663; +const GLOBALTABLESPACE_OID: u32 = 1664; + +// +// Load it all into the page cache. 
+// +pub fn restore_timeline( + conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, +) -> Result<()> { + let timelinepath = PathBuf::from("timelines").join(timeline.to_string()); + + if !timelinepath.exists() { + anyhow::bail!("timeline {} does not exist in the page server's repository"); + } + + // Scan .zenith/timelines//snapshots + let snapshotspath = PathBuf::from("timelines") + .join(timeline.to_string()) + .join("snapshots"); + + let mut last_snapshot_lsn: u64 = 0; + + for direntry in fs::read_dir(&snapshotspath).unwrap() { + let direntry = direntry?; + let filename = direntry.file_name().to_str().unwrap().to_owned(); + + let lsn = u64::from_str_radix(&filename, 16)?; + last_snapshot_lsn = max(lsn, last_snapshot_lsn); + + restore_snapshot(conf, pcache, timeline, &filename)?; + info!("restored snapshot at {}", filename); + } + + if last_snapshot_lsn == 0 { + error!( + "could not find valid snapshot in {}", + snapshotspath.display() + ); + // TODO return error? + } + pcache.init_valid_lsn(last_snapshot_lsn); + + restore_wal(conf, pcache, timeline, last_snapshot_lsn)?; + + Ok(()) +} + +pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result { + let snapshotspath = format!("timelines/{}/snapshots", timeline); + + let mut last_snapshot_lsn = 0; + for direntry in fs::read_dir(&snapshotspath).unwrap() { + let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned(); + + let lsn = u64::from_str_radix(&filename, 16)?; + last_snapshot_lsn = max(lsn, last_snapshot_lsn); + } + + if last_snapshot_lsn == 0 { + error!("could not find valid snapshot in {}", &snapshotspath); + // TODO return error? 
+ } + Ok(last_snapshot_lsn) +} + +fn restore_snapshot( + conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, + snapshot: &str, +) -> Result<()> { + let snapshotpath = PathBuf::from("timelines") + .join(timeline.to_string()) + .join("snapshots") + .join(snapshot); + + // Scan 'global' + for direntry in fs::read_dir(snapshotpath.join("global"))? { + let direntry = direntry?; + match direntry.file_name().to_str() { + None => continue, + + // These special files appear in the snapshot, but are not needed by the page server + Some("pg_control") => continue, + Some("pg_filenode.map") => continue, + + // Load any relation files into the page server + _ => restore_relfile( + conf, + pcache, + timeline, + snapshot, + GLOBALTABLESPACE_OID, + 0, + &direntry.path(), + )?, + } + } + + // Scan 'base'. It contains database dirs, the database OID is the filename. + // E.g. 'base/12345', where 12345 is the database OID. + for direntry in fs::read_dir(snapshotpath.join("base"))? { + let direntry = direntry?; + + let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?; + + for direntry in fs::read_dir(direntry.path())? { + let direntry = direntry?; + match direntry.file_name().to_str() { + None => continue, + + // These special files appear in the snapshot, but are not needed by the page server + Some("PG_VERSION") => continue, + Some("pg_filenode.map") => continue, + + // Load any relation files into the page server + _ => restore_relfile( + conf, + pcache, + timeline, + snapshot, + DEFAULTTABLESPACE_OID, + dboid, + &direntry.path(), + )?, + } + } + } + + // TODO: Scan pg_tblspc + + Ok(()) +} + +fn restore_relfile( + _conf: &PageServerConf, + pcache: &PageCache, + _timeline: ZTimelineId, + snapshot: &str, + spcoid: u32, + dboid: u32, + path: &Path, +) -> Result<()> { + let lsn = u64::from_str_radix(snapshot, 16)?; + + // Does it look like a relation file? 
+ + let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap()); + if p.is_err() { + let e = p.unwrap_err(); + warn!("unrecognized file in snapshot: {:?} ({})", path, e); + return Err(e)?; + } + let (relnode, forknum, segno) = p.unwrap(); + + let mut file = File::open(path)?; + let mut buf: [u8; 8192] = [0u8; 8192]; + + // FIXME: use constants (BLCKSZ) + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192); + loop { + let r = file.read_exact(&mut buf); + match r { + Ok(_) => { + let tag = page_cache::BufferTag { + spcnode: spcoid, + dbnode: dboid, + relnode: relnode, + forknum: forknum as u8, + blknum: blknum, + }; + pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf)); + /* + if oldest_lsn == 0 || p.lsn < oldest_lsn { + oldest_lsn = p.lsn; + } + */ + } + + // TODO: UnexpectedEof is expected + Err(e) => match e.kind() { + std::io::ErrorKind::UnexpectedEof => { + // reached EOF. That's expected. + // FIXME: maybe check that we read the full length of the file? + break; + } + _ => { + error!("error reading file: {:?} ({})", path, e); + break; + } + }, + }; + blknum += 1; + } + + let tag = page_cache::RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode: relnode, + forknum: forknum as u8, + }; + pcache.relsize_inc(&tag, blknum); + + Ok(()) +} + +// Scan WAL on a timeline, starting from gien LSN, and load all the records +// into the page cache. 
+fn restore_wal( + _conf: &PageServerConf, + pcache: &PageCache, + timeline: ZTimelineId, + startpoint: u64, +) -> Result<()> { + let walpath = format!("timelines/{}/wal", timeline); + + let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint)); + + let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024); + let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024); + let mut last_lsn = 0; + loop { + // FIXME: assume postgresql tli 1 for now + let filename = XLogFileName(1, segno, 16 * 1024 * 1024); + let mut path = walpath.clone() + "/" + &filename; + + // It could be as .partial + if !PathBuf::from(&path).exists() { + path = path + ".partial"; + } + + // Slurp the WAL file + let open_result = File::open(&path); + if let Err(e) = open_result { + if e.kind() == std::io::ErrorKind::NotFound { + break; + } + return Err(e)?; + } + let mut file = open_result.unwrap(); + + if offset > 0 { + file.seek(SeekFrom::Start(offset as u64))?; + } + + let mut buf = Vec::new(); + let nread = file.read_to_end(&mut buf)?; + if nread != 16 * 1024 * 1024 - offset as usize { + // Maybe allow this for .partial files? + error!("read only {} bytes from WAL file", nread); + } + waldecoder.feed_bytes(&buf); + + let mut nrecords = 0; + loop { + let rec = waldecoder.poll_decode(); + if rec.is_err() { + // Assume that an error means we've reached the end of + // a partial WAL record. So that's ok. + break; + } + if let Some((lsn, recdata)) = rec.unwrap() { + let decoded = crate::waldecoder::decode_wal_record(recdata.clone()); + + // Put the WAL record to the page cache. We make a separate copy of + // it for every block it modifies. 
(The actual WAL record is kept in + // a Bytes, which uses a reference counter for the underlying buffer, + // so having multiple copies of it doesn't cost that much) + for blk in decoded.blocks.iter() { + let tag = BufferTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum as u8, + blknum: blk.blkno, + }; + + let rec = page_cache::WALRecord { + lsn: lsn, + will_init: blk.will_init || blk.apply_image, + rec: recdata.clone(), + }; + + pcache.put_wal_record(tag, rec); + } + + // Now that this record has been handled, let the page cache know that + // it is up-to-date to this LSN + pcache.advance_last_valid_lsn(lsn); + last_lsn = lsn; + } else { + break; + } + nrecords += 1; + } + + info!("restored {} records from WAL file {}", nrecords, filename); + + segno += 1; + offset = 0; + } + info!( + "reached end of WAL at {:X}/{:X}", + last_lsn >> 32, + last_lsn & 0xffffffff + ); + + Ok(()) +} + +// FIXME: copied from xlog_utils.rs +pub const XLOG_FNAME_LEN: usize = 24; +pub type XLogRecPtr = u64; +pub type XLogSegNo = u64; +pub type TimeLineID = u32; + +#[allow(non_snake_case)] +pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 { + return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); +} + +#[allow(non_snake_case)] +pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { + return xlogptr / wal_segsz_bytes as u64; +} + +#[allow(non_snake_case)] +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + return format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ); +} + +#[allow(non_snake_case)] +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; +} + +#[allow(non_snake_case)] +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, 
TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); +} + +#[allow(non_snake_case)] +pub fn IsXLogFileName(fname: &str) -> bool { + return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); +} + +#[allow(non_snake_case)] +pub fn IsPartialXLogFileName(fname: &str) -> bool { + if let Some(basefname) = fname.strip_suffix(".partial") { + IsXLogFileName(basefname) + } else { + false + } +} + +#[derive(Debug, Clone)] +struct FilePathError { + msg: String, +} + +impl Error for FilePathError { + fn description(&self) -> &str { + &self.msg + } +} +impl FilePathError { + fn new(msg: &str) -> FilePathError { + FilePathError { + msg: msg.to_string(), + } + } +} + +impl From for FilePathError { + fn from(e: core::num::ParseIntError) -> Self { + return FilePathError { + msg: format!("invalid filename: {}", e), + }; + } +} + +impl fmt::Display for FilePathError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "invalid filename") + } +} + +fn forkname_to_forknum(forkname: Option<&str>) -> Result { + match forkname { + // "main" is not in filenames, it's implicit if the fork name is not present + None => Ok(0), + Some("fsm") => Ok(1), + Some("vm") => Ok(2), + Some("init") => Ok(3), + Some(_) => Err(FilePathError::new("invalid forkname")), + } +} + +#[derive(Debug)] +struct ParsedBaseImageFileName { + pub spcnode: u32, + pub dbnode: u32, + pub relnode: u32, + pub forknum: u32, + pub segno: u32, + + pub lsn: u64, +} + +// formats: +// +// _ +// . +// _. 
+ +fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> { + let re = Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); + + let caps = re + .captures(fname) + .ok_or_else(|| FilePathError::new("invalid relation data file name"))?; + + let relnode_str = caps.name("relnode").unwrap().as_str(); + let relnode = u32::from_str_radix(relnode_str, 10)?; + + let forkname_match = caps.name("forkname"); + let forkname = if forkname_match.is_none() { + None + } else { + Some(forkname_match.unwrap().as_str()) + }; + let forknum = forkname_to_forknum(forkname)?; + + let segno_match = caps.name("segno"); + let segno = if segno_match.is_none() { + 0 + } else { + u32::from_str_radix(segno_match.unwrap().as_str(), 10)? + }; + + return Ok((relnode, forknum, segno)); +} diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index 33d8f09693..40a2ac7a43 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -1,14 +1,7 @@ -//#![allow(non_upper_case_globals)] -//#![allow(non_camel_case_types)] -//#![allow(non_snake_case)] -//#![allow(dead_code)] -//include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use std::cmp::min; - use log::*; +use std::cmp::min; +use thiserror::Error; const XLOG_BLCKSZ: u32 = 8192; @@ -19,7 +12,7 @@ const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; #[repr(C)] #[derive(Debug)] -struct XLogPageHeaderData { +pub struct XLogPageHeaderData { xlp_magic: u16, /* magic value for correctness checks */ xlp_info: u16, /* flag bits, see below */ xlp_tli: u32, /* TimeLineID of first record on page */ @@ -33,7 +26,7 @@ const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4; #[repr(C)] #[derive(Debug)] -struct XLogLongPageHeaderData { +pub struct XLogLongPageHeaderData { std: XLogPageHeaderData, /* standard header fields */ xlp_sysid: u64, /* system identifier from pg_control */ xlp_seg_size: u32, /* just as a cross-check */ @@ -57,6 +50,13 @@ pub struct 
WalStreamDecoder { recordbuf: BytesMut, } +#[derive(Error, Debug, Clone)] +#[error("{msg} at {lsn}")] +pub struct WalDecodeError { + msg: String, + lsn: u64, +} + // // WalRecordStream is a Stream that returns a stream of WAL records // FIXME: This isn't a proper rust stream @@ -79,40 +79,56 @@ impl WalStreamDecoder { self.inputbuf.extend_from_slice(buf); } - // Returns a tuple: - // (end LSN, record) - pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> { + /// Attempt to decode another WAL record from the input that has been fed to the + /// decoder so far. + /// + /// Returns one of the following: + /// Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself + /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function + /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. + /// + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { loop { // parse and verify page boundaries as we go if self.lsn % WAL_SEGMENT_SIZE == 0 { // parse long header if self.inputbuf.remaining() < SizeOfXLogLongPHD { - return None; + return Ok(None); } - self.decode_XLogLongPageHeaderData(); + let hdr = self.decode_XLogLongPageHeaderData(); + if hdr.std.xlp_pageaddr != self.lsn { + return Err(WalDecodeError { + msg: "invalid xlog segment header".into(), + lsn: self.lsn, + }); + } + // TODO: verify the remaining fields in the header + self.lsn += SizeOfXLogLongPHD as u64; - - // TODO: verify the fields in the header - continue; } else if self.lsn % (XLOG_BLCKSZ as u64) == 0 { // parse page header if self.inputbuf.remaining() < SizeOfXLogShortPHD { - return None; + return Ok(None); } - self.decode_XLogPageHeaderData(); + let hdr = self.decode_XLogPageHeaderData(); + if hdr.xlp_pageaddr != self.lsn { + return Err(WalDecodeError { + msg: "invalid xlog page header".into(), + lsn: self.lsn, + }); + } + // TODO: verify the remaining fields in the header + self.lsn += 
SizeOfXLogShortPHD as u64; - - // TODO: verify the fields in the header - continue; } else if self.padlen > 0 { if self.inputbuf.remaining() < self.padlen as usize { - return None; + return Ok(None); } // skip padding @@ -123,20 +139,17 @@ impl WalStreamDecoder { // need to have at least the xl_tot_len field if self.inputbuf.remaining() < 4 { - return None; + return Ok(None); } // read xl_tot_len FIXME: assumes little-endian self.startlsn = self.lsn; let xl_tot_len = self.inputbuf.get_u32_le(); if xl_tot_len < SizeOfXLogRecord { - error!( - "invalid xl_tot_len {} at {:X}/{:X}", - xl_tot_len, - self.lsn >> 32, - self.lsn & 0xffffffff - ); - panic!(); + return Err(WalDecodeError { + msg: format!("invalid xl_tot_len {}", xl_tot_len), + lsn: self.lsn, + }); } self.lsn += 4; @@ -154,7 +167,7 @@ impl WalStreamDecoder { let n = min(self.contlen, pageleft) as usize; if self.inputbuf.remaining() < n { - return None; + return Ok(None); } self.recordbuf.put(self.inputbuf.split_to(n)); @@ -182,7 +195,7 @@ impl WalStreamDecoder { } let result = (self.lsn, recordbuf); - return Some(result); + return Ok(Some(result)); } continue; } @@ -289,7 +302,6 @@ pub struct DecodedBkpBlock { const SizeOfXLogRecord: u32 = 24; pub struct DecodedWALRecord { - pub lsn: u64, // LSN at the *end* of the record pub xl_info: u8, pub xl_rmid: u8, pub record: Bytes, // raw XLogRecord @@ -364,14 +376,7 @@ pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate { // // Routines to decode a WAL record and figure out which blocks are modified // -pub fn decode_wal_record(lsn: u64, record: Bytes) -> DecodedWALRecord { - trace!( - "decoding record with LSN {:08X}/{:08X} ({} bytes)", - lsn >> 32, - lsn & 0xffff_ffff, - record.remaining() - ); - +pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { let mut buf = record.clone(); // FIXME: assume little-endian here @@ -627,7 +632,6 @@ pub fn decode_wal_record(lsn: u64, record: Bytes) -> DecodedWALRecord { // Since we don't care about 
the data payloads here, we're done. return DecodedWALRecord { - lsn, xl_info, xl_rmid, record, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 692d7f466d..e483b27005 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -10,22 +10,87 @@ use crate::page_cache; use crate::page_cache::{BufferTag, RelTag}; use crate::waldecoder::*; use crate::PageServerConf; +use crate::ZTimelineId; use anyhow::Error; +use lazy_static::lazy_static; use log::*; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; +use std::collections::HashMap; +use std::fs; +use std::fs::{File, OpenOptions}; +use std::io::{Seek, SeekFrom, Write}; +use std::path::PathBuf; use std::str::FromStr; +use std::sync::Mutex; +use std::thread; use tokio::runtime; use tokio::time::{sleep, Duration}; use tokio_postgres::replication::{PgTimestamp, ReplicationStream}; use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow}; use tokio_stream::StreamExt; +// +// We keep one WAL Receiver active per timeline. +// +struct WalReceiverEntry { + wal_producer_connstr: String, +} + +lazy_static! 
{ + static ref WAL_RECEIVERS: Mutex> = + Mutex::new(HashMap::new()); +} + +// Launch a new WAL receiver, or tell one that's running about change in connection string +pub fn launch_wal_receiver( + conf: &PageServerConf, + timelineid: ZTimelineId, + wal_producer_connstr: &str, +) { + let mut receivers = WAL_RECEIVERS.lock().unwrap(); + + match receivers.get_mut(&timelineid) { + Some(receiver) => { + receiver.wal_producer_connstr = wal_producer_connstr.into(); + } + None => { + let receiver = WalReceiverEntry { + wal_producer_connstr: wal_producer_connstr.into(), + }; + receivers.insert(timelineid, receiver); + + // Also launch a new thread to handle this connection + let conf_copy = conf.clone(); + let _walreceiver_thread = thread::Builder::new() + .name("WAL receiver thread".into()) + .spawn(move || { + thread_main(&conf_copy, timelineid); + }) + .unwrap(); + } + }; +} + +// Look up current WAL producer connection string in the hash table +fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String { + let receivers = WAL_RECEIVERS.lock().unwrap(); + + receivers + .get(&timelineid) + .unwrap() + .wal_producer_connstr + .clone() +} + // // This is the entry point for the WAL receiver thread. 
// -pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { - info!("WAL receiver thread started: '{}'", wal_producer_connstr); +fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { + info!( + "WAL receiver thread started for timeline : '{}'", + timelineid + ); let runtime = runtime::Builder::new_current_thread() .enable_all() @@ -34,7 +99,10 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { runtime.block_on(async { loop { - let res = walreceiver_main(conf, wal_producer_connstr).await; + // Look up the current WAL producer address + let wal_producer_connstr = get_wal_producer_connstr(timelineid); + + let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await; if let Err(e) = res { info!( @@ -47,7 +115,11 @@ pub fn thread_main(conf: &PageServerConf, wal_producer_connstr: &str) { }); } -async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> Result<(), Error> { +async fn walreceiver_main( + conf: &PageServerConf, + timelineid: ZTimelineId, + wal_producer_connstr: &str, +) -> Result<(), Error> { // Connect to the database in replication mode. info!("connecting to {:?}", wal_producer_connstr); let connect_cfg = format!("{} replication=true", wal_producer_connstr); @@ -67,7 +139,7 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> let end_of_wal = u64::from(identify.xlogpos); let mut caught_up = false; - let pcache = page_cache::get_pagecache(conf, identify.systemid); + let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap(); // // Start streaming the WAL, from where we left off previously. 
@@ -95,9 +167,10 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> } } debug!( - "starting replication from {:X}/{:X}, server is at {:X}/{:X}...", + "starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...", (startpoint >> 32), (startpoint & 0xffffffff), + timelineid, (end_of_wal >> 32), (end_of_wal & 0xffffffff) ); @@ -120,6 +193,13 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> let startlsn = xlog_data.wal_start(); let endlsn = startlsn + data.len() as u64; + write_wal_file( + startlsn, + timelineid, + 16 * 1024 * 1024, // FIXME + data, + )?; + trace!( "received XLogData between {:X}/{:X} and {:X}/{:X}", (startlsn >> 32), @@ -131,8 +211,8 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> waldecoder.feed_bytes(data); loop { - if let Some((lsn, recdata)) = waldecoder.poll_decode() { - let decoded = decode_wal_record(startlsn, recdata.clone()); + if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let decoded = decode_wal_record(recdata.clone()); // Put the WAL record to the page cache. We make a separate copy of // it for every block it modifies. 
(The actual WAL record is kept in @@ -184,7 +264,7 @@ async fn walreceiver_main(conf: &PageServerConf, wal_producer_connstr: &str) -> } // Now that this record has been handled, let the page cache know that // it is up-to-date to this LSN - pcache.advance_last_valid_lsn(lsn); + pcache.advance_last_record_lsn(lsn); } else { break; } @@ -286,3 +366,152 @@ pub async fn identify_system(client: &tokio_postgres::Client) -> Result u32 { + return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1); +} + +#[allow(non_snake_case)] +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo; +} + +#[allow(non_snake_case)] +pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo { + return xlogptr / wal_segsz_bytes as u64; +} + +#[allow(non_snake_case)] +pub fn XLogSegNoOffsetToRecPtr( + segno: XLogSegNo, + offset: u32, + wal_segsz_bytes: usize, +) -> XLogRecPtr { + return segno * (wal_segsz_bytes as u64) + (offset as u64); +} + +#[allow(non_snake_case)] +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + return format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ); +} + +#[allow(non_snake_case)] +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli); +} + +fn write_wal_file( + startpos: XLogRecPtr, + timeline: ZTimelineId, + wal_seg_size: usize, + buf: &[u8], +) -> anyhow::Result<()> { + let mut bytes_left: usize = buf.len(); + let mut bytes_written: usize = 0; + let mut partial; + let mut start_pos = startpos; + const ZERO_BLOCK: &'static [u8] = 
&[0u8; XLOG_BLCKSZ]; + + let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline)); + + /* Extract WAL location for this block */ + let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize; + + while bytes_left != 0 { + let bytes_to_write; + + /* + * If crossing a WAL boundary, only write up until we reach wal + * segment size. + */ + if xlogoff + bytes_left > wal_seg_size { + bytes_to_write = wal_seg_size - xlogoff; + } else { + bytes_to_write = bytes_left; + } + + /* Open file */ + let segno = XLByteToSeg(start_pos, wal_seg_size); + let wal_file_name = XLogFileName( + 1, // FIXME: always use Postgres timeline 1 + segno, + wal_seg_size, + ); + let wal_file_path = wal_dir.join(wal_file_name.clone()); + let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial"); + + { + let mut wal_file: File; + /* Try to open already completed segment */ + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + wal_file = file; + partial = false; + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { + /* Try to open existed partial file */ + wal_file = file; + partial = true; + } else { + /* Create and fill new partial file */ + partial = true; + match OpenOptions::new() + .create(true) + .write(true) + .open(&wal_file_partial_path) + { + Ok(mut file) => { + for _ in 0..(wal_seg_size / XLOG_BLCKSZ) { + file.write_all(&ZERO_BLOCK)?; + } + wal_file = file; + } + Err(e) => { + error!("Failed to open log file {:?}: {}", &wal_file_path, e); + return Err(e.into()); + } + } + } + wal_file.seek(SeekFrom::Start(xlogoff as u64))?; + wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; + + // FIXME: Flush the file + //wal_file.sync_all()?; + } + /* Write was successful, advance our position */ + bytes_written += bytes_to_write; + bytes_left -= bytes_to_write; + start_pos += bytes_to_write as u64; + xlogoff += bytes_to_write; + + /* Did we reach the end of a WAL segment? 
*/ + if XLogSegmentOffset(start_pos, wal_seg_size) == 0 { + xlogoff = 0; + if partial { + fs::rename(&wal_file_partial_path, &wal_file_path)?; + } + } + } + Ok(()) +} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b80af79862..400c8c59da 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,10 +21,10 @@ use std::fs; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::Error; +use std::process::Stdio; use std::sync::Arc; use std::time::Duration; use std::time::Instant; -use std::{path::PathBuf, process::Stdio}; use tokio::io::AsyncBufReadExt; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::process::{Child, ChildStdin, ChildStdout, Command}; @@ -36,6 +36,7 @@ use bytes::{BufMut, Bytes, BytesMut}; use crate::page_cache; use crate::page_cache::CacheEntry; use crate::page_cache::WALRecord; +use crate::ZTimelineId; use crate::{page_cache::BufferTag, PageServerConf}; static TIMEOUT: Duration = Duration::from_secs(20); @@ -43,8 +44,8 @@ static TIMEOUT: Duration = Duration::from_secs(20); // // Main entry point for the WAL applicator thread. // -pub fn wal_redo_main(conf: &PageServerConf, sys_id: u64) { - info!("WAL redo thread started {}", sys_id); +pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) { + info!("WAL redo thread started {}", timelineid); // We block on waiting for requests on the walredo request channel, but // use async I/O to communicate with the child process. Initialize the @@ -54,15 +55,15 @@ pub fn wal_redo_main(conf: &PageServerConf, sys_id: u64) { .build() .unwrap(); - let pcache = page_cache::get_pagecache(conf, sys_id); + let pcache = page_cache::get_pagecache(conf, timelineid).unwrap(); // Loop forever, handling requests as they come. 
let walredo_channel_receiver = &pcache.walredo_receiver; loop { let mut process: WalRedoProcess; - let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id)); + let datadir = format!("wal-redo/{}", timelineid); - info!("launching WAL redo postgres process {}", sys_id); + info!("launching WAL redo postgres process {}", timelineid); { let _guard = runtime.enter(); process = WalRedoProcess::launch(&datadir, &runtime).unwrap(); @@ -148,13 +149,13 @@ impl WalRedoProcess { // Tests who run pageserver binary are setting proper PG_BIN_DIR // and PG_LIB_DIR so that WalRedo would start right postgres. We may later // switch to setting same things in pageserver config file. - fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result { + fn launch(datadir: &str, runtime: &Runtime) -> Result { // Create empty data directory for wal-redo postgres deleting old one. - fs::remove_dir_all(datadir.to_str().unwrap()).ok(); + fs::remove_dir_all(datadir).ok(); let initdb = runtime .block_on( Command::new("initdb") - .args(&["-D", datadir.to_str().unwrap()]) + .args(&["-D", datadir]) .arg("-N") .output(), ) @@ -180,14 +181,11 @@ impl WalRedoProcess { .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) - .env("PGDATA", datadir.to_str().unwrap()) + .env("PGDATA", datadir) .spawn() .expect("postgres --wal-redo command failed to start"); - info!( - "launched WAL redo postgres process on {}", - datadir.to_str().unwrap() - ); + info!("launched WAL redo postgres process on {}", datadir); let stdin = child.stdin.take().expect("failed to open child's stdin"); let stderr = child.stderr.take().expect("failed to open child's stderr"); diff --git a/postgres_ffi/Cargo.toml b/postgres_ffi/Cargo.toml new file mode 100644 index 0000000000..77cc5cf028 --- /dev/null +++ b/postgres_ffi/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "postgres_ffi" +version = "0.1.0" +authors = ["Heikki Linnakangas "] +edition = "2018" + +# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4.19" +rand = "0.8.3" +bytes = "1.0.1" +byteorder = "1.4.3" +anyhow = "1.0" +crc32c = "0.6.0" +hex = "0.4.3" + +[build-dependencies] +bindgen = "0.53.1" diff --git a/postgres_ffi/build.rs b/postgres_ffi/build.rs new file mode 100644 index 0000000000..dc3e1509c0 --- /dev/null +++ b/postgres_ffi/build.rs @@ -0,0 +1,42 @@ +extern crate bindgen; + +use std::env; +use std::path::PathBuf; + +fn main() { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed=pg_control_ffi.h"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header("pg_control_ffi.h") + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + .whitelist_type("ControlFileData") + .whitelist_var("PG_CONTROL_FILE_SIZE") + .whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") + .whitelist_type("DBState") + // Path the server include dir. It is in tmp_install/include/server, if you did + // "configure --prefix=". But if you used "configure --prefix=/", + // and used DESTDIR to move it into tmp_install, then it's in + // tmp_install/include/postgres/server (that's how the pgbuild.sh script does it). + // 'pg_config --includedir-server' would perhaps be the more proper way to find it, + // but this will do for now. + .clang_arg("-I../tmp_install/include/server") + .clang_arg("-I../tmp_install/include/postgresql/server") + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. 
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/postgres_ffi/pg_control_ffi.h b/postgres_ffi/pg_control_ffi.h new file mode 100644 index 0000000000..169e66977b --- /dev/null +++ b/postgres_ffi/pg_control_ffi.h @@ -0,0 +1,4 @@ +#include "c.h" +#include "catalog/pg_control.h" + +const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc); diff --git a/postgres_ffi/src/lib.rs b/postgres_ffi/src/lib.rs new file mode 100644 index 0000000000..b6cf6bdb2b --- /dev/null +++ b/postgres_ffi/src/lib.rs @@ -0,0 +1,67 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +use bytes::{Buf, Bytes, BytesMut}; + +// sizeof(ControlFileData) +const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize; + +impl ControlFileData { + // Initialize an all-zeros ControlFileData struct + pub fn new() -> ControlFileData { + let controlfile: ControlFileData; + + let b = [0u8; SIZEOF_CONTROLDATA]; + controlfile = + unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) }; + + return controlfile; + } +} + +pub fn decode_pg_control(buf: Bytes) -> Result { + let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA]; + buf.clone().copy_to_slice(&mut b); + + let controlfile: ControlFileData; + + // TODO: verify CRC + let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC]; + data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]); + let expectedcrc = crc32c::crc32c(&data_without_crc); + + controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) }; + + if expectedcrc != controlfile.crc { + anyhow::bail!( + "invalid CRC in control file: expected {:08X}, was {:08X}", + expectedcrc, + controlfile.crc + ); + } + + Ok(controlfile) 
+} + +pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes { + let b: [u8; SIZEOF_CONTROLDATA]; + + b = unsafe { std::mem::transmute::(controlfile) }; + + // Recompute the CRC + let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC]; + data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]); + let newcrc = crc32c::crc32c(&data_without_crc); + + let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize); + + buf.extend_from_slice(&b[0..OFFSETOF_CRC]); + buf.extend_from_slice(&newcrc.to_ne_bytes()); + // Fill the rest of the control file with zeros. + buf.resize(PG_CONTROL_FILE_SIZE as usize, 0); + + return buf.into(); +} diff --git a/vendor/postgres b/vendor/postgres index 9f9aa9c300..b898ad7e3b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9f9aa9c300c9bbac296e2c126b3f96701d4e683d +Subproject commit b898ad7e3b9acce72b64bf064257e392f979a659 diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index 98c63c434f..27498ee293 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -34,3 +34,6 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } anyhow = "1.0" crc32c = "0.6.0" + +# FIXME: 'pageserver' is needed for ZTimelineId. 
Refactor +pageserver = { path = "../pageserver" } diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/wal_acceptor.rs index d50467ba49..8dfa31e23b 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/wal_acceptor.rs @@ -9,6 +9,7 @@ use std::path::PathBuf; use std::thread; use std::{fs::File, fs::OpenOptions}; +use anyhow::Result; use clap::{App, Arg}; use slog::Drain; @@ -16,7 +17,7 @@ use slog::Drain; use walkeeper::wal_service; use walkeeper::WalAcceptorConf; -fn main() -> Result<(), io::Error> { +fn main() -> Result<()> { let arg_matches = App::new("Zenith wal_acceptor") .about("Store WAL stream to local file system and push it to WAL receivers") .arg( @@ -26,6 +27,13 @@ fn main() -> Result<(), io::Error> { .takes_value(true) .help("Path to the WAL acceptor data directory"), ) + .arg( + Arg::with_name("systemid") + .long("systemid") + .takes_value(true) + .required(true) + .help("PostgreSQL system id, from pg_control"), + ) .arg( Arg::with_name("listen") .short("l") @@ -56,16 +64,23 @@ fn main() -> Result<(), io::Error> { ) .get_matches(); + let systemid_str = arg_matches.value_of("systemid").unwrap(); + let systemid: u64 = systemid_str.parse()?; + let mut conf = WalAcceptorConf { data_dir: PathBuf::from("./"), + systemid: systemid, daemonize: false, no_sync: false, pageserver_addr: None, - listen_addr: "127.0.0.1:5454".parse().unwrap(), + listen_addr: "127.0.0.1:5454".parse()?, }; if let Some(dir) = arg_matches.value_of("datadir") { conf.data_dir = PathBuf::from(dir); + + // change into the data directory. 
+ std::env::set_current_dir(&conf.data_dir)?; } if arg_matches.is_present("no-sync") { @@ -87,7 +102,7 @@ fn main() -> Result<(), io::Error> { start_wal_acceptor(conf) } -fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { +fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { // Initialize logger let _scope_guard = init_logging(&conf)?; let _log_guard = slog_stdlog::init().unwrap(); @@ -98,20 +113,20 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> { info!("daemonizing..."); // There should'n be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fpritf's or backtraces. + // that we will see any accidental manual fprintf's or backtraces. let stdout = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("wal_acceptor.log")) + .open("wal_acceptor.log") .unwrap(); let stderr = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("wal_acceptor.log")) + .open("wal_acceptor.log") .unwrap(); let daemonize = Daemonize::new() - .pid_file(conf.data_dir.join("wal_acceptor.pid")) + .pid_file("wal_acceptor.pid") .working_directory(Path::new(".")) .stdout(stdout) .stderr(stderr); diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 7e890cf98a..784ab730b6 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -6,9 +6,12 @@ mod pq_protocol; pub mod wal_service; pub mod xlog_utils; +use crate::pq_protocol::SystemId; + #[derive(Debug, Clone)] pub struct WalAcceptorConf { pub data_dir: PathBuf, + pub systemid: SystemId, pub daemonize: bool, pub no_sync: bool, pub listen_addr: SocketAddr, diff --git a/walkeeper/src/pq_protocol.rs b/walkeeper/src/pq_protocol.rs index 286d563b73..f6e18d9aa4 100644 --- a/walkeeper/src/pq_protocol.rs +++ b/walkeeper/src/pq_protocol.rs @@ -1,7 +1,9 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use pageserver::ZTimelineId; use std::io; use std::str; +use 
std::str::FromStr; pub type Oid = u32; pub type SystemId = u64; @@ -37,7 +39,7 @@ pub enum BeMessage<'a> { pub struct FeStartupMessage { pub version: u32, pub kind: StartupRequestCode, - pub system_id: SystemId, + pub timelineid: ZTimelineId, } #[derive(Debug)] @@ -83,26 +85,33 @@ impl FeStartupMessage { let params_str = str::from_utf8(¶ms_bytes).unwrap(); let params = params_str.split('\0'); let mut options = false; - let mut system_id: u64 = 0; + let mut timelineid: Option = None; for p in params { if p == "options" { options = true; } else if options { for opt in p.split(' ') { - if opt.starts_with("system.id=") { - system_id = opt[10..].parse::().unwrap(); + if opt.starts_with("ztimelineid=") { + // FIXME: rethrow parsing error, don't unwrap + timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap()); break; } } break; } } + if timelineid.is_none() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "timelineid is required", + )); + } buf.advance(len as usize); Ok(Some(FeMessage::StartupMessage(FeStartupMessage { version, kind, - system_id, + timelineid: timelineid.unwrap(), }))) } } @@ -146,20 +155,20 @@ impl<'a> BeMessage<'a> { BeMessage::RowDescription(rows) => { buf.put_u8(b'T'); - let total_len: u32 = rows - .iter() - .fold(0, |acc, row| acc + row.name.len() as u32 + 3 * (4 + 2)); - buf.put_u32(4 + 2 + total_len); + + let mut body = BytesMut::new(); + body.put_i16(rows.len() as i16); // # of fields for row in rows.iter() { - buf.put_i16(row.name.len() as i16); - buf.put_slice(row.name); - buf.put_i32(0); /* table oid */ - buf.put_i16(0); /* attnum */ - buf.put_u32(row.typoid); - buf.put_i16(row.typlen); - buf.put_i32(-1); /* typmod */ - buf.put_i16(0); /* format code */ + body.put_slice(row.name); + body.put_i32(0); /* table oid */ + body.put_i16(0); /* attnum */ + body.put_u32(row.typoid); + body.put_i16(row.typlen); + body.put_i32(-1); /* typmod */ + body.put_i16(0); /* format code */ } + buf.put_i32((4 + body.len()) as i32); // # of 
bytes, including len field itself + buf.put(body); } BeMessage::DataRow(vals) => { diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index 5570781123..74e0f1d3b7 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -33,6 +33,7 @@ use tokio_postgres::{connect, Error, NoTls}; use crate::pq_protocol::*; use crate::xlog_utils::*; use crate::WalAcceptorConf; +use pageserver::ZTimelineId; type FullTransactionId = u64; @@ -64,7 +65,8 @@ struct ServerInfo { protocol_version: u32, /* proxy-safekeeper protocol version */ pg_version: u32, /* Postgres server version */ node_id: NodeId, - system_id: SystemId, /* Postgres system identifier */ + system_id: SystemId, + timeline_id: ZTimelineId, /* Zenith timelineid */ wal_end: XLogRecPtr, timeline: TimeLineID, wal_seg_size: u32, @@ -146,8 +148,8 @@ struct SharedState { * Database instance (tenant) */ #[derive(Debug)] -pub struct System { - id: SystemId, +pub struct Timeline { + timelineid: ZTimelineId, mutex: Mutex, cond: Notify, /* conditional variable used to notify wal senders */ } @@ -157,7 +159,7 @@ pub struct System { */ #[derive(Debug)] struct Connection { - system: Option>, + timeline: Option>, stream: TcpStream, /* Postgres connection */ inbuf: BytesMut, /* input buffer */ outbuf: BytesMut, /* output buffer */ @@ -211,6 +213,7 @@ impl Serializer for ServerInfo { buf.put_u32_le(self.pg_version); self.node_id.pack(buf); buf.put_u64_le(self.system_id); + buf.put_slice(&self.timeline_id.as_arr()); buf.put_u64_le(self.wal_end); buf.put_u32_le(self.timeline); buf.put_u32_le(self.wal_seg_size); @@ -221,6 +224,7 @@ impl Serializer for ServerInfo { pg_version: buf.get_u32_le(), node_id: NodeId::unpack(buf), system_id: buf.get_u64_le(), + timeline_id: ZTimelineId::get_from_buf(buf), wal_end: buf.get_u64_le(), timeline: buf.get_u32_le(), wal_seg_size: buf.get_u32_le(), @@ -278,6 +282,7 @@ impl SafeKeeperInfo { pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ node_id: 
NodeId { term: 0, uuid: 0 }, system_id: 0, /* Postgres system identifier */ + timeline_id: ZTimelineId::from([0u8; 16]), wal_end: 0, timeline: 0, wal_seg_size: 0, @@ -349,7 +354,8 @@ impl Serializer for SafeKeeperResponse { } lazy_static! { - pub static ref SYSTEMS: Mutex>> = Mutex::new(HashMap::new()); + pub static ref TIMELINES: Mutex>> = + Mutex::new(HashMap::new()); } pub fn thread_main(conf: WalAcceptorConf) { @@ -366,7 +372,7 @@ pub fn thread_main(conf: WalAcceptorConf) { info!("Starting wal acceptor on {}", conf.listen_addr); runtime.block_on(async { - let _unused = main_loop(&conf).await; + main_loop(&conf).await.unwrap(); }); } @@ -389,8 +395,8 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> { } } -impl System { - pub fn new(id: SystemId) -> System { +impl Timeline { + pub fn new(timelineid: ZTimelineId) -> Timeline { let shared_state = SharedState { commit_lsn: 0, info: SafeKeeperInfo::new(), @@ -401,8 +407,8 @@ impl System { catalog_xmin: u64::MAX, }, }; - System { - id, + Timeline { + timelineid, mutex: Mutex::new(shared_state), cond: Notify::new(), } @@ -443,12 +449,23 @@ impl System { return shared_state.hs_feedback; } - // Load and lock control file (prevent running more than one instance of safekeeper - fn load_control_file(&self, conf: &WalAcceptorConf) { + // Load and lock control file (prevent running more than one instance of safekeeper) + fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> { + let mut shared_state = self.mutex.lock().unwrap(); + + if shared_state.control_file.is_some() { + info!( + "control file for timeline {} is already open", + self.timelineid + ); + return Ok(()); + } + let control_file_path = conf .data_dir - .join(self.id.to_string()) + .join(self.timelineid.to_string()) .join(CONTROL_FILE_NAME); + info!("loading control file {}", control_file_path.display()); match OpenOptions::new() .read(true) .write(true) @@ -460,13 +477,13 @@ impl System { match file.try_lock_exclusive() { Ok(()) => {} 
Err(e) => { - panic!( + io_error!( "Control file {:?} is locked by some other process: {}", - &control_file_path, e + &control_file_path, + e ); } } - let mut shared_state = self.mutex.lock().unwrap(); shared_state.control_file = Some(file); const SIZE: usize = mem::size_of::(); @@ -483,12 +500,13 @@ impl System { let my_info = SafeKeeperInfo::unpack(&mut input); if my_info.magic != SK_MAGIC { - panic!("Invalid control file magic: {}", my_info.magic); + io_error!("Invalid control file magic: {}", my_info.magic); } if my_info.format_version != SK_FORMAT_VERSION { - panic!( + io_error!( "Incompatible format version: {} vs. {}", - my_info.format_version, SK_FORMAT_VERSION + my_info.format_version, + SK_FORMAT_VERSION ); } shared_state.info = my_info; @@ -501,6 +519,7 @@ impl System { ); } } + Ok(()) } fn save_control_file(&self, sync: bool) -> Result<()> { @@ -521,7 +540,7 @@ impl System { impl Connection { pub fn new(socket: TcpStream, conf: &WalAcceptorConf) -> Connection { Connection { - system: None, + timeline: None, stream: socket, inbuf: BytesMut::with_capacity(10 * 1024), outbuf: BytesMut::with_capacity(10 * 1024), @@ -530,8 +549,8 @@ impl Connection { } } - fn system(&self) -> Arc { - self.system.as_ref().unwrap().clone() + fn timeline(&self) -> Arc { + self.timeline.as_ref().unwrap().clone() } async fn run(&mut self) -> Result<()> { @@ -563,10 +582,15 @@ impl Connection { "no_user", ); let callme = format!( - "callmemaybe host={} port={} replication=1 options='-c system.id={}'", + "callmemaybe {} host={} port={} options='-c ztimelineid={}'", + self.timeline().timelineid, self.conf.listen_addr.ip(), self.conf.listen_addr.port(), - self.system().get_info().server.system_id, + self.timeline().timelineid + ); + info!( + "requesting page server to connect to us: start {} {}", + ps_connstr, callme ); let (client, connection) = connect(&ps_connstr, NoTls).await?; @@ -582,22 +606,14 @@ impl Connection { Ok(()) } - fn set_system(&mut self, id: SystemId) -> Result<()> 
{ - let mut systems = SYSTEMS.lock().unwrap(); - if id == 0 { - // non-multitenant configuration: just a single instance - if let Some(system) = systems.values().next() { - self.system = Some(system.clone()); - return Ok(()); - } - io_error!("No active instances"); + fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> { + let mut timelines = TIMELINES.lock().unwrap(); + if !timelines.contains_key(&timelineid) { + info!("creating timeline dir {}", timelineid); + fs::create_dir_all(timelineid.to_string())?; + timelines.insert(timelineid, Arc::new(Timeline::new(timelineid))); } - if !systems.contains_key(&id) { - let system_dir = self.conf.data_dir.join(id.to_string()); - fs::create_dir_all(system_dir)?; - systems.insert(id, Arc::new(System::new(id))); - } - self.system = Some(systems.get(&id).unwrap().clone()); + self.timeline = Some(timelines.get(&timelineid).unwrap().clone()); Ok(()) } @@ -606,14 +622,16 @@ impl Connection { // Receive information about server let server_info = self.read_req::().await?; info!( - "Start handshake with wal_proposer {} sysid {}", + "Start handshake with wal_proposer {} sysid {} timeline {}", self.stream.peer_addr()?, - server_info.system_id + server_info.system_id, + server_info.timeline_id, ); - self.set_system(server_info.system_id)?; - self.system().load_control_file(&self.conf); + // FIXME: also check that the system identifier matches + self.set_timeline(server_info.timeline_id)?; + self.timeline().load_control_file(&self.conf)?; - let mut my_info = self.system().get_info(); + let mut my_info = self.timeline().get_info(); /* Check protocol compatibility */ if server_info.protocol_version != SK_PROTOCOL_VERSION { @@ -662,9 +680,9 @@ impl Connection { ); } my_info.server.node_id = prop.node_id; - self.system().set_info(&my_info); + self.timeline().set_info(&my_info); /* Need to persist our vote first */ - self.system().save_control_file(true)?; + self.timeline().save_control_file(true)?; let mut flushed_restart_lsn: 
XLogRecPtr = 0; let wal_seg_size = server_info.wal_seg_size as usize; @@ -678,12 +696,13 @@ impl Connection { // Add far as replication in postgres is initiated by receiver, we should use callme mechanism if let Err(e) = self.request_callback().await { // Do not treate it as fatal error and continue work + // FIXME: we should retry after a while... error!("Failed to send callme request to pageserver: {}", e); } info!( - "Start streaming from server {} address {:?}", - server_info.system_id, + "Start streaming from timeline {} address {:?}", + server_info.timeline_id, self.stream.peer_addr()? ); @@ -705,6 +724,15 @@ impl Connection { let rec_size = (end_pos - start_pos) as usize; assert!(rec_size <= MAX_SEND_SIZE); + debug!( + "received for {} bytes between {:X}/{:X} and {:X}/{:X}", + rec_size, + start_pos >> 32, + start_pos & 0xffffffff, + end_pos >> 32, + end_pos & 0xffffffff + ); + /* Receive message body */ self.inbuf.resize(rec_size, 0u8); self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?; @@ -735,7 +763,7 @@ impl Connection { * when restart_lsn delta exceeds WAL segment size. */ sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn; - self.system().save_control_file(sync_control_file)?; + self.timeline().save_control_file(sync_control_file)?; if sync_control_file { flushed_restart_lsn = my_info.restart_lsn; @@ -746,7 +774,7 @@ impl Connection { let resp = SafeKeeperResponse { epoch: my_info.epoch, flush_lsn: end_pos, - hs_feedback: self.system().get_hs_feedback(), + hs_feedback: self.timeline().get_hs_feedback(), }; self.start_sending(); resp.pack(&mut self.outbuf); @@ -756,7 +784,7 @@ impl Connection { * Ping wal sender that new data is available. * FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper. 
*/ - self.system() + self.timeline() .notify_wal_senders(min(req.commit_lsn, end_pos)); } Ok(()) @@ -807,7 +835,7 @@ impl Connection { } // - // Send WAL to replica or WAL sender using standard libpq replication protocol + // Send WAL to replica or WAL receiver using standard libpq replication protocol // async fn send_wal(&mut self) -> Result<()> { info!("WAL sender to {:?} is started", self.stream.peer_addr()?); @@ -828,7 +856,7 @@ impl Connection { BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); self.send().await?; self.init_done = true; - self.set_system(m.system_id)?; + self.set_timeline(m.timelineid)?; } StartupRequestCode::Cancel => return Ok(()), } @@ -861,7 +889,7 @@ impl Connection { let (start_pos, timeline) = self.find_end_of_wal(false); let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32); let tli = timeline.to_string(); - let sysid = self.system().get_info().server.system_id.to_string(); + let sysid = self.timeline().get_info().server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli_bytes = tli.as_bytes(); let sysid_bytes = sysid.as_bytes(); @@ -893,11 +921,11 @@ impl Connection { ); BeMessage::write( &mut self.outbuf, - &BeMessage::DataRow(&[Some(lsn_bytes), Some(tli_bytes), Some(sysid_bytes), None]), + &BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]), ); BeMessage::write( &mut self.outbuf, - &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"), + &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"), ); BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery); self.send().await?; @@ -917,7 +945,7 @@ impl Connection { } else { 0 }; - let wal_seg_size = self.system().get_info().server.wal_seg_size as usize; + let wal_seg_size = self.timeline().get_info().server.wal_seg_size as usize; if wal_seg_size == 0 { io_error!("Can not start replication before connecting to wal_proposer"); } @@ -935,15 +963,6 @@ impl Connection { BeMessage::write(&mut self.outbuf, 
&BeMessage::Copy); self.send().await?; - /* - * Always start streaming at the beginning of a segment - * - * FIXME: It is common practice to start streaming at the beginning of - * the segment, but it should be up to the client to decide that. We - * shouldn't enforce that here. - */ - start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64; - let mut end_pos: XLogRecPtr; let mut commit_lsn: XLogRecPtr; let mut wal_file: Option = None; @@ -960,19 +979,18 @@ impl Connection { end_pos = stop_pos; } else { /* normal mode */ + let timeline = self.timeline(); loop { // Rust doesn't allow to grab async result from mutex scope - let system = self.system(); - let notified = system.cond.notified(); { - let shared_state = system.mutex.lock().unwrap(); + let shared_state = timeline.mutex.lock().unwrap(); commit_lsn = shared_state.commit_lsn; if start_pos < commit_lsn { end_pos = commit_lsn; break; } } - notified.await; + timeline.cond.notified().await; } } if end_pos == END_REPLICATION_MARKER { @@ -983,7 +1001,7 @@ impl Connection { Ok(0) => break, Ok(_) => match self.parse_message()? 
{ Some(FeMessage::CopyData(m)) => self - .system() + .timeline() .add_hs_feedback(HotStandbyFeedback::parse(&m.body)), _ => {} }, @@ -1004,7 +1022,7 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone() + ".partial"); if let Ok(opened_file) = File::open(&wal_file_path) { file = opened_file; @@ -1012,7 +1030,7 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name); match File::open(&wal_file_path) { Ok(opened_file) => file = opened_file, @@ -1034,6 +1052,8 @@ impl Connection { let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size; let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE; let data_end = data_start + send_size; + + file.seek(SeekFrom::Start(xlogoff as u64))?; file.read_exact(&mut self.outbuf[data_start..data_end])?; self.outbuf[0] = b'd'; BigEndian::write_u32( @@ -1048,6 +1068,12 @@ impl Connection { self.stream.write_all(&self.outbuf[0..msg_size]).await?; start_pos += send_size as u64; + debug!( + "Sent WAL to page server up to {:X}/{:>08X}", + (end_pos >> 32) as u32, + end_pos as u32 + ); + if XLogSegmentOffset(start_pos, wal_seg_size) != 0 { wal_file = Some(file); } @@ -1102,12 +1128,12 @@ impl Connection { let wal_file_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone()); let wal_file_partial_path = self .conf .data_dir - .join(self.system().id.to_string()) + .join(self.timeline().timelineid.to_string()) .join(wal_file_name.clone() + ".partial"); { @@ -1170,7 +1196,7 @@ impl Connection { fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) { find_end_of_wal( &self.conf.data_dir, - self.system().get_info().server.wal_seg_size as usize, + self.timeline().get_info().server.wal_seg_size as usize, precise, ) } diff --git 
a/walkeeper/src/xlog_utils.rs b/walkeeper/src/xlog_utils.rs index 51db9681a6..7c18131186 100644 --- a/walkeeper/src/xlog_utils.rs +++ b/walkeeper/src/xlog_utils.rs @@ -4,7 +4,7 @@ use log::*; use std::cmp::min; use std::fs::{self, File}; use std::io::prelude::*; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::time::SystemTime; pub const XLOG_FNAME_LEN: usize = 24; @@ -89,7 +89,7 @@ pub fn get_current_timestamp() -> TimestampTz { } fn find_end_of_wal_segment( - data_dir: &PathBuf, + data_dir: &Path, segno: XLogSegNo, tli: TimeLineID, wal_seg_size: usize, @@ -185,7 +185,7 @@ fn find_end_of_wal_segment( } pub fn find_end_of_wal( - data_dir: &PathBuf, + data_dir: &Path, wal_seg_size: usize, precise: bool, ) -> (XLogRecPtr, TimeLineID) { diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 2d1f7c922c..035fcc9d94 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -8,4 +8,10 @@ edition = "2018" [dependencies] clap = "2.33.0" +anyhow = "1.0" + +# FIXME: 'pageserver' is needed for ZTimelineId. 
Refactor +pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } control_plane = { path = "../control_plane" } +postgres_ffi = { path = "../postgres_ffi" } diff --git a/zenith/src/main.rs b/zenith/src/main.rs index f6690dd8d7..53d1528a6b 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,64 +1,94 @@ -use clap::{App, Arg, ArgMatches, SubCommand}; -use std::error; +use std::fs; +use std::path::{Path, PathBuf}; use std::process::exit; +use std::str::FromStr; +use anyhow::Result; +use anyhow::{anyhow, bail}; +use clap::{App, Arg, ArgMatches, SubCommand}; + +use control_plane::local_env::LocalEnv; +use control_plane::storage::PageServerNode; use control_plane::{compute::ComputeControlPlane, local_env, storage}; -type Result = std::result::Result>; +use pageserver::ZTimelineId; -fn main() { +fn zenith_repo_dir() -> PathBuf { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => PathBuf::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} + +// Main entry point for the 'zenith' CLI utility +// +// This utility can used to work with a local zenith repository. 
+// In order to run queries in it, you need to launch the page server, +// and a compute node against the page server +fn main() -> Result<()> { let name_arg = Arg::with_name("NAME") .short("n") .index(1) .help("name of this postgres instance") .required(true); let matches = App::new("zenith") - .subcommand(SubCommand::with_name("init")) - .subcommand(SubCommand::with_name("start")) - .subcommand(SubCommand::with_name("stop")) - .subcommand(SubCommand::with_name("status")) + .about("Zenith CLI") + .subcommand( + SubCommand::with_name("init") + .about("Initialize a new Zenith repository in current directory"), + ) + .subcommand( + SubCommand::with_name("branch") + .about("Create a new branch") + .arg(Arg::with_name("branchname").required(false).index(1)) + .arg(Arg::with_name("start-point").required(false).index(2)), + ) + .subcommand( + SubCommand::with_name("pageserver") + .about("Manage pageserver instance") + .subcommand(SubCommand::with_name("status")) + .subcommand(SubCommand::with_name("start")) + .subcommand(SubCommand::with_name("stop")), + ) .subcommand( SubCommand::with_name("pg") .about("Manage postgres instances") .subcommand( - SubCommand::with_name("create"), // .arg(name_arg.clone() - // .required(false) - // .help("name of this postgres instance (will be pgN if omitted)")) + SubCommand::with_name("create") + // .arg(name_arg.clone() + // .required(false) + // .help("name of this postgres instance (will be pgN if omitted)")) + .arg(Arg::with_name("timeline").required(false).index(1)), ) .subcommand(SubCommand::with_name("list")) .subcommand(SubCommand::with_name("start").arg(name_arg.clone())) .subcommand(SubCommand::with_name("stop").arg(name_arg.clone())) .subcommand(SubCommand::with_name("destroy").arg(name_arg.clone())), ) - .subcommand( - SubCommand::with_name("snapshot") - .about("Manage database snapshots") - .subcommand(SubCommand::with_name("create")) - .subcommand(SubCommand::with_name("start")) - .subcommand(SubCommand::with_name("stop")) 
- .subcommand(SubCommand::with_name("destroy")), - ) .get_matches(); // handle init separately and exit - if let Some("init") = matches.subcommand_name() { - match local_env::init() { - Ok(_) => { - println!("Initialization complete! You may start zenith with 'zenith start' now."); - exit(0); - } - Err(e) => { - eprintln!("Error during init: {}", e); - exit(1); - } - } + if let ("init", Some(sub_args)) = matches.subcommand() { + run_init_cmd(sub_args.clone())?; + exit(0); } // all other commands would need config - let env = match local_env::load_config() { + + let repopath = PathBuf::from(zenith_repo_dir()); + if !repopath.exists() { + bail!( + "Zenith repository does not exists in {}.\n\ + Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'", + repopath.display() + ); + } + // TODO: check that it looks like a zenith repository + let env = match local_env::load_config(&repopath) { Ok(conf) => conf, Err(e) => { - eprintln!("Error loading config from ~/.zenith: {}", e); + eprintln!("Error loading config from {}: {}", repopath.display(), e); exit(1); } }; @@ -68,6 +98,9 @@ fn main() { panic!() /* Should not happen. 
Init was handled before */ } + ("branch", Some(sub_args)) => run_branch_cmd(&env, sub_args.clone())?, + ("pageserver", Some(sub_args)) => run_pageserver_cmd(&env, sub_args.clone())?, + ("start", Some(_sub_m)) => { let pageserver = storage::PageServerNode::from_env(&env); @@ -94,15 +127,53 @@ fn main() { } } _ => {} - } + }; + + Ok(()) +} + +fn run_pageserver_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { + match args.subcommand() { + ("status", Some(_sub_m)) => { + todo!(); + } + ("start", Some(_sub_m)) => { + let psnode = PageServerNode::from_env(local_env); + psnode.start()?; + println!("Page server started"); + } + ("stop", Some(_sub_m)) => { + todo!(); + } + _ => unreachable!(), + }; + + Ok(()) +} + +// Peek into the repository, to grab the timeline ID of given branch +pub fn get_branch_timeline(repopath: &Path, branchname: &str) -> ZTimelineId { + let branchpath = repopath.join("refs/branches/".to_owned() + branchname); + + ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap() } fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; match pg_match.subcommand() { - ("create", Some(_sub_m)) => { - cplane.new_node()?; + ("create", Some(sub_m)) => { + // FIXME: cheat and resolve the timeline by peeking into the + // repository. In reality, when you're launching a compute node + // against a possibly-remote page server, we wouldn't know what + // branches exist in the remote repository. Or would we require + // that you "zenith fetch" them into a local repoitory first? 
+ let timeline_arg = sub_m.value_of("timeline").unwrap_or("main"); + let timeline = get_branch_timeline(&env.repo_path, timeline_arg); + + println!("Initializing Postgres on timeline {}...", timeline); + + cplane.new_node(timeline)?; } ("list", Some(_sub_m)) => { println!("NODE\tADDRESS\tSTATUS"); @@ -115,7 +186,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes .get(name) - .ok_or(format!("postgres {} is not found", name))?; + .ok_or(anyhow!("postgres {} is not found", name))?; node.start()?; } ("stop", Some(sub_m)) => { @@ -123,7 +194,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes .get(name) - .ok_or(format!("postgres {} is not found", name))?; + .ok_or(anyhow!("postgres {} is not found", name))?; node.stop()?; } @@ -132,3 +203,134 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Ok(()) } + +// "zenith init" - Initialize a new Zenith repository in current dir +fn run_init_cmd(_args: ArgMatches) -> Result<()> { + local_env::init()?; + Ok(()) +} + +// handle "zenith branch" subcommand +fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> { + let repopath = local_env.repo_path.to_str().unwrap(); + + if let Some(branchname) = args.value_of("branchname") { + if PathBuf::from(format!("{}/refs/branches/{}", repopath, branchname)).exists() { + anyhow::bail!("branch {} already exists", branchname); + } + + if let Some(startpoint_str) = args.value_of("start-point") { + let mut startpoint = parse_point_in_time(startpoint_str)?; + + if startpoint.lsn == 0 { + // Find end of WAL on the old timeline + let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?; + + println!( + "branching at end of WAL: {:X}/{:X}", + end_of_wal >> 32, + end_of_wal & 0xffffffff + ); + + startpoint.lsn = end_of_wal; + } + + return local_env::create_branch(local_env, branchname, startpoint); + } else { + 
panic!("Missing start-point"); + } + } else { + // No arguments, list branches + list_branches()?; + } + Ok(()) +} + +fn list_branches() -> Result<()> { + // list branches + let paths = fs::read_dir(zenith_repo_dir().join("refs").join("branches"))?; + + for path in paths { + println!(" {}", path?.file_name().to_str().unwrap()); + } + + Ok(()) +} + +// +// Parse user-given string that represents a point-in-time. +// +// We support multiple variants: +// +// Raw timeline id in hex, meaning the end of that timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d +// +// A specific LSN on a timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 +// +// Same, with a human-friendly branch name: +// main +// main@2/15D3DD8 +// +// Human-friendly tag name: +// mytag +// +// +fn parse_point_in_time(s: &str) -> Result { + let mut strings = s.split("@"); + let name = strings.next().unwrap(); + + let lsn: Option; + if let Some(lsnstr) = strings.next() { + let mut s = lsnstr.split("/"); + let lsn_hi: u64 = s + .next() + .ok_or(anyhow!("invalid LSN in point-in-time specification"))? + .parse()?; + let lsn_lo: u64 = s + .next() + .ok_or(anyhow!("invalid LSN in point-in-time specification"))? 
+ .parse()?; + lsn = Some(lsn_hi << 32 | lsn_lo); + } else { + lsn = None + } + + // Check if it's a tag + if lsn.is_none() { + let tagpath = zenith_repo_dir().join("refs").join("tags").join(name); + if tagpath.exists() { + let pointstr = fs::read_to_string(tagpath)?; + + return parse_point_in_time(&pointstr); + } + } + // Check if it's a branch + // Check if it's branch @ LSN + let branchpath = zenith_repo_dir().join("refs").join("branches").join(name); + if branchpath.exists() { + let pointstr = fs::read_to_string(branchpath)?; + + let mut result = parse_point_in_time(&pointstr)?; + if lsn.is_some() { + result.lsn = lsn.unwrap(); + } else { + result.lsn = 0; + } + return Ok(result); + } + + // Check if it's a timelineid + // Check if it's timelineid @ LSN + let tlipath = zenith_repo_dir().join("timelines").join(name); + if tlipath.exists() { + let result = local_env::PointInTime { + timelineid: ZTimelineId::from_str(name)?, + lsn: lsn.unwrap_or(0), + }; + + return Ok(result); + } + + panic!("could not parse point-in-time {}", s); +} diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml new file mode 100644 index 0000000000..77bc1e9ecb --- /dev/null +++ b/zenith_utils/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "zenith_utils" +version = "0.1.0" +authors = ["Eric Seppanen "] +edition = "2018" + +[dependencies] diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs new file mode 100644 index 0000000000..2d86ad041f --- /dev/null +++ b/zenith_utils/src/lib.rs @@ -0,0 +1,2 @@ +//! zenith_utils is intended to be a place to put code that is shared +//! between other crates in this repository.