Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-10 14:10:37 +00:00)

Compare commits: parallel_w...layered-ra (20 commits)
| SHA1 |
|---|
| f5432ea1ca |
| 066e3f1c69 |
| 673c297949 |
| e61732ca7c |
| cb4a8396fb |
| c77e30116e |
| 9d369f158c |
| 6ecd442fb9 |
| f3f059c1f8 |
| 8388e14bbd |
| 5293e183c5 |
| 93ff5f7ff0 |
| 41dce68bdd |
| 7dece8e4a0 |
| 37c85d5fd9 |
| 6094236171 |
| bb5aba42eb |
| 450fb9eafe |
| 557e3024cd |
| bd34d7ecfc |
Cargo.lock (generated file, 377 changed lines)
@@ -41,6 +41,20 @@ version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1"
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5443ccbb270374a2b1055fc72da40e1f237809cd6bb0e97e66d264cd138473a6"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"zstd",
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.51"
|
||||
@@ -54,17 +68,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "attohttpc"
|
||||
version = "0.17.0"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8bda305457262b339322106c776e3fd21df860018e566eb6a5b1aa4b6ae02d"
|
||||
checksum = "e69e13a99a7e6e070bb114f7ff381e58c7ccc188630121fc4c2fe4bcf24cd072"
|
||||
dependencies = [
|
||||
"http",
|
||||
"log",
|
||||
"native-tls",
|
||||
"openssl",
|
||||
"rustls 0.20.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"url",
|
||||
"webpki 0.22.0",
|
||||
"webpki-roots 0.22.1",
|
||||
"wildmatch",
|
||||
]
|
||||
|
||||
@@ -111,9 +126,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-creds"
|
||||
version = "0.26.2"
|
||||
version = "0.27.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5e1c8f64305d3f3096cb247983a3cae16f8c2960129699bcb70639e31180794"
|
||||
checksum = "460a75eac8f3cb7683e0a9a588a83c3ff039331ea7bfbfbfcecf1dacab276e11"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"attohttpc",
|
||||
@@ -248,6 +263,9 @@ name = "cc"
|
||||
version = "1.0.71"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79c2681d6594606957bbb8631c4b90a7fcaaa72cdb714743a437b156d6a7eedd"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -328,43 +346,21 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"nix",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
"rand",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"url",
|
||||
"walkeeper",
|
||||
"workspace_hack",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6888e10551bb93e424d8df1d07f1a8b4fceb0001a3a4b048bfc47554946f47b3"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.1"
|
||||
@@ -497,21 +493,6 @@ version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.0.1"
|
||||
@@ -540,9 +521,9 @@ checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca"
|
||||
checksum = "8cd0210d8c325c245ff06fd95a3b13689a1a276ac8cfa8e8720cb840bfb84b9e"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@@ -555,9 +536,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888"
|
||||
checksum = "7fc8cd39e3dbf865f7340dce6a2d401d24fd37c6fe6c4f0ee0de8bfca2252d27"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@@ -565,15 +546,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d"
|
||||
checksum = "629316e42fe7c2a0b9a65b47d159ceaa5453ab14e8f0a3c5eedbb8cd55b4a445"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c"
|
||||
checksum = "7b808bf53348a36cab739d7e04755909b9fcaaa69b7d7e588b37b6ec62704c97"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
@@ -582,18 +563,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377"
|
||||
checksum = "e481354db6b5c353246ccf6a728b0c5511d752c08da7260546fc0933869daa11"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb"
|
||||
checksum = "a89f17b21645bc4ed773c69af9c9a0effd4a3f1a3876eadd453469f8854e7fdd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
@@ -601,23 +580,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11"
|
||||
checksum = "996c6442437b62d21a32cd9906f9c41e7dc1e19a9579843fad948696769305af"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99"
|
||||
checksum = "dabf1872aaab32c886832f2276d2f5399887e2bd613698a02359e4ea83f8de12"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481"
|
||||
checksum = "41d22213122356472061ac0f1ab2cee28d2bac8491410fd68c2af53d1cedb83e"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
@@ -627,8 +605,6 @@ dependencies = [
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"proc-macro-hack",
|
||||
"proc-macro-nested",
|
||||
"slab",
|
||||
]
|
||||
|
||||
@@ -830,16 +806,18 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.5.0"
|
||||
name = "hyper-rustls"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||
checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"hyper",
|
||||
"native-tls",
|
||||
"log",
|
||||
"rustls 0.19.1",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls",
|
||||
"webpki 0.21.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -884,6 +862,15 @@ version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.55"
|
||||
@@ -1017,15 +1004,6 @@ version = "0.3.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "minidom"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "332592c2149fc7dd40a64fc9ef6f0d65607284b474cef9817d1fc8c7e7b3608e"
|
||||
dependencies = [
|
||||
"quick-xml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.7.13"
|
||||
@@ -1048,24 +1026,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.0"
|
||||
@@ -1152,39 +1112,6 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d9facdb76fec0b73c406f125d44d86fdad818d66fef0531eec9233ca425ff4a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.67"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69df2d8dfc6ce3aaf44b40dec6f487d5a886516cf6879c49e98e0710f310a058"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.3.1"
|
||||
@@ -1200,6 +1127,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bookfile",
|
||||
"byteorder",
|
||||
@@ -1320,12 +1248,6 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb"
|
||||
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.1"
|
||||
@@ -1402,12 +1324,6 @@ version = "0.5.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-nested"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.30"
|
||||
@@ -1442,7 +1358,7 @@ dependencies = [
|
||||
"md5",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
"rustls 0.19.1",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
@@ -1450,15 +1366,6 @@ dependencies = [
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.10"
|
||||
@@ -1582,24 +1489,25 @@ dependencies = [
|
||||
"http",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-tls",
|
||||
"hyper-rustls",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.19.1",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
"webpki-roots 0.21.1",
|
||||
"winreg",
|
||||
]
|
||||
|
||||
@@ -1643,9 +1551,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rust-s3"
|
||||
version = "0.27.0"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2f26775d15f43dc848ef0ec65f83de8775549e486c7a3a576652049a7122d32"
|
||||
checksum = "18c58d4682844a5d6301efbf915dd7a9d3d638006f9bb821527a0bbbf2a4cfc2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -1654,14 +1562,12 @@ dependencies = [
|
||||
"base64 0.13.0",
|
||||
"cfg-if",
|
||||
"chrono",
|
||||
"futures",
|
||||
"hex",
|
||||
"hmac 0.11.0",
|
||||
"http",
|
||||
"log",
|
||||
"maybe-async",
|
||||
"md5",
|
||||
"minidom",
|
||||
"percent-encoding",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -1697,8 +1603,20 @@ dependencies = [
|
||||
"base64 0.13.0",
|
||||
"log",
|
||||
"ring",
|
||||
"sct",
|
||||
"webpki",
|
||||
"sct 0.6.1",
|
||||
"webpki 0.21.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d37e5e2290f3e040b594b1a9e04377c2c671f1a1cfd9bfdef82106ac1c113f84"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring",
|
||||
"sct 0.7.0",
|
||||
"webpki 0.22.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1707,7 +1625,7 @@ version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fb079b52cfdb005752b7c3c646048e702003576a8321058e4c8b38227c11aa6"
|
||||
dependencies = [
|
||||
"rustls",
|
||||
"rustls 0.19.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1725,16 +1643,6 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.1.0"
|
||||
@@ -1752,26 +1660,13 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "2.3.1"
|
||||
name = "sct"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "23a2ac85147a3a11d77ecf1bc7166ec0b92febfa4461c37944e180f319ece467"
|
||||
checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework-sys"
|
||||
version = "2.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9dd14d83160b528b7bfd66439110573efcfbe281b17fc2ca9f39f550d619c7e"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"ring",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2113,16 +2008,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.1"
|
||||
@@ -2145,6 +2030,17 @@ dependencies = [
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6"
|
||||
dependencies = [
|
||||
"rustls 0.19.1",
|
||||
"tokio",
|
||||
"webpki 0.21.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-stream"
|
||||
version = "0.1.7"
|
||||
@@ -2303,12 +2199,6 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "vec_map"
|
||||
version = "0.8.2"
|
||||
@@ -2337,7 +2227,6 @@ name = "walkeeper"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"clap",
|
||||
@@ -2349,8 +2238,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
@@ -2362,7 +2249,8 @@ dependencies = [
|
||||
"signal-hook",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-postgres",
|
||||
"tracing",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
@@ -2471,6 +2359,34 @@ dependencies = [
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.21.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940"
|
||||
dependencies = [
|
||||
"webpki 0.21.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c475786c6f47219345717a043a37ec04cb4bc185e28853adcc4fa0a947eba630"
|
||||
dependencies = [
|
||||
"webpki 0.22.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "3.1.1"
|
||||
@@ -2482,9 +2398,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wildmatch"
|
||||
version = "1.1.0"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f44b95f62d34113cf558c93511ac93027e03e9c29a60dd0fd70e6e025c7270a"
|
||||
checksum = "d6c48bd20df7e4ced539c12f570f937c6b4884928a87fee70a479d72f031d4e0"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
@@ -2607,7 +2523,7 @@ dependencies = [
|
||||
"postgres",
|
||||
"rand",
|
||||
"routerify",
|
||||
"rustls",
|
||||
"rustls 0.19.1",
|
||||
"rustls-split",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2617,7 +2533,36 @@ dependencies = [
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"webpki",
|
||||
"webpki 0.21.4",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.7.0+zstd.1.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9428752481d8372e15b1bf779ea518a179ad6c771cca2d2c60e4fbff3cc2cd52"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "3.1.0+zstd.1.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aa1926623ad7fe406e090555387daf73db555b948134b4d73eac5eb08fb666d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "1.5.0+zstd.1.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4e6c094340240369025fc6b731b054ee2a834328fa584310ac96aa4baebdc465"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Zenith
|
||||
|
||||
Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
|
||||
Zenith is a serverless, open-source alternative to AWS Aurora Postgres. It separates storage and compute, and replaces the PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Architecture overview
|
||||
|
||||
|
||||
@@ -7,24 +7,18 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
lazy_static = "1.4"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
bytes = "1.0.1"
|
||||
nix = "0.23"
|
||||
url = "2.2.2"
|
||||
hex = { version = "0.4.3", features = ["serde"] }
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
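The reqwest line above only swaps the TLS backend (native-tls out, rustls in via `rustls-tls`); the `blocking` and `json` features keep the same client API. A minimal sketch of the kind of call these features support; the address, port, and endpoint here are illustrative only:

```rust
// Sketch only: not code from this commit.
use std::collections::HashMap;

fn fetch_pageserver_status() -> anyhow::Result<HashMap<String, serde_json::Value>> {
    // Same API whether TLS comes from native-tls or rustls.
    let client = reqwest::blocking::Client::new();
    Ok(client
        .get("http://127.0.0.1:9898/v1/status")
        .send()?
        .error_for_status()?
        .json()?)
}
```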
|
||||
|
||||
@@ -199,17 +199,24 @@ impl PostgresNode {
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_safekeepers(&self) -> Result<Lsn> {
|
||||
fn sync_safekeepers(&self, auth_token: &Option<String>) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
let sync_handle = Command::new(pg_path)
|
||||
.arg("--sync-safekeepers")
|
||||
let mut cmd = Command::new(&pg_path);
|
||||
|
||||
cmd.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.stdout(Stdio::piped())
|
||||
// Comment this to avoid capturing stderr (useful if command hangs)
|
||||
.stderr(Stdio::piped())
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let sync_handle = cmd
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
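The hunk stops right after `spawn()`; the rest of `sync_safekeepers` is not shown in this diff. A hedged sketch of how the spawned child's captured output could be consumed; the assumption that `postgres --sync-safekeepers` prints the synced LSN on stdout is not confirmed by this hunk:

```rust
// Sketch only, not the code from this commit. Assumes the child was spawned with
// stdout/stderr piped, as in the hunk above.
use std::process::Child;

fn wait_for_sync_safekeepers(sync_handle: Child) -> anyhow::Result<String> {
    // wait_with_output() reaps the child and collects the piped stdout/stderr.
    let output = sync_handle.wait_with_output()?;
    anyhow::ensure!(
        output.status.success(),
        "postgres --sync-safekeepers failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    // Assumption: the utility prints the LSN it synced to on stdout.
    Ok(String::from_utf8(output.stdout)?.trim().to_string())
}
```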
|
||||
|
||||
@@ -319,8 +326,11 @@ impl PostgresNode {
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
format!("host={} port={} password={}", host, port, password)
|
||||
// NOTE: avoid spaces in the connection string, because it is less error-prone if we forward it somewhere.
// Also note that not all parameters are supported here, because in compute we substitute $ZENITH_AUTH_TOKEN:
// we parse this string and build it back with the token from the env var, and for simplicity the rebuild
// uses only the needed variables, namely host, port, user, and password.
|
||||
format!("postgresql://no_user:{}@{}:{}", password, host, port)
|
||||
};
|
||||
conf.append("shared_preload_libraries", "zenith");
|
||||
conf.append_line("");
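The comment block above says the compute side parses this URL-style string and rebuilds it, substituting the token from $ZENITH_AUTH_TOKEN. A minimal sketch of such a round trip using the `url` crate (already a dependency of this crate); this is an illustration, not the actual compute-side code:

```rust
// Hedged sketch: shows how a "postgresql://no_user:<password>@<host>:<port>" string
// could be parsed and rebuilt with a token taken from the environment.
use url::Url;

fn rebuild_with_token(connstr: &str) -> anyhow::Result<String> {
    let parsed = Url::parse(connstr)?;
    let host = parsed.host_str().unwrap_or("localhost");
    let port = parsed.port().unwrap_or(5432);
    // Prefer the token from the environment, fall back to the embedded password.
    let password = std::env::var("ZENITH_AUTH_TOKEN")
        .unwrap_or_else(|_| parsed.password().unwrap_or_default().to_string());
    Ok(format!(
        "postgresql://{}:{}@{}:{}",
        parsed.username(),
        password,
        host,
        port
    ))
}
```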
|
||||
@@ -358,7 +368,7 @@ impl PostgresNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_basebackup(&self) -> Result<()> {
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if self.uses_wal_proposer {
|
||||
@@ -366,7 +376,7 @@ impl PostgresNode {
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_safekeepers()?;
|
||||
let lsn = self.sync_safekeepers(auth_token)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
@@ -417,7 +427,6 @@ impl PostgresNode {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
@@ -451,7 +460,7 @@ impl PostgresNode {
|
||||
fs::write(&postgresql_conf_path, postgresql_conf)?;
|
||||
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup()?;
|
||||
self.load_basebackup(auth_token)?;
|
||||
|
||||
if self.lsn.is_some() {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::{opt_display_serde, ZTenantId};
|
||||
|
||||
//
|
||||
// This data structures represents zenith CLI config
|
||||
@@ -46,7 +46,7 @@ pub struct LocalEnv {
|
||||
|
||||
// Default tenant ID to use with the 'zenith' command line utility, when
|
||||
// --tenantid is not explicitly specified.
|
||||
#[serde(with = "opt_tenantid_serde")]
|
||||
#[serde(with = "opt_display_serde")]
|
||||
#[serde(default)]
|
||||
pub default_tenantid: Option<ZTenantId>,
|
||||
|
||||
@@ -325,30 +325,3 @@ fn base_path() -> PathBuf {
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
|
||||
mod opt_tenantid_serde {
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
tenantid.map(|t| t.to_string()).serialize(ser)
|
||||
}
|
||||
|
||||
pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = Option::deserialize(des)?;
|
||||
if let Some(s) = s {
|
||||
return Ok(Some(
|
||||
ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
|
||||
));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
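The removed module above is superseded by the shared `opt_display_serde` helper imported at the top of this file. Its actual implementation lives in zenith_utils and is not part of this diff; a plausible generic sketch of such a helper, based on `Display`/`FromStr`, might look like this:

```rust
// Hypothetical sketch of a Display/FromStr-based "opt_display_serde" helper; the real
// implementation in zenith_utils::zid is not shown in this diff and may differ.
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
use std::fmt::Display;
use std::str::FromStr;

pub fn serialize<T, S>(value: &Option<T>, ser: S) -> Result<S::Ok, S::Error>
where
    T: Display,
    S: Serializer,
{
    value.as_ref().map(|v| v.to_string()).serialize(ser)
}

pub fn deserialize<'de, T, D>(des: D) -> Result<Option<T>, D::Error>
where
    T: FromStr,
    T::Err: Display,
    D: Deserializer<'de>,
{
    Ok(match Option::<String>::deserialize(des)? {
        Some(s) => Some(s.parse::<T>().map_err(de::Error::custom)?),
        None => None,
    })
}
```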
|
||||
|
||||
@@ -15,13 +15,11 @@ use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::read_pidfile;
|
||||
use crate::storage::PageServerNode;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -116,17 +114,6 @@ impl SafekeeperNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
// Configure connection to page server
|
||||
//
|
||||
// FIXME: We extract the host and port from the connection string instead of using
|
||||
// the connection string directly, because the 'safekeeper' binary expects
|
||||
// host:port format. That's a bit silly when we already have a full libpq connection
|
||||
// string at hand.
|
||||
let pageserver_conn = {
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
format!("{}:{}", host, port)
|
||||
};
|
||||
|
||||
let listen_pg = format!("localhost:{}", self.conf.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.conf.http_port);
|
||||
|
||||
@@ -134,7 +121,6 @@ impl SafekeeperNode {
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.args(&["--pageserver", &pageserver_conn])
|
||||
.args(&["--recall", "1 second"])
|
||||
.arg("--daemonize")
|
||||
.env_clear()
|
||||
@@ -143,10 +129,6 @@ impl SafekeeperNode {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
|
||||
}
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
|
||||
@@ -40,7 +40,8 @@ url = "2"
|
||||
nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
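The new async-compression dependency (also visible as a new package in the Cargo.lock hunk above) provides zstd compression over tokio I/O. A minimal, hedged sketch of what the "zstd" + "tokio" feature pair enables; the file paths are placeholders and this is not pageserver code:

```rust
// Sketch only: demonstrates the async-compression tokio zstd writer.
use async_compression::tokio::write::ZstdEncoder;
use tokio::io::AsyncWriteExt;

async fn compress_file(src: &str, dst: &str) -> std::io::Result<()> {
    let data = tokio::fs::read(src).await?;
    let out = tokio::fs::File::create(dst).await?;
    let mut encoder = ZstdEncoder::new(out);
    encoder.write_all(&data).await?;
    // shutdown() finishes the zstd frame and flushes the underlying writer.
    encoder.shutdown().await?;
    Ok(())
}
```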
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -9,7 +9,7 @@ The Page Server has a few different duties:
|
||||
|
||||
S3 is the main fault-tolerant storage of all data, as there are no Page Server
|
||||
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
|
||||
keeps track of WAL records which are not syncted to S3 yet.
|
||||
keeps track of WAL records which are not synced to S3 yet.
|
||||
|
||||
The Page Server consists of multiple threads that operate on a shared
|
||||
repository of page versions:
|
||||
|
||||
@@ -559,13 +559,17 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
let mut threads = Vec::new();
|
||||
|
||||
if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
|
||||
let sync_startup = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
if let Some(handle) = sync_startup.sync_loop_handle {
|
||||
threads.push(handle);
|
||||
}
|
||||
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
@@ -21,10 +21,10 @@ use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{repository::Repository, PageServerConf};
|
||||
use crate::{repository::RepositoryTimeline, tenant_mgr};
|
||||
use crate::{restore_local_repo, LOG_FILE_NAME};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -54,7 +54,12 @@ impl BranchInfo {
|
||||
.to_string();
|
||||
let timeline_id = std::fs::read_to_string(path)?.parse::<ZTimelineId>()?;
|
||||
|
||||
let timeline = repo.get_timeline(timeline_id)?;
|
||||
let timeline = match repo.get_timeline(timeline_id)? {
|
||||
RepositoryTimeline::Local(local_entry) => local_entry,
|
||||
RepositoryTimeline::Remote(_) => {
|
||||
bail!("Timeline {} is remote, no branches to display", timeline_id)
|
||||
}
|
||||
};
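The match above is written against the new `RepositoryTimeline` value returned by `Repository::get_timeline`. Its definition lives in repository.rs and is not part of this diff; judging from its uses here and in layered_repository.rs below, it is presumably shaped roughly as follows (a hedged reconstruction, not the actual definition):

```rust
// Hedged reconstruction from usage; Timeline and ZTimelineId are existing pageserver types.
use std::sync::Arc;

pub enum RepositoryTimeline {
    /// Timeline files are present locally and ready to serve requests.
    Local(Arc<dyn Timeline>),
    /// Timeline is only known remotely; a download must be scheduled before use.
    Remote(ZTimelineId),
}

impl RepositoryTimeline {
    /// Accessor used by create_branch and timeline_detail_handler in this diff.
    pub fn local_timeline(&self) -> Option<Arc<dyn Timeline>> {
        match self {
            Self::Local(timeline) => Some(Arc::clone(timeline)),
            Self::Remote(_) => None,
        }
    }
}
```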
|
||||
|
||||
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
|
||||
let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
|
||||
@@ -149,7 +154,7 @@ pub fn create_repo(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
@@ -297,7 +302,10 @@ pub(crate) fn create_branch(
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
|
||||
let timeline = repo.get_timeline(startpoint.timelineid)?;
|
||||
let timeline = repo
|
||||
.get_timeline(startpoint.timelineid)?
|
||||
.local_timeline()
|
||||
.ok_or_else(|| anyhow!("Cannot branch off the timeline that's not present locally"))?;
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = timeline.get_last_record_lsn();
|
||||
|
||||
@@ -17,6 +17,98 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
/v1/timeline/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: List tenant timelines
|
||||
responses:
|
||||
"200":
|
||||
description: array of brief timeline descriptions
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
# currently just a timeline id string; once the remote index can be accessed,
# at least a remote/local timeline field will be added
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/timeline/{tenant_id}/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get timeline info for tenant's remote timeline
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no branch name
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/branch/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -284,6 +376,36 @@ components:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- timeline_id
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- prev_record_lsn
|
||||
- start_lsn
|
||||
- disk_consistent_lsn
|
||||
properties:
|
||||
timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
last_record_lsn:
|
||||
type: string
|
||||
prev_record_lsn:
|
||||
type: string
|
||||
start_lsn:
|
||||
type: string
|
||||
disk_consistent_lsn:
|
||||
type: string
|
||||
timeline_state:
|
||||
type: string
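The `TimelineInfo` schema above mirrors the `TimelineInfo` struct added to routes.rs later in this diff. As a client-side illustration, a small blocking request against the new detail endpoint might look like this (the pageserver address and port are placeholders, and every LSN is treated as a plain string, as in the schema):

```rust
// Sketch only: field names follow the OpenAPI schema above.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct TimelineInfo {
    timeline_id: String,
    tenant_id: String,
    ancestor_timeline_id: Option<String>,
    last_record_lsn: String,
    prev_record_lsn: String,
    start_lsn: String,
    disk_consistent_lsn: String,
    timeline_state: Option<String>,
}

fn get_timeline_info(tenant_id: &str, timeline_id: &str) -> anyhow::Result<TimelineInfo> {
    let url = format!(
        "http://127.0.0.1:9898/v1/timeline/{}/{}",
        tenant_id, timeline_id
    );
    Ok(reqwest::blocking::get(&url)?.error_for_status()?.json()?)
}
```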
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use serde::Serialize;
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
@@ -18,10 +19,13 @@ use zenith_utils::http::{
|
||||
request::get_request_param,
|
||||
request::parse_request_param,
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{opt_display_serde, ZTimelineId};
|
||||
|
||||
use super::models::BranchCreateRequest;
|
||||
use super::models::TenantCreateRequest;
|
||||
use crate::branches::BranchInfo;
|
||||
use crate::repository::TimelineSyncState;
|
||||
use crate::{branches, tenant_mgr, PageServerConf, ZTenantId};
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -140,6 +144,97 @@ async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let conf = get_state(&request).conf;
|
||||
let timelines_dir = conf.timelines_path(&tenant_id);
|
||||
|
||||
let mut timelines_dir_contents =
|
||||
tokio::fs::read_dir(&timelines_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir '{}' contents",
|
||||
timelines_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut local_timelines = Vec::new();
|
||||
while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir '{}' contents",
|
||||
timelines_dir.display()
|
||||
)
|
||||
})? {
|
||||
let entry_path = entry.path();
|
||||
let entry_type = entry.file_type().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get file type of timeline dirs' entry '{}'",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
if entry_type.is_dir() {
|
||||
match entry.file_name().to_string_lossy().parse::<ZTimelineId>() {
|
||||
Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
|
||||
Err(e) => error!(
|
||||
"Failed to get parse timeline id from timeline dirs' entry '{}': {}",
|
||||
entry_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(json_response(StatusCode::OK, local_timelines)?)
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct TimelineInfo {
|
||||
#[serde(with = "hex")]
|
||||
timeline_id: ZTimelineId,
|
||||
#[serde(with = "hex")]
|
||||
tenant_id: ZTenantId,
|
||||
#[serde(with = "opt_display_serde")]
|
||||
ancestor_timeline_id: Option<ZTimelineId>,
|
||||
last_record_lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
start_lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_state: Option<TimelineSyncState>,
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter =
|
||||
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
match repo.get_timeline(timeline_id)?.local_timeline() {
|
||||
None => bail!("Timeline with id {} is not present locally", timeline_id),
|
||||
Some(timeline) => Ok::<_, anyhow::Error>(TimelineInfo {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
last_record_lsn: timeline.get_last_record_lsn(),
|
||||
prev_record_lsn: timeline.get_prev_record_lsn(),
|
||||
start_lsn: timeline.get_start_lsn(),
|
||||
timeline_state: repo.get_timeline_state(timeline_id),
|
||||
}),
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
@@ -196,6 +291,11 @@ pub fn make_router(
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/timeline/:tenant_id", timeline_list_handler)
|
||||
.get(
|
||||
"/v1/timeline/:tenant_id/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get("/v1/branch/:tenant_id", branch_list_handler)
|
||||
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
|
||||
.post("/v1/branch", branch_create_handler)
|
||||
|
||||
@@ -27,15 +27,18 @@ use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::ops::{Bound::Included, Deref};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{self, AtomicUsize};
|
||||
use std::sync::atomic::{self, AtomicBool, AtomicUsize};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::{metadata_path, TimelineMetadata};
|
||||
use crate::page_cache;
|
||||
use crate::relish::*;
|
||||
use crate::remote_storage::schedule_timeline_checkpoint_upload;
|
||||
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
|
||||
use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
|
||||
use crate::repository::{
|
||||
GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter,
|
||||
WALRecord,
|
||||
};
|
||||
use crate::tenant_mgr;
|
||||
use crate::walreceiver;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
@@ -118,7 +121,6 @@ lazy_static! {
|
||||
}
|
||||
|
||||
/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
///
|
||||
@@ -127,20 +129,25 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
pub struct LayeredRepository {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelines: Mutex<HashMap<ZTimelineId, Arc<LayeredTimeline>>>,
|
||||
timelines: Mutex<HashMap<ZTimelineId, LayeredTimelineEntry>>,
|
||||
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
/// Makes evey repo's timelines to backup their files to remote storage,
|
||||
/// when they get frozen.
|
||||
/// Makes every timeline back up its files to remote storage.
|
||||
upload_relishes: bool,
|
||||
}
|
||||
|
||||
/// Public interface
|
||||
impl Repository for LayeredRepository {
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
Ok(self.get_timeline_locked(timelineid, &mut timelines)?)
|
||||
Ok(
|
||||
match self.get_or_init_timeline(timelineid, &mut timelines)? {
|
||||
LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local),
|
||||
LayeredTimelineEntry::Remote(remote_timeline_id) => {
|
||||
RepositoryTimeline::Remote(remote_timeline_id)
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn create_empty_timeline(
|
||||
@@ -164,11 +171,11 @@ impl Repository for LayeredRepository {
|
||||
self.tenantid,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
0,
|
||||
false,
|
||||
)?;
|
||||
self.upload_relishes,
|
||||
);
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
let r = timelines.insert(timelineid, timeline_rc.clone());
|
||||
let r = timelines.insert(timelineid, LayeredTimelineEntry::Local(timeline_rc.clone()));
|
||||
assert!(r.is_none());
|
||||
Ok(timeline_rc)
|
||||
}
|
||||
@@ -179,7 +186,12 @@ impl Repository for LayeredRepository {
|
||||
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC
|
||||
// concurrently removes data that is needed by the new timeline.
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let src_timeline = self.get_timeline_locked(src, &mut timelines)?;
|
||||
let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? {
|
||||
LayeredTimelineEntry::Local(timeline) => timeline,
|
||||
LayeredTimelineEntry::Remote(_) => {
|
||||
bail!("Cannot branch off the timeline {} that's not local", src)
|
||||
}
|
||||
};
|
||||
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn)
|
||||
@@ -243,17 +255,22 @@ impl Repository for LayeredRepository {
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
// checkpoint runs.
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_checkpoint: Vec<(ZTimelineId, Arc<LayeredTimeline>)> = timelines
|
||||
let timelines_to_checkpoint = timelines
|
||||
.iter()
|
||||
.map(|(timelineid, timeline)| (*timelineid, timeline.clone()))
|
||||
.collect();
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
|
||||
for (timelineid, timeline) in timelines_to_checkpoint.iter() {
|
||||
for (timelineid, timeline) in &timelines_to_checkpoint {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered();
|
||||
|
||||
timeline.checkpoint(cconf)?;
|
||||
match timeline {
|
||||
LayeredTimelineEntry::Local(timeline) => timeline.checkpoint(cconf)?,
|
||||
LayeredTimelineEntry::Remote(_) => debug!(
|
||||
"Cannot run the checkpoint for remote timeline {}",
|
||||
timelineid
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -265,104 +282,177 @@ impl Repository for LayeredRepository {
|
||||
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for (timelineid, timeline) in timelines.iter() {
|
||||
shutdown_timeline(*timelineid, timeline.as_ref())?;
|
||||
shutdown_timeline(self.tenantid, *timelineid, timeline)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let removed_timeline = match timelines.remove(&timeline_id) {
|
||||
Some(timeline) => timeline,
|
||||
None => {
|
||||
warn!("Timeline {} not found, nothing to remove", timeline_id);
|
||||
return Ok(());
|
||||
// TODO this method currently does not do anything to prevent (or react to) state updates between a sync task schedule and a sync task end (that causes this update).
|
||||
// Sync task is enqueued and can error and be rescheduled, so some significant time may pass between the events.
|
||||
//
|
||||
/// Reacts to a timeline sync state change, adjusting the pageserver's in-memory state for this timeline (loading or unloading the timeline files).
|
||||
fn set_timeline_state(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
new_state: TimelineSyncState,
|
||||
) -> Result<()> {
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
|
||||
let timeline_to_shutdown = match new_state {
|
||||
TimelineSyncState::Ready => {
|
||||
let reloaded_timeline =
|
||||
self.init_local_timeline(timeline_id, &mut timelines_accessor)?;
|
||||
timelines_accessor
|
||||
.insert(timeline_id, LayeredTimelineEntry::Local(reloaded_timeline));
|
||||
None
|
||||
}
|
||||
TimelineSyncState::Evicted => timelines_accessor.remove(&timeline_id),
|
||||
TimelineSyncState::AwaitsDownload | TimelineSyncState::CloudOnly => {
|
||||
timelines_accessor.insert(timeline_id, LayeredTimelineEntry::Remote(timeline_id))
|
||||
}
|
||||
};
|
||||
drop(timelines);
|
||||
shutdown_timeline(timeline_id, removed_timeline.as_ref())?;
|
||||
drop(timelines_accessor);
|
||||
|
||||
if let Some(timeline) = timeline_to_shutdown {
|
||||
shutdown_timeline(self.tenantid, timeline_id, &timeline)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Layered repo does not store anything but
|
||||
/// * local, fully loaded timelines, ready for usage
|
||||
/// * remote timelines, that need a download task scheduled first before they can be used
|
||||
///
|
||||
/// [`TimelineSyncState::Evicted`] and other non-local and non-remote states are not stored in the layered repo at all,
|
||||
/// hence their statuses cannot be returned by the repo.
|
||||
fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option<TimelineSyncState> {
|
||||
Some(
|
||||
if self
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&timeline_id)?
|
||||
.local_or_schedule_download(self.tenantid)
|
||||
.is_some()
|
||||
{
|
||||
TimelineSyncState::Ready
|
||||
} else {
|
||||
TimelineSyncState::CloudOnly
|
||||
},
|
||||
)
|
||||
}
|
||||
}
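`set_timeline_state` and `get_timeline_state` above are written against `TimelineSyncState` from repository.rs, which this diff references but never shows. From the variants matched in this file, a plausible reconstruction is the following (variant descriptions are inferred from how each state is handled and may not match the real definition):

```rust
// Hedged reconstruction; the real enum in repository.rs may carry data or extra variants.
use serde::Serialize;

#[derive(Debug, Clone, Serialize)]
pub enum TimelineSyncState {
    /// Local copy is complete; the timeline can be loaded and used.
    Ready,
    /// Local files were removed; only the remote copy remains.
    Evicted,
    /// Known remotely and waiting for a download task.
    AwaitsDownload,
    /// Present only in remote storage; no local state yet.
    CloudOnly,
}
```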
|
||||
|
||||
fn shutdown_timeline(
|
||||
tenant_id: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &LayeredTimeline,
|
||||
timeline: &LayeredTimelineEntry,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
walreceiver::stop_wal_receiver(timelineid);
|
||||
trace!("repo shutdown. checkpoint timeline {}", timelineid);
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
//TODO Wait for walredo process to shutdown too
|
||||
if let Some(timeline) = timeline.local_or_schedule_download(tenant_id) {
|
||||
timeline
|
||||
.upload_relishes
|
||||
.store(false, atomic::Ordering::Relaxed);
|
||||
walreceiver::stop_wal_receiver(timelineid);
|
||||
trace!("repo shutdown. checkpoint timeline {}", timelineid);
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
//TODO Wait for walredo process to shutdown too
|
||||
} else {
|
||||
warn!("Skpping shutdown of a remote timeline");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum LayeredTimelineEntry {
|
||||
Local(Arc<LayeredTimeline>),
|
||||
Remote(ZTimelineId),
|
||||
}
|
||||
|
||||
impl LayeredTimelineEntry {
|
||||
fn timeline_id(&self) -> ZTimelineId {
|
||||
match self {
|
||||
LayeredTimelineEntry::Local(timeline) => timeline.timelineid,
|
||||
LayeredTimelineEntry::Remote(timeline_id) => *timeline_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets local timeline data, if it's present. Otherwise schedules a download fot the remote timeline and returns `None`.
|
||||
fn local_or_schedule_download(&self, tenant_id: ZTenantId) -> Option<&LayeredTimeline> {
|
||||
match self {
|
||||
Self::Local(local) => Some(local.as_ref()),
|
||||
Self::Remote(timeline_id) => {
|
||||
debug!(
|
||||
"Accessed a remote timeline {} for tenant {}, scheduling a timeline download",
|
||||
timeline_id, tenant_id
|
||||
);
|
||||
schedule_timeline_download(tenant_id, *timeline_id);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Private functions
|
||||
impl LayeredRepository {
|
||||
// Implementation of the public `get_timeline` function. This differs from the public
|
||||
// interface in that the caller must already hold the mutex on the 'timelines' hashmap.
|
||||
fn get_timeline_locked(
|
||||
fn get_or_init_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
timelines: &mut HashMap<ZTimelineId, Arc<LayeredTimeline>>,
|
||||
) -> Result<Arc<LayeredTimeline>> {
|
||||
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> Result<LayeredTimelineEntry> {
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
Some(timeline_entry) => {
|
||||
let _ = timeline_entry.local_or_schedule_download(self.tenantid);
|
||||
Ok(timeline_entry.clone())
|
||||
}
|
||||
None => {
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)
|
||||
.context("failed to load metadata")?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
|
||||
// Recurse to look up the ancestor timeline.
|
||||
//
|
||||
// TODO: If you have a very deep timeline history, this could become
|
||||
// expensive. Perhaps delay this until we need to look up a page in
|
||||
// ancestor.
|
||||
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline() {
|
||||
Some(self.get_timeline_locked(ancestor_timelineid, timelines)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let _enter =
|
||||
info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid)
|
||||
.entered();
|
||||
|
||||
let mut timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
metadata.clone(),
|
||||
ancestor,
|
||||
let timeline = self.init_local_timeline(timelineid, timelines)?;
|
||||
timelines.insert(
|
||||
timelineid,
|
||||
self.tenantid,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
0, // init with 0 and update after layers are loaded,
|
||||
self.upload_relishes,
|
||||
)?;
|
||||
|
||||
// List the layers on disk, and load them into the layer map
|
||||
let loaded_layers = timeline
|
||||
.load_layer_map(disk_consistent_lsn)
|
||||
.context("failed to load layermap")?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
timelineid,
|
||||
loaded_layers,
|
||||
metadata,
|
||||
);
|
||||
}
|
||||
|
||||
// needs to be after load_layer_map
|
||||
timeline.init_current_logical_size()?;
|
||||
|
||||
let timeline = Arc::new(timeline);
|
||||
timelines.insert(timelineid, timeline.clone());
|
||||
Ok(timeline)
|
||||
LayeredTimelineEntry::Local(Arc::clone(&timeline)),
|
||||
);
|
||||
Ok(LayeredTimelineEntry::Local(timeline))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_local_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> anyhow::Result<Arc<LayeredTimeline>> {
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)
|
||||
.context("failed to load metadata")?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
|
||||
let ancestor = metadata
|
||||
.ancestor_timeline()
|
||||
.map(|ancestor_timelineid| self.get_or_init_timeline(ancestor_timelineid, timelines))
|
||||
.transpose()?;
|
||||
let _enter =
|
||||
info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid)
|
||||
.entered();
|
||||
let mut timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
metadata,
|
||||
ancestor,
|
||||
timelineid,
|
||||
self.tenantid,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
0, // init with 0 and update after layers are loaded,
|
||||
self.upload_relishes,
|
||||
);
|
||||
timeline
|
||||
.load_layer_map(disk_consistent_lsn)
|
||||
.context("failed to load layermap")?;
|
||||
timeline.init_current_logical_size()?;
|
||||
|
||||
Ok(Arc::new(timeline))
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
@@ -483,10 +573,31 @@ impl LayeredRepository {
|
||||
|
||||
//Now collect info about branchpoints
|
||||
let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new();
|
||||
for timelineid in &timelineids {
|
||||
let timeline = self.get_timeline_locked(*timelineid, &mut *timelines)?;
|
||||
for &timelineid in &timelineids {
|
||||
let timeline = match self.get_or_init_timeline(timelineid, &mut timelines)? {
|
||||
LayeredTimelineEntry::Local(timeline) => timeline,
|
||||
LayeredTimelineEntry::Remote(_) => {
|
||||
warn!(
|
||||
"Timeline {} is not local, cannot proceed with gc",
|
||||
timelineid
|
||||
);
|
||||
return Ok(totals);
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(ancestor_timeline) = &timeline.ancestor_timeline {
|
||||
let ancestor_timeline =
|
||||
match ancestor_timeline.local_or_schedule_download(self.tenantid) {
|
||||
Some(timeline) => timeline,
|
||||
None => {
|
||||
warn!(
|
||||
"Timeline {} has ancestor {} is not local, cannot proceed with gc",
|
||||
timelineid,
|
||||
ancestor_timeline.timeline_id()
|
||||
);
|
||||
return Ok(totals);
|
||||
}
|
||||
};
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timelineid) = target_timelineid {
|
||||
if ancestor_timeline.timelineid == timelineid {
|
||||
@@ -510,7 +621,13 @@ impl LayeredRepository {
|
||||
|
||||
// We have already loaded all timelines above
|
||||
// so this operation is just a quick map lookup.
|
||||
let timeline = self.get_timeline_locked(timelineid, &mut *timelines)?;
|
||||
let timeline = match self.get_or_init_timeline(timelineid, &mut *timelines)? {
|
||||
LayeredTimelineEntry::Local(timeline) => timeline,
|
||||
LayeredTimelineEntry::Remote(_) => {
|
||||
debug!("Skipping GC for non-local timeline {}", timelineid);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// If target_timeline is specified, only GC it
|
||||
if let Some(target_timelineid) = target_timelineid {
|
||||
@@ -584,7 +701,7 @@ pub struct LayeredTimeline {
|
||||
|
||||
// Parent timeline that this timeline was branched from, and the LSN
|
||||
// of the branch point.
|
||||
ancestor_timeline: Option<Arc<LayeredTimeline>>,
|
||||
ancestor_timeline: Option<LayeredTimelineEntry>,
|
||||
ancestor_lsn: Lsn,
|
||||
|
||||
// this variable indicates how much space is used from user's point of view,
|
||||
@@ -606,8 +723,8 @@ pub struct LayeredTimeline {
|
||||
// ordering for its operations, but involves private modules, and macro trickery
|
||||
current_logical_size_gauge: IntGauge,
|
||||
|
||||
/// If `true`, will backup its timeline files to remote storage after freezing.
|
||||
upload_relishes: bool,
|
||||
/// If `true`, will backup its files that appear after each checkpointing to the remote storage.
|
||||
upload_relishes: AtomicBool,
|
||||
|
||||
/// Ensures layers aren't frozen by checkpointer between
|
||||
/// [`LayeredTimeline::get_layer_for_write`] and layer reads.
|
||||
@@ -635,7 +752,9 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline.as_ref().map(|x| x.timelineid)
|
||||
self.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(LayeredTimelineEntry::timeline_id)
|
||||
}
|
||||
|
||||
/// Wait until WAL has been received up to the given LSN.
|
||||
@@ -711,7 +830,7 @@ impl Timeline for LayeredTimeline {
|
||||
|
||||
let segsize;
|
||||
if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
|
||||
segsize = layer.get_seg_size(lsn)?;
|
||||
segsize = layer.get_seg_size(seg, lsn)?;
|
||||
trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize);
|
||||
} else {
|
||||
if segno == 0 {
|
||||
@@ -735,7 +854,7 @@ impl Timeline for LayeredTimeline {
|
||||
|
||||
let result;
|
||||
if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
|
||||
result = layer.get_seg_exists(lsn)?;
|
||||
result = layer.get_seg_exists(seg, lsn)?;
|
||||
} else {
|
||||
result = false;
|
||||
}
|
||||
@@ -800,11 +919,17 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ancestor) = timeline.ancestor_timeline.as_ref() {
|
||||
timeline = ancestor;
|
||||
continue;
|
||||
} else {
|
||||
break;
|
||||
match &timeline.ancestor_timeline {
|
||||
None => break,
|
||||
Some(ancestor_entry) => {
|
||||
match ancestor_entry.local_or_schedule_download(self.tenantid) {
|
||||
Some(ancestor) => {
|
||||
timeline = ancestor;
|
||||
continue;
|
||||
}
|
||||
None => bail!("Cannot list relishes for timeline {} tenant {} due to its ancestor being remote only", self.timelineid, self.tenantid),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -870,11 +995,11 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
if let Some(ancestor) = self.ancestor_timeline.as_ref() {
|
||||
ancestor.get_start_lsn()
|
||||
} else {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
self.ancestor_timeline
|
||||
.as_ref()
|
||||
.and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid))
|
||||
.map(Timeline::get_start_lsn)
|
||||
.unwrap_or(self.ancestor_lsn)
|
||||
}
|
||||
|
||||
fn get_current_logical_size(&self) -> usize {
|
||||
@@ -932,17 +1057,17 @@ impl LayeredTimeline {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<LayeredTimeline>>,
|
||||
ancestor: Option<LayeredTimelineEntry>,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
current_logical_size: usize,
|
||||
upload_relishes: bool,
|
||||
) -> Result<LayeredTimeline> {
|
||||
) -> LayeredTimeline {
|
||||
let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
.unwrap();
|
||||
let timeline = LayeredTimeline {
|
||||
LayeredTimeline {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
@@ -961,28 +1086,26 @@ impl LayeredTimeline {
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
current_logical_size: AtomicUsize::new(current_logical_size),
|
||||
current_logical_size_gauge,
|
||||
upload_relishes,
|
||||
upload_relishes: AtomicBool::new(upload_relishes),
|
||||
|
||||
write_lock: Mutex::new(()),
|
||||
|
||||
latest_gc_cutoff_lsn: AtomicLsn::from(metadata.latest_gc_cutoff_lsn()),
|
||||
initdb_lsn: metadata.initdb_lsn(),
|
||||
};
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan the timeline directory to populate the layer map.
|
||||
/// Returns all timeline-related files that were found and loaded.
|
||||
///
|
||||
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<Vec<PathBuf>> {
|
||||
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
let mut num_layers = 0;
|
||||
let (imgfilenames, deltafilenames) =
|
||||
filename::list_files(self.conf, self.timelineid, self.tenantid)?;
|
||||
|
||||
let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
|
||||
let mut local_layers = Vec::with_capacity(imgfilenames.len() + deltafilenames.len());
|
||||
// First create ImageLayer structs for each image file.
|
||||
for filename in &imgfilenames {
|
||||
if filename.lsn > disk_consistent_lsn {
|
||||
@@ -998,7 +1121,6 @@ impl LayeredTimeline {
|
||||
let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
local_layers.push(layer.path());
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
}
|
||||
@@ -1023,13 +1145,12 @@ impl LayeredTimeline {
|
||||
let layer = DeltaLayer::new(self.conf, self.timelineid, self.tenantid, filename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
local_layers.push(layer.path());
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
}
|
||||
info!("loaded layer map with {} layers", num_layers);
|
||||
|
||||
Ok(local_layers)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -1088,7 +1209,19 @@ impl LayeredTimeline {
|
||||
|
||||
while lsn < timeline.ancestor_lsn {
|
||||
trace!("going into ancestor {} ", timeline.ancestor_lsn);
|
||||
timeline = timeline.ancestor_timeline.as_ref().unwrap();
|
||||
timeline = match timeline
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid))
|
||||
{
|
||||
Some(timeline) => timeline,
|
||||
None => {
|
||||
bail!(
|
||||
"Cannot get the whole layer for read locked: timeline {} is not present locally",
|
||||
self.timelineid
|
||||
)
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Now we have the right starting timeline for our search.
|
||||
@@ -1127,13 +1260,23 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
// If not, check if there's a layer on the ancestor timeline
|
||||
if let Some(ancestor) = &timeline.ancestor_timeline {
|
||||
lsn = timeline.ancestor_lsn;
|
||||
timeline = ancestor.as_ref();
|
||||
trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn);
|
||||
continue;
|
||||
match &timeline.ancestor_timeline {
|
||||
Some(ancestor_entry) => {
|
||||
match ancestor_entry.local_or_schedule_download(self.tenantid) {
|
||||
Some(ancestor) => {
|
||||
lsn = timeline.ancestor_lsn;
|
||||
timeline = ancestor;
|
||||
trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn);
|
||||
continue;
|
||||
}
|
||||
None => bail!(
|
||||
"Cannot get a layer for read from remote ancestor timeline {}",
|
||||
self.timelineid
|
||||
),
|
||||
}
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1345,7 +1488,10 @@ impl LayeredTimeline {
|
||||
None
|
||||
};
|
||||
|
||||
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
|
||||
let ancestor_timelineid = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(LayeredTimelineEntry::timeline_id);
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
@@ -1363,7 +1509,7 @@ impl LayeredTimeline {
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
if self.upload_relishes {
|
||||
if self.upload_relishes.load(atomic::Ordering::Relaxed) {
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
@@ -1568,8 +1714,12 @@ impl LayeredTimeline {
|
||||
prior_lsn, self.timelineid
|
||||
);
|
||||
}
|
||||
// Now check ancestor timelines, if any
|
||||
else if let Some(ancestor) = &self.ancestor_timeline {
|
||||
// Now check ancestor timelines, if any are present locally
|
||||
else if let Some(ancestor) =
|
||||
self.ancestor_timeline.as_ref().and_then(|timeline_entry| {
|
||||
timeline_entry.local_or_schedule_download(self.tenantid)
|
||||
})
|
||||
{
|
||||
let prior_lsn = ancestor.get_last_record_lsn();
|
||||
if seg.rel.is_blocky() {
|
||||
info!(
|
||||
@@ -1717,7 +1867,7 @@ impl LayeredTimeline {
|
||||
let mut curr_lsn = lsn;
|
||||
loop {
|
||||
let result = layer_ref
|
||||
.get_page_reconstruct_data(blknum, curr_lsn, cached_lsn_opt, &mut data)
|
||||
.get_page_reconstruct_data(seg, blknum, curr_lsn, cached_lsn_opt, &mut data)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get reconstruct data {} {:?} {} {} {:?}",
|
||||
@@ -1737,8 +1887,8 @@ impl LayeredTimeline {
|
||||
// We landed on the same layer again. Shouldn't happen, but if it does,
|
||||
// don't get stuck in an infinite loop.
|
||||
bail!(
|
||||
"could not find predecessor layer of segment {} at {}",
|
||||
seg.rel,
|
||||
"could not find predecessor of layer {} at {}, layer returned its own LSN",
|
||||
layer_ref.filename().display(),
|
||||
cont_lsn
|
||||
);
|
||||
}
|
||||
@@ -1748,8 +1898,8 @@ impl LayeredTimeline {
|
||||
continue;
|
||||
} else {
|
||||
bail!(
|
||||
"could not find predecessor layer of segment {} at {}",
|
||||
seg.rel,
|
||||
"could not find predecessor of layer {} at {}",
|
||||
layer_ref.filename().display(),
|
||||
cont_lsn
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5,10 +5,10 @@ use anyhow::Result;
|
||||
use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
pub offset: u64,
|
||||
pub size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
|
||||
@@ -41,10 +41,10 @@ use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::page_versions::PageVersions;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::waldecoder;
|
||||
use crate::walrecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
@@ -119,16 +119,16 @@ impl From<&DeltaLayer> for Summary {
|
||||
pub struct DeltaLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
//
|
||||
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||
// start is inclusive, and end is exclusive.
|
||||
//
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
|
||||
@@ -144,10 +144,21 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
page_version_metas: VecMap<(SegmentTag, u32, Lsn), BlobRange>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
relsizes: VecMap<(SegmentTag, Lsn), u32>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
let slice = self.relsizes.slice_range((seg, Lsn(0))..=(seg, lsn));
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -182,6 +193,7 @@ impl Layer for DeltaLayer {
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
@@ -189,7 +201,7 @@ impl Layer for DeltaLayer {
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
assert!(seg.blknum_in_seg(blknum));
|
||||
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
|
||||
@@ -208,14 +220,14 @@ impl Layer for DeltaLayer {
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let minkey = (seg, blknum, Lsn(0));
|
||||
let maxkey = (seg, blknum, lsn);
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
for ((_seg, _blknum, pv_lsn), blob_range) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
@@ -244,6 +256,15 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& seg.rel.is_blocky()
|
||||
&& blknum - seg.segno * RELISH_SEG_SIZE >= inner.get_seg_size(seg, lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
@@ -257,7 +278,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
@@ -266,19 +287,13 @@ impl Layer for DeltaLayer {
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let slice = inner
|
||||
.relsizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
inner.get_seg_size(seg, lsn)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
assert_eq!(self.seg, seg, "range get_seg_exists not supported"); // TODO
|
||||
|
||||
// Is the requested LSN after the rel was dropped?
|
||||
if self.dropped && lsn >= self.end_lsn {
|
||||
return Ok(false);
|
||||
@@ -328,8 +343,8 @@ impl Layer for DeltaLayer {
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
for ((seg, lsn), v) in inner.relsizes.as_slice() {
|
||||
println!(" {}@{}: {}", seg, lsn, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
|
||||
@@ -338,18 +353,20 @@ impl Layer for DeltaLayer {
|
||||
let book = Book::new(file)?;
|
||||
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
for ((seg, blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
write!(&mut desc, "{}", seg)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
let wal_desc = walrecord::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
@@ -400,12 +417,20 @@ impl DeltaLayer {
|
||||
dropped: bool,
|
||||
page_versions: &PageVersions,
|
||||
cutoff: Option<Lsn>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
relsizes: &[(Lsn, u32)],
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let relsizes = {
|
||||
let mut m = VecMap::default();
|
||||
for &(lsn, size) in relsizes {
|
||||
m.append((seg, lsn), size).unwrap();
|
||||
}
|
||||
m
|
||||
};
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -445,7 +470,7 @@ impl DeltaLayer {
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.append((seg, blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -559,7 +584,7 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
seg: filename.start_seg,
|
||||
start_lsn: filename.start_lsn,
|
||||
end_lsn: filename.end_lsn,
|
||||
dropped: filename.dropped,
|
||||
@@ -601,7 +626,11 @@ impl DeltaLayer {
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::layered_repository::storage_layer::SegmentTag;
|
||||
use crate::relish::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use std::fmt;
|
||||
use std::fmt::{self, Write};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -15,10 +15,109 @@ use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
|
||||
fn parse_seg(input: &mut &str) -> Option<SegmentTag> {
|
||||
let rel = if let Some(rest) = input.strip_prefix("rel_") {
|
||||
let mut parts = rest.splitn(5, '_');
|
||||
let rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
*input = parts.next()?;
|
||||
debug_assert!(parts.next().is_none());
|
||||
rel
|
||||
} else if let Some(rest) = input.strip_prefix("pg_xact_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_multixact_members_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_multixact_offsets_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_filenodemap_") {
|
||||
let mut parts = rest.splitn(3, '_');
|
||||
let rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
*input = parts.next()?;
|
||||
debug_assert!(parts.next().is_none());
|
||||
rel
|
||||
} else if let Some(rest) = input.strip_prefix("pg_twophase_") {
|
||||
let (xid, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::TwoPhase {
|
||||
xid: xid.parse::<u32>().ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_control_checkpoint_") {
|
||||
*input = rest;
|
||||
RelishTag::Checkpoint
|
||||
} else if let Some(rest) = input.strip_prefix("pg_control_") {
|
||||
*input = rest;
|
||||
RelishTag::ControlFile
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
|
||||
let (segno, rest) = input.split_once('_')?;
|
||||
*input = rest;
|
||||
|
||||
Some(SegmentTag {
|
||||
rel,
|
||||
segno: segno.parse().ok()?,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_seg(seg: &SegmentTag) -> String {
|
||||
let mut s = match seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(&mut s, "_{}", seg.segno).unwrap();
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_seg: SegmentTag,
|
||||
pub end_seg: SegmentTag,
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
pub dropped: bool,
|
||||
@@ -38,59 +137,12 @@ impl DeltaFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let mut rest = fname;
|
||||
let start_seg = parse_seg(&mut rest)?;
|
||||
let end_seg = parse_seg(&mut rest)?;
|
||||
debug_assert!(start_seg < end_seg);
|
||||
|
||||
let mut parts = rest.split('_');
|
||||
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
@@ -107,7 +159,8 @@ impl DeltaFileName {
|
||||
}
|
||||
|
||||
Some(DeltaFileName {
|
||||
seg,
|
||||
start_seg,
|
||||
end_seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
@@ -117,36 +170,14 @@ impl DeltaFileName {
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
let start_seg = write_seg(&self.start_seg);
|
||||
let end_seg = write_seg(&self.end_seg);
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
start_seg,
|
||||
end_seg,
|
||||
u64::from(self.start_lsn),
|
||||
u64::from(self.end_lsn),
|
||||
if self.dropped { "_DROPPED" } else { "" }
|
||||
@@ -156,7 +187,8 @@ impl fmt::Display for DeltaFileName {
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct ImageFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_seg: SegmentTag,
|
||||
pub end_seg: SegmentTag,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -171,103 +203,31 @@ impl ImageFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut rest = fname;
|
||||
let start_seg = parse_seg(&mut rest)?;
|
||||
let end_seg = parse_seg(&mut rest)?;
|
||||
debug_assert!(start_seg < end_seg);
|
||||
|
||||
if rest.contains('_') {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let lsn = Lsn::from_hex(rest).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
|
||||
let lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
if parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(ImageFileName { seg, lsn })
|
||||
Some(ImageFileName {
|
||||
start_seg,
|
||||
end_seg,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
let start_seg = write_seg(&self.start_seg);
|
||||
let end_seg = write_seg(&self.end_seg);
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
u64::from(self.lsn),
|
||||
)
|
||||
write!(f, "{}_{}_{:016X}", start_seg, end_seg, u64::from(self.lsn),)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
//!
|
||||
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
|
||||
//!
|
||||
use crate::layered_repository::blob::read_blob;
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentTag,
|
||||
@@ -39,18 +40,20 @@ use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::blob::BlobRange;
|
||||
|
||||
// Magic constant to identify a Zenith segment image file
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
|
||||
/// Contains each block in block # order
|
||||
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
|
||||
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
|
||||
const BLOB_CHAPTER: u64 = 4;
|
||||
const META_CHAPTER: u64 = 5;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
const SUMMARY_CHAPTER: u64 = 3;
|
||||
@@ -87,28 +90,31 @@ const BLOCK_SIZE: usize = 8192;
|
||||
///
|
||||
pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub lsn: Lsn,
|
||||
lsn: Lsn,
|
||||
|
||||
inner: Mutex<ImageLayerInner>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum ImageType {
|
||||
Blocky { num_blocks: u32 },
|
||||
NonBlocky,
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
meta: VecMap<SegmentTag, BlobRange>,
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
fn get_seg_blob_range(&self, seg: SegmentTag) -> Result<BlobRange> {
|
||||
self.meta
|
||||
.as_slice()
|
||||
.binary_search_by_key(&&seg, |(seg, _meta)| seg)
|
||||
.map(|idx| self.meta.as_slice()[idx].1.clone())
|
||||
.map_err(|_| anyhow!("segment not found in ImageLayer"))
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -144,6 +150,7 @@ impl Layer for ImageLayer {
|
||||
/// Look up given page in the file
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
@@ -160,33 +167,29 @@ impl Layer for ImageLayer {
|
||||
|
||||
let base_blknum = blknum % RELISH_SEG_SIZE;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
if base_blknum >= *num_blocks {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
let blob_range = inner.get_seg_blob_range(seg)?;
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
let chapter = inner.book.as_ref().unwrap().chapter_reader(BLOB_CHAPTER)?;
|
||||
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(base_blknum == 0);
|
||||
inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
|
||||
.into_vec()
|
||||
let buf = if seg.rel.is_blocky() {
|
||||
// Check if the request is beyond EOF
|
||||
if base_blknum >= get_num_blocks(&blob_range) {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
|
||||
let block_offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
assert!(block_offset + BLOCK_SIZE as u64 <= blob_range.size as u64);
|
||||
|
||||
let offset = blob_range.offset + block_offset;
|
||||
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
} else {
|
||||
ensure!(base_blknum == 0);
|
||||
read_blob(&chapter, &blob_range)?
|
||||
};
|
||||
|
||||
reconstruct_data.page_img = Some(Bytes::from(buf));
|
||||
@@ -194,17 +197,26 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Get size of the segment
|
||||
fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
|
||||
let inner = self.load()?;
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => Ok(num_blocks),
|
||||
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
|
||||
fn get_seg_size(&self, seg: SegmentTag, _lsn: Lsn) -> Result<u32> {
|
||||
if !self.seg.rel.is_blocky() {
|
||||
bail!("get_seg_size called for non-blocky segment");
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let blob_range = inner.get_seg_blob_range(seg)?;
|
||||
Ok(get_num_blocks(&blob_range))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
|
||||
Ok(true)
|
||||
fn get_seg_exists(&self, seg: SegmentTag, _lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.load()?;
|
||||
|
||||
Ok(inner
|
||||
.meta
|
||||
.as_slice()
|
||||
.binary_search_by_key(&&seg, |(seg, _meta)| seg)
|
||||
.is_ok())
|
||||
}
|
||||
|
||||
fn unload(&self) -> Result<()> {
|
||||
@@ -234,15 +246,11 @@ impl Layer for ImageLayer {
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
for (seg, blob_range) in inner.meta.as_slice() {
|
||||
if seg.rel.is_blocky() {
|
||||
println!("{} ({}) blocks ", seg, get_num_blocks(blob_range));
|
||||
} else {
|
||||
println!("{} non-blocky ({} bytes)", seg, blob_range.size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -274,15 +282,7 @@ impl ImageLayer {
|
||||
lsn: Lsn,
|
||||
base_images: Vec<Bytes>,
|
||||
) -> Result<ImageLayer> {
|
||||
let image_type = if seg.rel.is_blocky() {
|
||||
let num_blocks: u32 = base_images.len().try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
assert_eq!(base_images.len(), 1);
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
|
||||
let layer = ImageLayer {
|
||||
let mut layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
@@ -290,10 +290,9 @@ impl ImageLayer {
|
||||
lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: image_type.clone(),
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
//
|
||||
@@ -308,22 +307,33 @@ impl ImageLayer {
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
let book = match &image_type {
|
||||
ImageType::Blocky { .. } => {
|
||||
let mut chapter = book.new_chapter(BLOCKY_IMAGES_CHAPTER);
|
||||
for block_bytes in base_images {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
chapter.write_all(&block_bytes)?;
|
||||
}
|
||||
chapter.close()?
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
let mut chapter = book.new_chapter(NONBLOCKY_IMAGE_CHAPTER);
|
||||
chapter.write_all(&base_images[0])?;
|
||||
chapter.close()?
|
||||
let mut blob_chapter = book.new_chapter(BLOB_CHAPTER);
|
||||
|
||||
let size = if seg.rel.is_blocky() {
|
||||
for block_bytes in &base_images {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
blob_chapter.write_all(block_bytes)?;
|
||||
}
|
||||
BLOCK_SIZE * base_images.len()
|
||||
} else {
|
||||
assert_eq!(base_images.len(), 1);
|
||||
blob_chapter.write_all(&base_images[0])?;
|
||||
base_images[0].len()
|
||||
};
|
||||
|
||||
let book = blob_chapter.close()?;
|
||||
|
||||
let inner = layer.inner.get_mut().unwrap();
|
||||
|
||||
inner
|
||||
.meta
|
||||
.append(seg, BlobRange { offset: 0, size })
|
||||
.unwrap();
|
||||
|
||||
let mut meta_chapter = book.new_chapter(META_CHAPTER);
|
||||
inner.meta.ser_into(&mut meta_chapter)?;
|
||||
let book = meta_chapter.close()?;
|
||||
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
@@ -341,8 +351,6 @@ impl ImageLayer {
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
@@ -354,13 +362,14 @@ impl ImageLayer {
|
||||
src: &dyn Layer,
|
||||
lsn: Lsn,
|
||||
) -> Result<ImageLayer> {
|
||||
// TODO needs to become an image of all segments in the layer
|
||||
let seg = src.get_seg_tag();
|
||||
let timelineid = timeline.timelineid;
|
||||
|
||||
let startblk;
|
||||
let size;
|
||||
if seg.rel.is_blocky() {
|
||||
size = src.get_seg_size(lsn)?;
|
||||
size = src.get_seg_size(seg, lsn)?;
|
||||
startblk = seg.segno * RELISH_SEG_SIZE;
|
||||
} else {
|
||||
size = 1;
|
||||
@@ -430,22 +439,13 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let images_len = chapter.len();
|
||||
ensure!(images_len % BLOCK_SIZE as u64 == 0);
|
||||
let num_blocks: u32 = (images_len / BLOCK_SIZE as u64).try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let meta = VecMap::des(&book.read_chapter(META_CHAPTER)?)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
book: Some(book),
|
||||
image_type,
|
||||
meta,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
@@ -462,11 +462,11 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
seg: filename.start_seg,
|
||||
lsn: filename.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -489,14 +489,18 @@ impl ImageLayer {
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
@@ -511,3 +515,9 @@ impl ImageLayer {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Must only be called for blob ranges of blocky relishes.
|
||||
fn get_num_blocks(blob_range: &BlobRange) -> u32 {
|
||||
assert_eq!(blob_range.size % BLOCK_SIZE, 0);
|
||||
(blob_range.size / BLOCK_SIZE).try_into().unwrap()
|
||||
}
|
||||
|
||||
@@ -106,7 +106,11 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
@@ -150,11 +154,14 @@ impl Layer for InMemoryLayer {
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert_eq!(self.seg, seg); // TODO
|
||||
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
@@ -193,6 +200,16 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum - self.seg.segno * RELISH_SEG_SIZE >= self.get_seg_size(seg, lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
@@ -210,7 +227,9 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
assert_eq!(self.seg, seg);
|
||||
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
@@ -222,7 +241,9 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
assert_eq!(self.seg, seg);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
@@ -507,7 +528,7 @@ impl InMemoryLayer {
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
let size = src.get_seg_size(seg, start_lsn)?;
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
}
|
||||
|
||||
@@ -596,7 +617,7 @@ impl InMemoryLayer {
|
||||
true,
|
||||
&inner.page_versions,
|
||||
None,
|
||||
inner.segsizes.clone(),
|
||||
inner.segsizes.as_slice(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
@@ -630,7 +651,7 @@ impl InMemoryLayer {
|
||||
false,
|
||||
&inner.page_versions,
|
||||
Some(end_lsn_inclusive),
|
||||
segsizes,
|
||||
segsizes.as_slice(), // TODO avoid copy above
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
|
||||
@@ -169,7 +169,7 @@ impl LayerMap {
|
||||
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
|
||||
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
|
||||
{
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
if let Some(exists) = segentry.exists_at_lsn(*seg, lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
@@ -177,7 +177,7 @@ impl LayerMap {
|
||||
}
|
||||
_ => {
|
||||
if tag == None {
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
if let Some(exists) = segentry.exists_at_lsn(*seg, lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
@@ -207,7 +207,7 @@ impl LayerMap {
|
||||
/// to avoid incorrectly making it visible.
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
|
||||
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
@@ -292,9 +292,9 @@ struct SegEntry {
|
||||
impl SegEntry {
|
||||
/// Does the segment exist at given LSN?
|
||||
/// Return None if object is not found in this SegEntry.
|
||||
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
|
||||
fn exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<Option<bool>> {
|
||||
if let Some(layer) = self.get(lsn) {
|
||||
Ok(Some(layer.get_seg_exists(lsn)?))
|
||||
Ok(Some(layer.get_seg_exists(seg, lsn)?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
@@ -139,6 +139,7 @@ pub trait Layer: Send + Sync {
|
||||
/// to collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
@@ -146,10 +147,10 @@ pub trait Layer: Send + Sync {
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
/// Return size of the segment at given LSN. (Only for blocky relations.)
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does the segment exist at given LSN? Or was it dropped before it.
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Does this layer only contain some data for the segment (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
@@ -22,8 +22,8 @@ pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod virtual_file;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod defaults {
|
||||
@@ -105,7 +105,7 @@ impl PageServerConf {
|
||||
//
|
||||
|
||||
fn tenants_path(&self) -> PathBuf {
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
|
||||
@@ -279,7 +279,8 @@ impl PageServerHandler {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Cannot handle pagerequests for a remote timeline")?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -301,17 +302,17 @@ impl PageServerHandler {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(&*timeline, &req)
|
||||
self.handle_get_nblocks_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
};
|
||||
|
||||
@@ -455,7 +456,8 @@ impl PageServerHandler {
|
||||
let _enter = span.enter();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Cannot handle basebackup request for a remote timeline")?;
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn)
|
||||
@@ -595,7 +597,8 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Failed to fetch local timeline for callmemaybe requests")?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
|
||||
|
||||
|
||||
@@ -8,8 +8,15 @@
|
||||
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//! Synchronization internals are split into submodules
|
||||
//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives
|
||||
//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
|
||||
//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
|
||||
//!
|
||||
//! * public API via to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_checkpoint_upload`]
|
||||
//! * public API via to interact with the external world:
|
||||
//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
|
||||
//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks,
|
||||
//! to be processed by the async loop
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
|
||||
//!
|
||||
@@ -17,10 +24,10 @@
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (schedule checkpoint upload) | upload/download |
|
||||
//! | pageserver | (enqueue timeline sync task) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (register downloaded layers) | |
|
||||
//! | | (apply new timeline sync states) | |
|
||||
//! +------------------------+ +---------<-------+
|
||||
//! |
|
||||
//! |
|
||||
@@ -36,94 +43,260 @@
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop unitialised, if configured so.
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! The loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports remote-only timelines to the pageserver, so it can
|
||||
//! query their downloads later if they are accessed.
|
||||
//!
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
|
||||
//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`],
|
||||
//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
|
||||
//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer timeline is downloaded.
|
||||
//! Yet a timeline cannot alter already existing files, and normally cannot remove them either: only a GC process is capable of removing unused files.
|
||||
//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
|
||||
//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
|
||||
//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer image is downloaded
|
||||
//!
|
||||
//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
|
||||
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
|
||||
//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata.
|
||||
//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported by the implementation, multiple pageservers can be separated within the same storage
|
||||
//! (e.g. using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
|
||||
//!
|
||||
//! * the uploads do not happen right after pageserver startup, they are registered when
|
||||
//! 1. pageserver does a checkpoint, which happens some time after the server starts
|
||||
//! 2. pageserver loads the timeline from disk for the first time
|
||||
//!
|
||||
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with higher priority could already be waiting
|
||||
//!
|
||||
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happen on an image scale: a big set of remote files,
|
||||
//! enough to represent (and recover, if needed) a certain timeline state. In contrast, all internal storage CRUD calls are made per relish file from those images.
|
||||
//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
|
||||
//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
|
||||
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ffi, fs,
|
||||
path::{Path, PathBuf},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::io;
|
||||
use tracing::{error, info};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub use self::storage_sync::schedule_timeline_checkpoint_upload;
|
||||
pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
|
||||
use self::{local_fs::LocalFs, rust_s3::S3};
|
||||
use crate::{PageServerConf, RemoteStorageKind};
|
||||
use crate::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
repository::TimelineSyncState,
|
||||
PageServerConf, RemoteStorageKind,
|
||||
};
|
||||
|
||||
/// Any timeline has its own id and its own tenant it belongs to,
|
||||
/// the sync processes group timelines by both for simplicity.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TimelineSyncId(ZTenantId, ZTimelineId);
|
||||
|
||||
impl std::fmt::Display for TimelineSyncId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "(tenant id: {}, timeline id: {})", self.0, self.1)
|
||||
}
|
||||
}
|
||||
|
||||
/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
|
||||
/// Successful initialization includes the case when the sync loop is not started, in which case the startup data is still returned,
|
||||
/// to simplify the receiving code.
|
||||
pub struct SyncStartupData {
|
||||
/// A sync state, derived from initial comparison of local timeline files and the remote archives,
|
||||
/// before any sync tasks are executed.
|
||||
/// To reuse the local file scan logic, the timeline states are returned even if no sync loop gets started during init:
|
||||
/// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
|
||||
pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
|
||||
/// A handle to the sync loop, if it was started from the configuration provided.
|
||||
pub sync_loop_handle: Option<thread::JoinHandle<anyhow::Result<()>>>,
|
||||
}
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external configuraion connection given, no thread or storage initialization is done.
|
||||
pub fn run_storage_sync_thread(
|
||||
/// If no external configuration is given, no thread or storage initialization is done.
|
||||
/// Along with that, scans local tenant files, and remote ones if the sync gets enabled, to determine the initial timeline states.
|
||||
pub fn start_local_timeline_sync(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let local_timeline_files = local_tenant_timeline_files(config)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
match &config.remote_storage_config {
|
||||
Some(storage_config) => {
|
||||
let max_concurrent_sync = storage_config.max_concurrent_sync;
|
||||
let max_sync_errors = storage_config.max_sync_errors;
|
||||
let handle = match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
};
|
||||
handle.map(Some)
|
||||
Some(storage_config) => match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
),
|
||||
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
),
|
||||
}
|
||||
.context("Failed to spawn the storage sync thread"),
|
||||
None => {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
let mut initial_timeline_states: HashMap<
|
||||
ZTenantId,
|
||||
HashMap<ZTimelineId, TimelineSyncState>,
|
||||
> = HashMap::new();
|
||||
for TimelineSyncId(tenant_id, timeline_id) in local_timeline_files.into_keys() {
|
||||
initial_timeline_states
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, TimelineSyncState::Ready);
|
||||
}
|
||||
Ok(SyncStartupData {
|
||||
initial_timeline_states,
|
||||
sync_loop_handle: None,
|
||||
})
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
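// Editorial illustration (not part of the original change): a minimal sketch of how a caller
// could consume `SyncStartupData`. The logging is an assumption (it presumes `TimelineSyncState`
// implements `Debug`); the real pageserver wiring goes through `tenant_mgr` and is not shown here.
//
//     let SyncStartupData {
//         initial_timeline_states,
//         sync_loop_handle,
//     } = start_local_timeline_sync(config)?;
//     for (tenant_id, timelines) in &initial_timeline_states {
//         for (timeline_id, state) in timelines {
//             info!("tenant {} timeline {} starts in state {:?}", tenant_id, timeline_id, state);
//         }
//     }
//     // Keep the handle (if any) to join it on shutdown and surface sync loop errors.
//     let _sync_loop_handle = sync_loop_handle;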
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &tenants_dir_entry {
|
||||
Ok(tenants_dir_entry) => {
|
||||
match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
|
||||
Ok(collected_files) => {
|
||||
local_tenant_timeline_files.extend(collected_files.into_iter())
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
tenants_dir_entry,
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:#}",
|
||||
tenants_dir_entry,
|
||||
tenants_dir.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut timelines: HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)> = HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
|
||||
for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir entry for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
})? {
|
||||
match timelines_dir_entry {
|
||||
Ok(timelines_dir_entry) => {
|
||||
let timeline_path = timelines_dir_entry.path();
|
||||
match collect_timeline_files(&timeline_path) {
|
||||
Ok((timeline_id, metadata, timeline_files)) => {
|
||||
timelines.insert(
|
||||
TimelineSyncId(tenant_id, timeline_id),
|
||||
(metadata, timeline_files),
|
||||
);
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to process timeline dir contents at '{}', reason: {:#}",
|
||||
timeline_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list timelines for entry tenant {}, reason: {:#}",
|
||||
tenant_id, e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(timelines)
|
||||
}
|
||||
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
|
||||
let mut timeline_files = Vec::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTimelineId>()
|
||||
.context("Could not parse timeline id out of the timeline dir name")?;
|
||||
let timeline_dir_entries =
|
||||
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
|
||||
for entry in timeline_dir_entries {
|
||||
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
|
||||
timeline_metadata_path = Some(entry_path);
|
||||
} else {
|
||||
timeline_files.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_metadata_path = match timeline_metadata_path {
|
||||
Some(path) => path,
|
||||
None => bail!("No metadata file found in the timeline directory"),
|
||||
};
|
||||
let metadata = TimelineMetadata::from_bytes(
|
||||
&fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
|
||||
)
|
||||
.context("Failed to parse timeline metadata file bytes")?;
|
||||
|
||||
Ok((timeline_id, metadata, timeline_files))
|
||||
}
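// Editorial illustration (not part of the original change): the directory layout the scan above
// expects, with example names only. Each timeline directory holds a metadata file (named by
// `METADATA_FILE_NAME`) that is parsed into `TimelineMetadata`, plus arbitrary timeline files
// that are collected into the returned `Vec<PathBuf>`:
//
//     <workdir>/tenants/<tenant_id>/timelines/<timeline_id>/
//         <METADATA_FILE_NAME>   <- TimelineMetadata, kept out of the file list
//         <timeline file>        <- collected as-is
//         <timeline file>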
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations with storage files.
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
|
||||
@@ -16,8 +16,16 @@ This way, the backups are managed in background, not affecting directly other pa
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* uploads layers, frozen by pageserver checkpoint thread
|
||||
* downloads and registers layers, found on the remote storage, but missing locally
|
||||
* synchronizes the differences with local timelines and remote states as fast as possible
|
||||
* uploads new relishes, frozen by pageserver checkpoint thread
|
||||
* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc)
|
||||
* uses compression when dealing with files, for better S3 usage
|
||||
* maintains an index of what's stored remotely
|
||||
* evicts failing tasks and stops the corresponding timelines
|
||||
|
||||
The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks.
|
||||
After any task eviction, or any error during startup checks (e.g. obviously different and wrong local and remote states for the same timeline),
|
||||
the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change.
|
||||
|
||||
No thorough optimisation or performance testing has been done; the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all currently open questions and prepare the feature to be enabled by default in cloud environments.
|
||||
@@ -27,21 +35,16 @@ It's planned to deal with all questions that are currently on and prepare the fe
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
|
||||
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
|
||||
AWS charges per API call and for traffic alike; layers are expected to be updated frequently, so this model is most probably inefficient.
|
||||
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
|
||||
* Remote storage file model is currently a custom archive format that cannot be deserialized without our particular Rust code (including `serde`).
|
||||
We also don't optimize the archiving and pack every timeline checkpoint separately, so the size of the resulting blob that lands on S3 can be arbitrary.
|
||||
But, it's a single blob, which is way better than storing ~780 small files separately.
|
||||
|
||||
Storage sync API operates on images when backing up or restoring a backup, so we're free to repack the layer contents the way we want to, which most probably will be done later.
|
||||
* Archive index restoration requires reading every blob's head.
|
||||
This could be avoided by a background thread/future storing the serialized index in the remote storage.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
Currently, every layer contains an `Lsn` in its name, to map the data it holds against a certain DB state.
|
||||
Then the images with the same ids and different `Lsn`'s are compared; files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
|
||||
No file contents assertion is done currently, but should be.
|
||||
AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research and, since current pageserver impl also needs to deal with layer file checksums.
|
||||
|
||||
For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
|
||||
No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation)
|
||||
|
||||
* sad rust-s3 api
|
||||
|
||||
@@ -55,19 +58,18 @@ But it's already used in the project, so for now it's reused to avoid bloating t
|
||||
Based on previous evaluation, even `rusoto-s3` could be a better choice than this library, but that needs further benchmarking.
|
||||
|
||||
|
||||
* gc and branches are ignored
|
||||
* gc is ignored
|
||||
|
||||
So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
|
||||
Only checkpointer loop affects the remote storage.
|
||||
So far, we don't adjust the remote storage based on GC thread loop results; only the checkpointer loop affects the remote storage.
|
||||
The index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
|
||||
|
||||
* more layers should be downloaded on demand
|
||||
* branches implementation could be improved
|
||||
|
||||
Since we download and load remote layers into pageserver, there's a possibility a need for those layers' ancestors arise.
|
||||
Most probably, every downloaded image's ancestor is not present in locally too, but currently there's no logic for downloading such ancestors and their metadata,
|
||||
so the pageserver is unable to respond property on requests to such ancestors.
|
||||
Currently, there's code to sync the branches along with the timeline files: on upload, all local branch files that are missing remotely are uploaded,
|
||||
on the timeline download, missing remote branch files are downloaded.
|
||||
|
||||
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
|
||||
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
|
||||
A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally.
|
||||
Currently, there's no other way to know about the remote branch files, and the file contents are neither verified nor updated.
|
||||
|
||||
* no IT tests
|
||||
|
||||
|
||||
@@ -355,7 +355,7 @@ mod pure_tests {
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
@@ -558,7 +558,7 @@ mod fs_tests {
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requrested"
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
@@ -575,7 +575,7 @@ mod fs_tests {
|
||||
assert_eq!(
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
"Second part bytes should be returned when requrested"
|
||||
"Second part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
File diff suppressed because it is too large
611
pageserver/src/remote_storage/storage_sync/compression.rs
Normal file
@@ -0,0 +1,611 @@
|
||||
//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data,
|
||||
//! without holding the entire data in memory.
|
||||
//! For the latter, both compress and uncompress functions operate on buffered streams (with a currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]),
|
||||
//! not attempting to hold the entire archive in memory.
|
||||
//!
|
||||
//! The compression is done with <a href="https://datatracker.ietf.org/doc/html/rfc8878">zstd</a> streaming algorithm via the `async-compression` crate.
|
||||
//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that is both async and has an API to manage part of an archive.
|
||||
//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression.
|
||||
//!
|
||||
//! Archiving is almost agnostic to timeline file types, with the exception of the metadata file, which is currently distinguished in the [un]compression code.
|
||||
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
|
||||
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
|
||||
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first.
|
||||
//!
|
||||
//! Archive structure:
|
||||
//! +----------------------------------------+
|
||||
//! | header | file_1, ..., file_k, metadata |
|
||||
//! +----------------------------------------+
|
||||
//!
|
||||
//! The archive consists of two separate zstd archives:
|
||||
//! * header archive, that contains all file names along with their sizes and relative paths in the timeline directory
|
||||
//! Header is a Rust structure, serialized into bytes and compressed with zstd.
|
||||
//! * files archive, that has the metadata file as the last entry, all compressed with zstd into a single binary blob
|
||||
//!
|
||||
//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file.
|
||||
//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`.
|
||||
//! This way, the header could be retrieved without reading an entire archive file.
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet,
|
||||
future::Future,
|
||||
io::Cursor,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME};
|
||||
|
||||
use super::index::RelativePath;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ArchiveHeader {
|
||||
/// All regular timeline files, excluding the metadata file.
|
||||
pub files: Vec<FileEntry>,
|
||||
// Metadata file name is known to the system, as its location relative to the timeline dir,
|
||||
// so no need to store anything but its size in bytes.
|
||||
pub metadata_file_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct FileEntry {
|
||||
/// Uncompressed file size, bytes.
|
||||
pub size: u64,
|
||||
/// A path, relative to the directory root, used when compressing the directory contents.
|
||||
pub subpath: RelativePath,
|
||||
}
|
||||
|
||||
const ARCHIVE_EXTENSION: &str = "-.zst_";
|
||||
const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024;
|
||||
|
||||
/// Streams an archive of files given into a stream target, defined by the closure.
|
||||
///
|
||||
/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into.
|
||||
/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents.
|
||||
///
|
||||
/// Performs the compression in multiple steps:
|
||||
/// * prepares an archive header, stripping the `source_dir` prefix from the `files`
|
||||
/// * generates the name of the archive
|
||||
/// * prepares archive producer future, knowing the header and the file list
|
||||
/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming.
|
||||
/// The writer end gets into the archive producer future, to put the header and a stream of compressed files.
|
||||
/// * prepares archive consumer future, by executing the provided closure
|
||||
/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere.
|
||||
/// * runs and waits for both futures to complete
|
||||
/// * on a successful completion of both futures, the header, its size and the user-defined consumer future's return data are returned
|
||||
/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data,
|
||||
/// needed for future processing.
|
||||
pub async fn archive_files_as_stream<Cons, ConsRet, Fut>(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata: &TimelineMetadata,
|
||||
create_archive_consumer: Cons,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)>
|
||||
where
|
||||
Cons: FnOnce(Box<dyn io::AsyncRead + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ConsRet>> + Send + 'static,
|
||||
ConsRet: Send + Sync + 'static,
|
||||
{
|
||||
let metadata_bytes = metadata
|
||||
.to_bytes()
|
||||
.context("Failed to create metadata bytes")?;
|
||||
let (archive_header, compressed_header_bytes) =
|
||||
prepare_header(source_dir, files, &metadata_bytes)
|
||||
.await
|
||||
.context("Failed to prepare file for archivation")?;
|
||||
|
||||
let header_size = compressed_header_bytes.len() as u64;
|
||||
let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_filler = write_archive_contents(
|
||||
source_dir.to_path_buf(),
|
||||
archive_header.clone(),
|
||||
metadata_bytes,
|
||||
write,
|
||||
);
|
||||
let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size);
|
||||
let archive_stream =
|
||||
Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read)));
|
||||
|
||||
let (archive_creation_result, archive_upload_result) = tokio::join!(
|
||||
tokio::spawn(archive_filler),
|
||||
tokio::spawn(async move {
|
||||
create_archive_consumer(Box::new(archive_stream), archive_name).await
|
||||
})
|
||||
);
|
||||
archive_creation_result
|
||||
.context("Failed to spawn archive creation future")?
|
||||
.context("Failed to create an archive")?;
|
||||
let upload_return_value = archive_upload_result
|
||||
.context("Failed to spawn archive upload future")?
|
||||
.context("Failed to upload the archive")?;
|
||||
|
||||
Ok((archive_header, header_size, upload_return_value))
|
||||
}
|
||||
|
||||
/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive,
|
||||
/// that contains files and is located after the header.
|
||||
/// S3 allows downloading partial file contents for a given file key (i.e. name); to accommodate this retrieval,
|
||||
/// a closure is used.
|
||||
/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the
|
||||
/// consumer and the receiver ends are swapped, since uncompression happens instead of compression.
|
||||
pub async fn uncompress_file_stream_with_index<Prod, ProdRet, Fut>(
|
||||
destination_dir: PathBuf,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
create_archive_file_part: Prod,
|
||||
) -> anyhow::Result<ProdRet>
|
||||
where
|
||||
Prod: FnOnce(Box<dyn io::AsyncWrite + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ProdRet>> + Send + 'static,
|
||||
ProdRet: Send + Sync + 'static,
|
||||
{
|
||||
let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_name = archive_name(disk_consistent_lsn, header_size);
|
||||
|
||||
let (archive_download_result, archive_uncompress_result) = tokio::join!(
|
||||
tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }),
|
||||
tokio::spawn(async move {
|
||||
uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await
|
||||
})
|
||||
);
|
||||
|
||||
let download_value = archive_download_result
|
||||
.context("Failed to spawn archive download future")?
|
||||
.context("Failed to download an archive")?;
|
||||
archive_uncompress_result
|
||||
.context("Failed to spawn archive uncompress future")?
|
||||
.context("Failed to uncompress the archive")?;
|
||||
|
||||
Ok(download_value)
|
||||
}
|
||||
|
||||
/// Reads archive header from the stream given:
|
||||
/// * parses the file name to get the header size
|
||||
/// * reads the exact amount of bytes
|
||||
/// * uncompresses and deserializes those
|
||||
pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
|
||||
archive_name: &str,
|
||||
from: &mut A,
|
||||
) -> anyhow::Result<ArchiveHeader> {
|
||||
let (_, header_size) = parse_archive_name(Path::new(archive_name))?;
|
||||
|
||||
let mut compressed_header_bytes = vec![0; header_size as usize];
|
||||
from.read_exact(&mut compressed_header_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read header header from the archive {}",
|
||||
archive_name
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut header_bytes = Vec::new();
|
||||
ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice()))
|
||||
.read_to_end(&mut header_bytes)
|
||||
.await
|
||||
.context("Failed to decompress a header from the archive")?;
|
||||
|
||||
Ok(ArchiveHeader::des(&header_bytes)
|
||||
.context("Failed to deserialize a header from the archive")?)
|
||||
}
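// Editorial illustration (not part of the original change): a minimal sketch of restoring a
// header from a locally stored copy of an archive; the helper name is made up for the example.
#[allow(dead_code)]
async fn peek_local_header_example(archive_path: &Path) -> anyhow::Result<ArchiveHeader> {
    let mut archive_file = fs::File::open(archive_path)
        .await
        .with_context(|| format!("Failed to open local archive {}", archive_path.display()))?;
    let archive_name = archive_path
        .file_name()
        .map(|name| name.to_string_lossy().to_string())
        .unwrap_or_default();
    // `read_archive_header` parses the header size out of the name and reads that many bytes
    // from the start of the stream.
    read_archive_header(&archive_name, &mut archive_file).await
}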
|
||||
|
||||
/// Reads the archive metadata out of the archive name:
|
||||
/// * `disk_consistent_lsn` of the checkpoint that was archived
|
||||
/// * size of the archive header
|
||||
pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
|
||||
let archive_name = archive_path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Archive '{}' has no file name", archive_path.display()))?
|
||||
.to_string_lossy();
|
||||
let (lsn_str, header_size_str) =
|
||||
archive_name.rsplit_once(ARCHIVE_EXTENSION).ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Archive '{}' has incorrect extension, expected to contain '{}'",
|
||||
archive_path.display(),
|
||||
ARCHIVE_EXTENSION
|
||||
)
|
||||
})?;
|
||||
let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid disk consistent lsn in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
let header_size = header_size_str.parse::<u64>().with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid a header offset number in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
Ok((disk_consistent_lsn, header_size))
|
||||
}
|
||||
|
||||
fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
|
||||
let archive_name = format!(
|
||||
"{:016X}{ARCHIVE_EXTENSION}{}",
|
||||
u64::from(disk_consistent_lsn),
|
||||
header_size,
|
||||
ARCHIVE_EXTENSION = ARCHIVE_EXTENSION,
|
||||
);
|
||||
archive_name
|
||||
}
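// Editorial illustration (not part of the original change): a round-trip of the naming scheme
// above, using the example name from the module docs; the concrete values are illustrative only.
#[cfg(test)]
#[test]
fn archive_name_round_trip_example() {
    let name = archive_name(Lsn(0x16B9150), 9732);
    assert_eq!(name, "00000000016B9150-.zst_9732");
    let (lsn, header_size) =
        parse_archive_name(Path::new(&name)).expect("example archive name should parse back");
    assert_eq!((lsn, header_size), (Lsn(0x16B9150), 9732));
}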
|
||||
|
||||
async fn uncompress_with_header(
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
destination_dir: &Path,
|
||||
header: ArchiveHeader,
|
||||
archive_after_header: impl io::AsyncRead + Send + Sync + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Uncompressing archive into {}", destination_dir.display());
|
||||
let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header));
|
||||
|
||||
if !destination_dir.exists() {
|
||||
fs::create_dir_all(&destination_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create target directory at {}",
|
||||
destination_dir.display()
|
||||
)
|
||||
})?;
|
||||
} else if !destination_dir.is_dir() {
|
||||
bail!(
|
||||
"Destination path '{}' is not a valid directory",
|
||||
destination_dir.display()
|
||||
);
|
||||
}
|
||||
debug!("Will extract {} files from the archive", header.files.len());
|
||||
for entry in header.files {
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&entry.subpath.as_path(destination_dir),
|
||||
entry.size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?;
|
||||
}
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&destination_dir.join(METADATA_FILE_NAME),
|
||||
header.metadata_file_size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.context("Failed to uncompress the metadata entry")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn uncompress_entry(
|
||||
archive: &mut ZstdDecoder<io::BufReader<impl io::AsyncRead + Send + Sync + Unpin>>,
|
||||
destination_path: &Path,
|
||||
entry_size: u64,
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(parent) = destination_path.parent() {
|
||||
fs::create_dir_all(parent).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create parent directory for {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
};
|
||||
|
||||
if files_to_skip.contains(destination_path) {
|
||||
debug!("Skipping {}", destination_path.display());
|
||||
copy_n_bytes(entry_size, archive, &mut io::sink())
|
||||
.await
|
||||
.context("Failed to skip bytes in the archive")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut destination =
|
||||
io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file {} for extraction",
|
||||
destination_path.display()
|
||||
)
|
||||
})?);
|
||||
copy_n_bytes(entry_size, archive, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to write extracted archive contents into file {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
destination
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to flush the streaming archive bytes")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_archive_contents(
|
||||
source_dir: PathBuf,
|
||||
header: ArchiveHeader,
|
||||
metadata_bytes: Vec<u8>,
|
||||
mut archive_input: io::DuplexStream,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Starting writing files into archive");
|
||||
for file_entry in header.files {
|
||||
let path = file_entry.subpath.as_path(&source_dir);
|
||||
let mut source_file =
|
||||
io::BufReader::new(fs::File::open(&path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file for archiving to path {}",
|
||||
path.display()
|
||||
)
|
||||
})?);
|
||||
let bytes_written = io::copy(&mut source_file, &mut archive_input)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open add a file into archive, file path {}",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_entry.size == bytes_written,
|
||||
"File {} was written to the archive incompletely",
|
||||
path.display()
|
||||
);
|
||||
trace!(
|
||||
"Added file '{}' ({} bytes) into the archive",
|
||||
path.display(),
|
||||
bytes_written
|
||||
);
|
||||
}
|
||||
let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
|
||||
.await
|
||||
.with_context(|| "Failed to add metadata into the archive")?;
|
||||
ensure!(
|
||||
header.metadata_file_size == metadata_bytes_written,
|
||||
"Metadata file was written to the archive incompletely",
|
||||
);
|
||||
|
||||
archive_input
|
||||
.shutdown()
|
||||
.await
|
||||
.context("Failed to finalize the archive")?;
|
||||
debug!("Successfully streamed all files into the archive");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prepare_header(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata_bytes: &[u8],
|
||||
) -> anyhow::Result<(ArchiveHeader, Vec<u8>)> {
|
||||
let mut archive_files = Vec::new();
|
||||
for file_path in files {
|
||||
let file_metadata = fs::metadata(file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata during archive indexing for {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_metadata.is_file(),
|
||||
"Archive indexed path {} is not a file",
|
||||
file_path.display()
|
||||
);
|
||||
|
||||
if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) {
|
||||
let entry = FileEntry {
|
||||
subpath: RelativePath::new(source_dir, file_path).with_context(|| {
|
||||
format!(
|
||||
"File '{}' does not belong to pageserver workspace",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
size: file_metadata.len(),
|
||||
};
|
||||
archive_files.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let header = ArchiveHeader {
|
||||
files: archive_files,
|
||||
metadata_file_size: metadata_bytes.len() as u64,
|
||||
};
|
||||
|
||||
debug!("Appending a header for {} files", header.files.len());
|
||||
let header_bytes = header.ser().context("Failed to serialize a header")?;
|
||||
debug!("Header bytes len {}", header_bytes.len());
|
||||
let mut compressed_header_bytes = Vec::new();
|
||||
ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice()))
|
||||
.read_to_end(&mut compressed_header_bytes)
|
||||
.await
|
||||
.context("Failed to compress header bytes")?;
|
||||
debug!(
|
||||
"Compressed header bytes len {}",
|
||||
compressed_header_bytes.len()
|
||||
);
|
||||
Ok((header, compressed_header_bytes))
|
||||
}
|
||||
|
||||
async fn copy_n_bytes(
|
||||
n: u64,
|
||||
from: &mut (impl io::AsyncRead + Send + Sync + Unpin),
|
||||
into: &mut (impl io::AsyncWrite + Send + Sync + Unpin),
|
||||
) -> anyhow::Result<()> {
|
||||
let bytes_written = io::copy(&mut from.take(n), into).await?;
|
||||
ensure!(
|
||||
bytes_written == n,
|
||||
"Failed to read exactly {} bytes from the input, bytes written: {}",
|
||||
n,
|
||||
bytes_written,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokio::{fs, io::AsyncSeekExt};
|
||||
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn compress_and_uncompress() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("compress_and_uncompress")?;
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
init_directory(
|
||||
&timeline_dir,
|
||||
vec![
|
||||
("first", "first_contents"),
|
||||
("second", "second_contents"),
|
||||
(METADATA_FILE_NAME, "wrong_metadata"),
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
let timeline_files = list_file_paths_with_contents(&timeline_dir).await?;
|
||||
assert_eq!(
|
||||
timeline_files,
|
||||
vec![
|
||||
(
|
||||
timeline_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Text("wrong_metadata".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Initial timeline contents should contain two normal files and a wrong metadata file"
|
||||
);
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0));
|
||||
let paths_to_archive = timeline_files
|
||||
.into_iter()
|
||||
.map(|(path, _)| path)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let tempdir = tempfile::tempdir()?;
|
||||
let base_path = tempdir.path().to_path_buf();
|
||||
let (header, header_size, archive_target) = archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
paths_to_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = base_path.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut file = fs::File::open(&archive_target).await?;
|
||||
file.seek(io::SeekFrom::Start(header_size)).await?;
|
||||
let target_dir = tempdir.path().join("extracted");
|
||||
uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?;
|
||||
|
||||
let extracted_files = list_file_paths_with_contents(&target_dir).await?;
|
||||
|
||||
assert_eq!(
|
||||
extracted_files,
|
||||
vec![
|
||||
(
|
||||
target_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
target_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Binary(metadata.to_bytes()?)
|
||||
),
|
||||
(
|
||||
target_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn init_directory(
|
||||
root: &Path,
|
||||
files_with_contents: Vec<(&str, &str)>,
|
||||
) -> anyhow::Result<()> {
|
||||
fs::create_dir_all(root).await?;
|
||||
for (file_name, contents) in files_with_contents {
|
||||
fs::File::create(root.join(file_name))
|
||||
.await?
|
||||
.write_all(contents.as_bytes())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum FileContents {
|
||||
Text(String),
|
||||
Binary(Vec<u8>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FileContents {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Text(text) => f.debug_tuple("Text").field(text).finish(),
|
||||
Self::Binary(bytes) => f
|
||||
.debug_tuple("Binary")
|
||||
.field(&format!("{} bytes", bytes.len()))
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_file_paths_with_contents(
|
||||
root: &Path,
|
||||
) -> anyhow::Result<Vec<(PathBuf, FileContents)>> {
|
||||
let mut file_paths = Vec::new();
|
||||
|
||||
let mut dir_listings = vec![fs::read_dir(root).await?];
|
||||
while let Some(mut dir_listing) = dir_listings.pop() {
|
||||
while let Some(entry) = dir_listing.next_entry().await? {
|
||||
let entry_path = entry.path();
|
||||
if entry_path.is_file() {
|
||||
let contents = match String::from_utf8(fs::read(&entry_path).await?) {
|
||||
Ok(text) => FileContents::Text(text),
|
||||
Err(e) => FileContents::Binary(e.into_bytes()),
|
||||
};
|
||||
file_paths.push((entry_path, contents));
|
||||
} else if entry_path.is_dir() {
|
||||
dir_listings.push(fs::read_dir(entry_path).await?);
|
||||
} else {
|
||||
info!(
|
||||
"Skipping path '{}' as it's not a file or a directory",
|
||||
entry_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file_paths.sort();
|
||||
Ok(file_paths)
|
||||
}
|
||||
}
|
||||
370
pageserver/src/remote_storage/storage_sync/download.rs
Normal file
@@ -0,0 +1,370 @@
|
||||
//! Timeline synchronization logic to put files from archives on remote storage into pageserver's local directory.
|
||||
//! Currently, tenant branch files are also downloaded, but this does not appear final.
|
||||
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use tokio::{fs, sync::RwLock};
|
||||
use tracing::{debug, error, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
layered_repository::metadata::{metadata_path, TimelineMetadata},
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
|
||||
update_index_description, SyncKind, SyncTask,
|
||||
},
|
||||
RemoteStorage, TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::{
|
||||
index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex},
|
||||
TimelineDownload,
|
||||
};
|
||||
|
||||
/// Attempts to download and uncompress files from all remote archives for the timeline given.
|
||||
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
|
||||
/// updated at the end of every checkpoint archive extraction.
|
||||
///
|
||||
/// Before any archives are considered, the branch files are checked locally and remotely; all remote-only files are downloaded.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
|
||||
/// (for any new successful archive downloads and extractions).
|
||||
pub(super) async fn download_timeline<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
mut download: TimelineDownload,
|
||||
retries: u32,
|
||||
) -> Option<bool> {
|
||||
debug!("Downloading layers for sync id {}", sync_id);
|
||||
if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.0).await {
|
||||
error!(
|
||||
"Failed to download missing branches for sync id {}: {:#}",
|
||||
sync_id, e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
|
||||
let index_read = remote_assets.1.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => {
|
||||
error!("Cannot download: no timeline is present in the index for given ids");
|
||||
return None;
|
||||
}
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => Cow::Borrowed(remote_timeline),
|
||||
Some(TimelineIndexEntry::Description(_)) => {
|
||||
drop(index_read);
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match update_index_description(
|
||||
remote_assets.as_ref(),
|
||||
&conf.timeline_path(&timeline_id, &tenant_id),
|
||||
sync_id,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(remote_timeline) => Cow::Owned(remote_timeline),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:#}", e);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut archives_to_download = remote_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.filter(|remote_archive| !download.archives_to_skip.contains(remote_archive))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let archives_total = archives_to_download.len();
|
||||
debug!("Downloading {} archives of a timeline", archives_total);
|
||||
|
||||
while let Some(archive_id) = archives_to_download.pop() {
|
||||
match try_download_archive(
|
||||
conf,
|
||||
sync_id,
|
||||
Arc::clone(&remote_assets),
|
||||
remote_timeline.as_ref(),
|
||||
archive_id,
|
||||
Arc::clone(&download.files_to_skip),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
let archives_left = archives_to_download.len();
|
||||
error!(
|
||||
"Failed to download archive {:?} for tenant {} timeline {} : {:#}, requeueing the download ({} archives left out of {})",
|
||||
archive_id, tenant_id, timeline_id, e, archives_left, archives_total
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
Ok(()) => {
|
||||
debug!("Successfully downloaded archive {:?}", archive_id);
|
||||
download.archives_to_skip.insert(archive_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Finished downloading all timeline's archives");
|
||||
Some(true)
|
||||
}
|
||||
|
||||
async fn try_download_archive<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
TimelineSyncId(tenant_id, timeline_id): TimelineSyncId,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
remote_timeline: &RemoteTimeline,
|
||||
archive_id: ArchiveId,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Downloading archive {:?}", archive_id);
|
||||
let archive_to_download = remote_timeline
|
||||
.archive_data(archive_id)
|
||||
.ok_or_else(|| anyhow!("Archive {:?} not found in remote storage", archive_id))?;
|
||||
let (archive_header, header_size) = remote_timeline
|
||||
.restore_header(archive_id)
|
||||
.context("Failed to restore header when downloading an archive")?;
|
||||
|
||||
match read_local_metadata(conf, timeline_id, tenant_id).await {
|
||||
Ok(local_metadata) => ensure!(
|
||||
// need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded
|
||||
local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(),
|
||||
"Cannot download archive with LSN {} since it's earlier than local LSN {}",
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
local_metadata.disk_consistent_lsn()
|
||||
),
|
||||
Err(e) => warn!("Failed to read local metadata file, assuing it's safe to override its with the download. Read: {:#}", e),
|
||||
}
|
||||
compression::uncompress_file_stream_with_index(
|
||||
conf.timeline_path(&timeline_id, &tenant_id),
|
||||
files_to_skip,
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
move |mut archive_target, archive_name| async move {
|
||||
let archive_local_path = conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(&archive_name);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.download_range(
|
||||
&remote_storage.storage_path(&archive_local_path)?,
|
||||
header_size,
|
||||
None,
|
||||
&mut archive_target,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_local_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let local_metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let local_metadata_bytes = fs::read(&local_metadata_path)
|
||||
.await
|
||||
.context("Failed to read local metadata file bytes")?;
|
||||
Ok(TimelineMetadata::from_bytes(&local_metadata_bytes)
|
||||
.context("Failed to read local metadata files bytes")?)
|
||||
}
|
||||
|
||||
async fn download_missing_branches<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_branches = tenant_branch_files(conf, tenant_id)
|
||||
.await
|
||||
.context("Failed to list local branch files for the tenant")?;
|
||||
let local_branches_dir = conf.branches_path(&tenant_id);
|
||||
if !local_branches_dir.exists() {
|
||||
fs::create_dir_all(&local_branches_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create local branches directory at path '{}'",
|
||||
local_branches_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
if let Some(remote_branches) = index.read().await.branch_files(tenant_id) {
|
||||
let mut remote_only_branches_downloads = remote_branches
|
||||
.difference(&local_branches)
|
||||
.map(|remote_only_branch| async move {
|
||||
let branches_dir = conf.branches_path(&tenant_id);
|
||||
let remote_branch_path = remote_only_branch.as_path(&branches_dir);
|
||||
let storage_path =
|
||||
storage.storage_path(&remote_branch_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to derive a storage path for branch with local path '{}'",
|
||||
remote_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut target_file = fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&remote_branch_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create local branch file at '{}'",
|
||||
remote_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
storage
|
||||
.download(&storage_path, &mut target_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download branch file from the remote path {:?}",
|
||||
storage_path
|
||||
)
|
||||
})?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut branch_downloads_failed = false;
|
||||
while let Some(download_result) = remote_only_branches_downloads.next().await {
|
||||
if let Err(e) = download_result {
|
||||
branch_downloads_failed = true;
|
||||
error!("Failed to download a branch file: {:#}", e);
|
||||
}
|
||||
}
|
||||
ensure!(
|
||||
!branch_downloads_failed,
|
||||
"Failed to download all branch files"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::test_utils::{
|
||||
assert_index_descriptions, assert_timeline_files_match, create_local_timeline,
|
||||
dummy_metadata, ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_timeline() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let tempdir_path = tempdir.path();
|
||||
let _ = zenith_utils::logging::init(tempdir_path.join("log.log"), false);
|
||||
|
||||
let repo_harness = RepoHarness::create("test_download_timeline")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir_path.to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
dummy_metadata(Lsn(0x30)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
fs::remove_dir_all(®ular_timeline_path).await?;
|
||||
let remote_regular_timeline = expect_timeline(index, sync_id).await;
|
||||
|
||||
download_timeline(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
TimelineDownload {
|
||||
files_to_skip: Arc::new(BTreeSet::new()),
|
||||
archives_to_skip: BTreeSet::new(),
|
||||
},
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(
|
||||
index,
|
||||
RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
408
pageserver/src/remote_storage/storage_sync/index.rs
Normal file
@@ -0,0 +1,408 @@
|
||||
//! In-memory index to track the tenant files on the remote storage, mitigating the storage format differences between the local and remote files.
//! Able to restore itself from the storage archive data and reconstruct archive indices on demand.
//!
//! The index is intended to be portable, so it deliberately does not store any local paths inside.
//! This way, in the future, the index could be restored quickly from its serialized stored form.
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::debug;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::TIMELINES_SEGMENT_NAME,
|
||||
remote_storage::{
|
||||
storage_sync::compression::{parse_archive_name, FileEntry},
|
||||
TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::compression::ArchiveHeader;
|
||||
|
||||
/// A part of the filesystem path that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
|
||||
let relative = path
|
||||
.as_ref()
|
||||
.strip_prefix(base)
|
||||
.context("path is not relative to base")?;
|
||||
Ok(RelativePath(relative.to_string_lossy().to_string()))
|
||||
}
|
||||
|
||||
/// Joins the relative path with the base path.
|
||||
pub fn as_path(&self, base: &Path) -> PathBuf {
|
||||
base.join(&self.0)
|
||||
}
|
||||
}
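
// A minimal usage sketch for `RelativePath` (the directory and file names here are
// hypothetical, chosen only for illustration): strip the base off a full path, then
// re-attach a base later, possibly a different one on another machine.
#[allow(dead_code)]
fn relative_path_round_trip_sketch() -> anyhow::Result<()> {
    let base = Path::new("/data/tenants/branches");
    let branch_file = base.join("main");
    // Only the "main" suffix is stored, so the value stays portable.
    let relative = RelativePath::new(base, &branch_file)?;
    assert_eq!(relative.as_path(base), branch_file);
    Ok(())
}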
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
/// Currently, timeline archives and branch files are tracked.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
|
||||
timeline_files: HashMap<TimelineSyncId, TimelineIndexEntry>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
/// Attempts to parse file paths (not checking the file contents) and find files
|
||||
/// that can be tracked with the index.
/// On parse failures, logs the error and continues, so an empty index can be created from unsuitable paths.
|
||||
pub fn try_parse_descriptions_from_paths<P: AsRef<Path>>(
|
||||
conf: &'static PageServerConf,
|
||||
paths: impl Iterator<Item = P>,
|
||||
) -> Self {
|
||||
let mut index = Self {
|
||||
branch_files: HashMap::new(),
|
||||
timeline_files: HashMap::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
|
||||
debug!(
|
||||
"Failed to parse path '{}' as index entry: {:#}",
|
||||
path.as_ref().display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
pub fn timeline_entry(&self, id: &TimelineSyncId) -> Option<&TimelineIndexEntry> {
|
||||
self.timeline_files.get(id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(&mut self, id: &TimelineSyncId) -> Option<&mut TimelineIndexEntry> {
|
||||
self.timeline_files.get_mut(id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(&mut self, id: TimelineSyncId, entry: TimelineIndexEntry) {
|
||||
self.timeline_files.insert(id, entry);
|
||||
}
|
||||
|
||||
pub fn all_sync_ids(&self) -> impl Iterator<Item = TimelineSyncId> + '_ {
|
||||
self.timeline_files.keys().copied()
|
||||
}
|
||||
|
||||
pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
|
||||
self.branch_files
|
||||
.entry(tenant_id)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(path);
|
||||
}
|
||||
|
||||
pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
|
||||
self.branch_files.get(&tenant_id)
|
||||
}
|
||||
}
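
// A minimal sketch (hypothetical tenant id and branch name) of how the sync code is
// expected to record an uploaded branch file in the index and query it back later;
// the upload path consults this set to skip branches that are already remote.
#[allow(dead_code)]
fn track_uploaded_branch_sketch(index: &mut RemoteTimelineIndex, tenant_id: ZTenantId) {
    let uploaded_branch = RelativePath("main".to_string());
    index.add_branch_file(tenant_id, uploaded_branch.clone());
    assert!(index
        .branch_files(tenant_id)
        .map_or(false, |files| files.contains(&uploaded_branch)));
}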
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TimelineIndexEntry {
|
||||
/// An archive found on the remote storage, but not yet downloaded: only the metadata from its storage path is available, without the archive contents.
|
||||
Description(BTreeMap<ArchiveId, ArchiveDescription>),
|
||||
/// Full archive metadata, including the file list, parsed from the archive header.
|
||||
Full(RemoteTimeline),
|
||||
}
|
||||
|
||||
impl TimelineIndexEntry {
|
||||
pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
|
||||
match self {
|
||||
TimelineIndexEntry::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).collect()
|
||||
}
|
||||
TimelineIndexEntry::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ArchiveId(pub(super) Lsn);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
struct FileId(ArchiveId, ArchiveEntryNumber);
|
||||
|
||||
type ArchiveEntryNumber = usize;
|
||||
|
||||
/// All archives and files in them, representing a certain timeline.
|
||||
/// Uses file and archive IDs to reference those without ownership issues.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_files: BTreeMap<FileId, FileEntry>,
|
||||
checkpoint_archives: BTreeMap<ArchiveId, CheckpointArchive>,
|
||||
}
|
||||
|
||||
/// Archive metadata, enough to restore a header with the timeline data.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct CheckpointArchive {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_file_size: u64,
|
||||
files: BTreeSet<FileId>,
|
||||
archive_header_size: u64,
|
||||
}
|
||||
|
||||
impl CheckpointArchive {
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
timeline_files: BTreeMap::new(),
|
||||
checkpoint_archives: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
|
||||
self.checkpoint_archives
|
||||
.values()
|
||||
.map(CheckpointArchive::disk_consistent_lsn)
|
||||
}
|
||||
|
||||
/// Lists all relish files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
|
||||
self.timeline_files
|
||||
.values()
|
||||
.map(|file_entry| file_entry.subpath.as_path(timeline_dir))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
self.checkpoint_archives
|
||||
.contains_key(&ArchiveId(disk_consistent_lsn))
|
||||
}
|
||||
|
||||
pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
|
||||
self.checkpoint_archives.get(&archive_id)
|
||||
}
|
||||
|
||||
/// Restores the header of a certain remote archive from the in-memory data.
/// Returns the header and its compressed size in the archive, both of which can be used to uncompress that archive.
|
||||
pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let archive = self
|
||||
.checkpoint_archives
|
||||
.get(&archive_id)
|
||||
.ok_or_else(|| anyhow!("Archive {:?} not found", archive_id))?;
|
||||
|
||||
let mut header_files = Vec::with_capacity(archive.files.len());
|
||||
for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
|
||||
let &FileId(archive_id, archive_position) = archive_file;
|
||||
ensure!(
|
||||
expected_archive_position == archive_position,
|
||||
"Archive header is corrupt, file # {} from archive {:?} header is missing",
|
||||
expected_archive_position,
|
||||
archive_id,
|
||||
);
|
||||
|
||||
let timeline_file = self.timeline_files.get(archive_file).ok_or_else(|| {
|
||||
anyhow!(
|
||||
"File with id {:?} not found for archive {:?}",
|
||||
archive_file,
|
||||
archive_id
|
||||
)
|
||||
})?;
|
||||
header_files.push(timeline_file.clone());
|
||||
}
|
||||
|
||||
Ok((
|
||||
ArchiveHeader {
|
||||
files: header_files,
|
||||
metadata_file_size: archive.metadata_file_size,
|
||||
},
|
||||
archive.archive_header_size,
|
||||
))
|
||||
}
|
||||
|
||||
/// Updates (creates, if necessary) the data about certain archive contents.
|
||||
pub fn update_archive_contents(
|
||||
&mut self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
) {
|
||||
let archive_id = ArchiveId(disk_consistent_lsn);
|
||||
let mut common_archive_files = BTreeSet::new();
|
||||
for (file_index, file_entry) in header.files.into_iter().enumerate() {
|
||||
let file_id = FileId(archive_id, file_index);
|
||||
self.timeline_files.insert(file_id, file_entry);
|
||||
common_archive_files.insert(file_id);
|
||||
}
|
||||
|
||||
let metadata_file_size = header.metadata_file_size;
|
||||
self.checkpoint_archives
|
||||
.entry(archive_id)
|
||||
.or_insert_with(|| CheckpointArchive {
|
||||
metadata_file_size,
|
||||
files: BTreeSet::new(),
|
||||
archive_header_size: header_size,
|
||||
disk_consistent_lsn,
|
||||
})
|
||||
.files
|
||||
.extend(common_archive_files.into_iter());
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about a timeline checkpoint archive, parsed from its remote storage path.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ArchiveDescription {
|
||||
pub header_size: u64,
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
pub archive_name: String,
|
||||
}
|
||||
|
||||
fn try_parse_index_entry(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
conf: &'static PageServerConf,
|
||||
path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
let tenant_id = path
|
||||
.strip_prefix(&tenants_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Path '{}' does not belong to tenants directory '{}'",
|
||||
path.display(),
|
||||
tenants_dir.display(),
|
||||
)
|
||||
})?
|
||||
.iter()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("Found no tenant id in path '{}'", path.display()))?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
|
||||
|
||||
let branches_path = conf.branches_path(&tenant_id);
|
||||
let timelines_path = conf.timelines_path(&tenant_id);
|
||||
match (
|
||||
RelativePath::new(&branches_path, &path),
|
||||
path.strip_prefix(&timelines_path),
|
||||
) {
|
||||
(Ok(_), Ok(_)) => bail!(
|
||||
"Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
|
||||
path.display(),
|
||||
branches_path.display(),
|
||||
timelines_path.display()
|
||||
),
|
||||
(Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
|
||||
(Err(_), Ok(timelines_subpath)) => {
|
||||
let mut segments = timelines_subpath.iter();
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"{} directory of tenant {} (path '{}') is not an index entry",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
tenant_id,
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline id from path '{}'", path.display())
|
||||
})?;
|
||||
|
||||
let (disk_consistent_lsn, header_size) =
|
||||
parse_archive_name(path).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse archive name out in path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let archive_name = path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Archive '{}' has no file name", path.display()))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
let sync_id = TimelineSyncId(tenant_id, timeline_id);
|
||||
let timeline_index_entry = index
|
||||
.timeline_files
|
||||
.entry(sync_id)
|
||||
.or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new()));
|
||||
match timeline_index_entry {
|
||||
TimelineIndexEntry::Description(descriptions) => {
|
||||
descriptions.insert(
|
||||
ArchiveId(disk_consistent_lsn),
|
||||
ArchiveDescription {
|
||||
header_size,
|
||||
disk_consistent_lsn,
|
||||
archive_name,
|
||||
},
|
||||
);
|
||||
}
|
||||
TimelineIndexEntry::Full(_) => {
|
||||
bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
(Err(branches_error), Err(timelines_strip_error)) => {
|
||||
bail!(
|
||||
"Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
|
||||
path.display(),
|
||||
branches_error,
|
||||
timelines_strip_error,
|
||||
)
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn header_restoration_preserves_file_order() {
|
||||
let header = ArchiveHeader {
|
||||
files: vec![
|
||||
FileEntry {
|
||||
size: 5,
|
||||
subpath: RelativePath("one".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 1,
|
||||
subpath: RelativePath("two".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 222,
|
||||
subpath: RelativePath("zero".to_string()),
|
||||
},
|
||||
],
|
||||
metadata_file_size: 5,
|
||||
};
|
||||
|
||||
let lsn = Lsn(1);
|
||||
let mut remote_timeline = RemoteTimeline::empty();
|
||||
remote_timeline.update_archive_contents(lsn, header.clone(), 15);
|
||||
|
||||
let (restored_header, _) = remote_timeline
|
||||
.restore_header(ArchiveId(lsn))
|
||||
.expect("Should be able to restore header from a valid remote timeline");
|
||||
|
||||
assert_eq!(
|
||||
header, restored_header,
|
||||
"Header restoration should preserve file order"
|
||||
);
|
||||
}
|
||||
}
|
||||
566
pageserver/src/remote_storage/storage_sync/upload.rs
Normal file
@@ -0,0 +1,566 @@
|
||||
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
//! Currently, tenant branch files are also uploaded, but this behavior is not considered final.
|
||||
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use tokio::{fs, sync::RwLock};
|
||||
use tracing::{debug, error, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
compression,
|
||||
index::{RemoteTimeline, TimelineIndexEntry},
|
||||
sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
|
||||
},
|
||||
RemoteStorage, TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint};
|
||||
|
||||
/// Attempts to compress and upload given checkpoint files.
|
||||
/// No extra checks for overlapping files are made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
|
||||
///
|
||||
/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
/// On success, populates the index with the new upload data.
|
||||
pub(super) async fn upload_timeline_checkpoint<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
new_checkpoint: NewCheckpoint,
|
||||
retries: u32,
|
||||
) -> Option<bool> {
|
||||
debug!("Uploading checkpoint for sync id {}", sync_id);
|
||||
if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.0).await {
|
||||
error!(
|
||||
"Failed to upload missing branches for sync id {}: {:#}",
|
||||
sync_id, e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
|
||||
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let index_read = index.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => None,
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)),
|
||||
Some(TimelineIndexEntry::Description(_)) => {
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
|
||||
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:#}", e);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let already_contains_upload_lsn = remote_timeline
|
||||
.as_ref()
|
||||
.map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn))
|
||||
.unwrap_or(false);
|
||||
if already_contains_upload_lsn {
|
||||
warn!(
|
||||
"Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.",
|
||||
new_upload_lsn
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
let already_uploaded_files = remote_timeline
|
||||
.map(|timeline| timeline.stored_files(&timeline_dir))
|
||||
.unwrap_or_default();
|
||||
drop(index_read);
|
||||
|
||||
match try_upload_checkpoint(
|
||||
config,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
&new_checkpoint,
|
||||
already_uploaded_files,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((archive_header, header_size)) => {
|
||||
let mut index_write = index.write().await;
|
||||
match index_write.timeline_entry_mut(&sync_id) {
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => {
|
||||
remote_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
}
|
||||
None | Some(TimelineIndexEntry::Description(_)) => {
|
||||
let mut new_timeline = RemoteTimeline::empty();
|
||||
new_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline));
|
||||
}
|
||||
}
|
||||
debug!("Checkpoint uploaded successfully");
|
||||
Some(true)
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to upload checkpoint: {:#}, requeueing the upload",
|
||||
e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
Some(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn try_upload_checkpoint<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
new_checkpoint: &NewCheckpoint,
|
||||
files_to_skip: BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let files_to_upload = new_checkpoint
|
||||
.layers
|
||||
.iter()
|
||||
.filter(|&path_to_upload| {
|
||||
if files_to_skip.contains(path_to_upload) {
|
||||
error!(
|
||||
"Skipping file upload '{}', since it was already uploaded",
|
||||
path_to_upload.display()
|
||||
);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ensure!(!files_to_upload.is_empty(), "No files to upload");
|
||||
|
||||
compression::archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
files_to_upload.into_iter(),
|
||||
&new_checkpoint.metadata,
|
||||
move |archive_streamer, archive_name| async move {
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.upload(
|
||||
archive_streamer,
|
||||
&remote_storage.storage_path(&timeline_dir.join(&archive_name))?,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.map(|(header, header_size, _)| (header, header_size))
|
||||
}
|
||||
|
||||
async fn upload_missing_branches<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_branches = tenant_branch_files(config, tenant_id)
|
||||
.await
|
||||
.context("Failed to list local branch files for the tenant")?;
|
||||
let index_read = index.read().await;
|
||||
let remote_branches = index_read
|
||||
.branch_files(tenant_id)
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
drop(index_read);
|
||||
|
||||
let mut branch_uploads = local_branches
|
||||
.difference(&remote_branches)
|
||||
.map(|local_only_branch| async move {
|
||||
let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
|
||||
let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to derive a storage path for branch with local path '{}'",
|
||||
local_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_branch_file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&local_branch_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open local branch file {} for reading",
|
||||
local_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
storage
|
||||
.upload(local_branch_file, &storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload branch file to the remote path {:?}",
|
||||
storage_path
|
||||
)
|
||||
})?;
|
||||
Ok::<_, anyhow::Error>(local_only_branch)
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut branch_uploads_failed = false;
|
||||
while let Some(upload_result) = branch_uploads.next().await {
|
||||
match upload_result {
|
||||
Ok(local_only_branch) => index
|
||||
.write()
|
||||
.await
|
||||
.add_branch_file(tenant_id, local_only_branch.clone()),
|
||||
Err(e) => {
|
||||
error!("Failed to upload branch file: {:#}", e);
|
||||
branch_uploads_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ensure!(!branch_uploads_failed, "Failed to upload all branch files");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::{
|
||||
index::ArchiveId,
|
||||
test_utils::{
|
||||
assert_index_descriptions, create_local_timeline, dummy_metadata,
|
||||
ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
},
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn reupload_timeline() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let uploaded_timeline = expect_timeline(index, sync_id).await;
|
||||
let uploaded_archives = uploaded_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
uploaded_archives.len(),
|
||||
1,
|
||||
"Only one archive is expected after a first upload"
|
||||
);
|
||||
let first_uploaded_archive = uploaded_archives.first().copied().unwrap();
|
||||
assert_eq!(
|
||||
uploaded_timeline.checkpoints().last(),
|
||||
Some(first_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline
|
||||
.archive_data(uploaded_archives.first().copied().unwrap())
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline.stored_files(&local_timeline_path),
|
||||
vec![local_timeline_path.join("a"), local_timeline_path.join("b")]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from the first checkpoint"
|
||||
);
|
||||
|
||||
let second_upload_metadata = dummy_metadata(Lsn(0x40));
|
||||
let second_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["b", "c"],
|
||||
second_upload_metadata.clone(),
|
||||
)?;
|
||||
assert!(
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
second_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
2,
|
||||
"Two archives are expected after a successful update of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive);
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let second_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert_eq!(
|
||||
updated_timeline.checkpoints().max(),
|
||||
Some(second_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline
|
||||
.archive_data(second_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
second_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from both checkpoints without duplicates"
|
||||
);
|
||||
|
||||
let third_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
let third_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["d"],
|
||||
third_upload_metadata.clone(),
|
||||
)?;
|
||||
assert_ne!(
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
assert!(
|
||||
third_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
third_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
3,
|
||||
"Three archives are expected after two successful updates of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| {
|
||||
archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive
|
||||
});
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let third_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert!(
|
||||
updated_timeline.checkpoints().max().unwrap()
|
||||
> third_upload_metadata.disk_consistent_lsn(),
|
||||
"Should not influence the last lsn by uploading an older checkpoint"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline
|
||||
.archive_data(third_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("d"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from three checkpoints without duplicates"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reupload_timeline_rejected() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
|
||||
let normal_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
assert_ne!(
|
||||
normal_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let checkpoint_with_no_files = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&[],
|
||||
normal_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_no_files,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, after_first_uploads.clone()).await;
|
||||
|
||||
let checkpoint_with_uploaded_lsn = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["something", "new"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_uploaded_lsn,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, after_first_uploads.clone()).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -16,11 +16,20 @@ use zenith_utils::zid::ZTimelineId;
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Stops all timeline-related processes in the repository and removes the timeline data from memory.
|
||||
fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
/// Updates timeline based on the new sync state, received from the remote storage synchronization.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn set_timeline_state(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
new_state: TimelineSyncState,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Gets current synchronization state of the timeline.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option<TimelineSyncState>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
@@ -34,7 +43,7 @@ pub trait Repository: Send + Sync {
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this funtion is periodically called by gc thread.
|
||||
/// this function is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
@@ -54,6 +63,43 @@ pub trait Repository: Send + Sync {
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A timeline that belongs to the current repository.
|
||||
pub enum RepositoryTimeline {
|
||||
/// Timeline, with its files present locally in pageserver's working directory.
|
||||
/// Loaded into pageserver's memory and ready to be used.
|
||||
Local(Arc<dyn Timeline>),
|
||||
/// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
|
||||
Remote(ZTimelineId),
|
||||
}
|
||||
|
||||
impl RepositoryTimeline {
|
||||
pub fn local_timeline(&self) -> Option<Arc<dyn Timeline>> {
|
||||
if let Self::Local(local_timeline) = self {
|
||||
Some(Arc::clone(local_timeline))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
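
// Illustrative sketch (a hypothetical helper, not part of the trait above): callers that
// need a timeline for page operations are expected to branch on `RepositoryTimeline`
// and treat the remote-only case as an error until the download completes.
#[allow(dead_code)]
fn require_local_timeline(timeline: RepositoryTimeline) -> anyhow::Result<Arc<dyn Timeline>> {
    match timeline {
        RepositoryTimeline::Local(local_timeline) => Ok(local_timeline),
        RepositoryTimeline::Remote(timeline_id) => anyhow::bail!(
            "timeline {} exists remotely only and needs to be downloaded first",
            timeline_id
        ),
    }
}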
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
|
||||
pub enum TimelineSyncState {
|
||||
/// No further downloads from the remote storage are needed.
|
||||
/// The timeline state is up-to-date or ahead of the remote storage one,
|
||||
/// ready to be used in any pageserver operation.
|
||||
Ready,
|
||||
/// Timeline is scheduled for downloading, but its current local state is not up to date with the remote storage.
|
||||
/// The timeline is not ready to be used in any pageserver operations, otherwise its local state might diverge from the remote version,
|
||||
/// making it impossible to sync it further.
|
||||
AwaitsDownload,
|
||||
/// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded.
|
||||
/// Cannot be used in any pageserver operations, since it is completely absent locally.
|
||||
CloudOnly,
|
||||
/// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization.
|
||||
/// Such timelines cannot have their state synchronized further.
|
||||
Evicted,
|
||||
}
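
// Illustrative sketch (hypothetical helper): every state except `Evicted` can still take
// part in remote storage synchronization, while only `Ready` timelines may serve
// regular pageserver operations.
#[allow(dead_code)]
fn can_sync_further(state: TimelineSyncState) -> bool {
    !matches!(state, TimelineSyncState::Evicted)
}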
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
@@ -266,6 +312,7 @@ pub mod repo_harness {
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
@@ -699,7 +746,10 @@ mod tests {
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
assert!(newtline
|
||||
@@ -757,7 +807,10 @@ mod tests {
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
@@ -905,7 +958,10 @@ mod tests {
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
@@ -923,7 +979,10 @@ mod tests {
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
|
||||
make_some_layers(&newtline, Lsn(0x60))?;
|
||||
|
||||
@@ -934,12 +993,17 @@ mod tests {
|
||||
let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id);
|
||||
|
||||
let expected_image_layer_path = tline_dir.join(format!(
|
||||
"rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
|
||||
"rel_{}_{}_{}_{}_{}_rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
0, // seg is 0
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
1, // end seg is 1
|
||||
0x20,
|
||||
0x30,
|
||||
));
|
||||
@@ -948,6 +1012,55 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_beyond_eof() -> Result<()> {
|
||||
let harness = RepoHarness::create("test_read_beyond_eof")?;
|
||||
let repo = harness.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Lsn(0x60),
|
||||
TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x50))),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
}
|
||||
|
||||
// Test read before rel creation. Should error out.
|
||||
assert!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err());
|
||||
|
||||
// Read block beyond end of relation at different points in time.
|
||||
// These reads should fall into different delta, image, and in-memory layers.
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE);
|
||||
|
||||
// Test on an in-memory layer with no preceding layer
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put_page_image(
|
||||
TESTREL_B,
|
||||
0,
|
||||
Lsn(0x70),
|
||||
TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x70));
|
||||
}
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
@@ -1038,13 +1151,13 @@ mod tests {
|
||||
// These files are considered to be in the future and will be renamed out
|
||||
// of the way
|
||||
let future_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8001),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008),
|
||||
format!("pg_control_0_pg_control_1_{:016X}", 0x8001),
|
||||
format!("pg_control_0_pg_control_1_{:016X}_{:016X}", 0x8001, 0x8008),
|
||||
];
|
||||
// But these are not:
|
||||
let past_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8000),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001),
|
||||
format!("pg_control_0_pg_control_1_{:016X}", 0x8000),
|
||||
format!("pg_control_0_pg_control_1_{:016X}_{:016X}", 0x7000, 0x8001),
|
||||
];
|
||||
|
||||
for filename in future_filenames.iter().chain(past_filenames.iter()) {
|
||||
|
||||
@@ -16,9 +16,10 @@ use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use crate::walrecord::*;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::repository::{Repository, Timeline, TimelineSyncState};
|
||||
use crate::tenant_threads;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
@@ -11,10 +11,8 @@ use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
@@ -30,11 +28,6 @@ struct Tenant {
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TenantState {
|
||||
// This tenant only exists in cloud storage. It cannot be accessed.
|
||||
CloudOnly,
|
||||
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
|
||||
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
|
||||
Downloading,
|
||||
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
|
||||
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
|
||||
//Ready,
|
||||
@@ -49,22 +42,9 @@ pub enum TenantState {
|
||||
Stopping,
|
||||
}
|
||||
|
||||
/// A remote storage timeline synchronization event, that needs another step
|
||||
/// to be fully completed.
|
||||
#[derive(Debug)]
|
||||
pub enum PostTimelineSyncStep {
|
||||
/// The timeline cannot be synchronized anymore due to some sync issues.
|
||||
/// Needs to be removed from pageserver, to avoid further data diverging.
|
||||
Evict,
|
||||
/// A new timeline got downloaded and needs to be loaded into pageserver.
|
||||
RegisterDownload,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
TenantState::CloudOnly => f.write_str("CloudOnly"),
|
||||
TenantState::Downloading => f.write_str("Downloading"),
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
TenantState::Idle => f.write_str("Idle"),
|
||||
TenantState::Stopping => f.write_str("Stopping"),
|
||||
@@ -78,101 +58,69 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
init_repo(conf, tenantid);
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
}
|
||||
|
||||
pub fn perform_post_timeline_sync_steps(
|
||||
/// Updates tenants' repositories, changing the state of their timelines in memory.
|
||||
pub fn set_timeline_states(
|
||||
conf: &'static PageServerConf,
|
||||
post_sync_steps: HashMap<(ZTenantId, ZTimelineId), PostTimelineSyncStep>,
|
||||
timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
|
||||
) {
|
||||
if post_sync_steps.is_empty() {
|
||||
if timeline_states.is_empty() {
|
||||
debug!("no timeline state updates to perform");
|
||||
return;
|
||||
}
|
||||
|
||||
info!("Performing {} post-sync steps", post_sync_steps.len());
|
||||
trace!("Steps: {:?}", post_sync_steps);
|
||||
info!("Updating states for {} timelines", timeline_states.len());
|
||||
trace!("States: {:?}", timeline_states);
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
for &(tenant_id, timeline_id) in post_sync_steps.keys() {
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Downloading,
|
||||
repo: None,
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => {
|
||||
init_timeline(repo.as_ref(), timeline_id);
|
||||
tenant.state = TenantState::Idle;
|
||||
return;
|
||||
}
|
||||
None => log::warn!("Initialize new repo"),
|
||||
}
|
||||
tenant.state = TenantState::Idle;
|
||||
}
|
||||
}
|
||||
|
||||
for ((tenant_id, timeline_id), post_sync_step) in post_sync_steps {
|
||||
match post_sync_step {
|
||||
PostTimelineSyncStep::Evict => {
|
||||
if let Err(e) = get_repository_for_tenant(tenant_id)
|
||||
.and_then(|repo| repo.unload_timeline(timeline_id))
|
||||
{
|
||||
error!(
|
||||
"Failed to remove repository for tenant {}, timeline {}: {:#}",
|
||||
tenant_id, timeline_id, e
|
||||
)
|
||||
}
|
||||
}
|
||||
PostTimelineSyncStep::RegisterDownload => {
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
let mut m = access_tenants();
|
||||
for (tenant_id, timeline_states) in timeline_states {
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo: None,
|
||||
});
|
||||
if let Err(e) = put_timelines_into_tenant(conf, tenant, tenant_id, timeline_states) {
|
||||
error!(
|
||||
"Failed to update timeline states for tenant {}: {:#}",
|
||||
tenant_id, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
fn put_timelines_into_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant: &mut Tenant,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let repo = match tenant.repo.as_ref() {
|
||||
Some(repo) => Arc::clone(repo),
|
||||
None => {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
tenant.repo = Some(Arc::clone(&repo));
|
||||
repo
|
||||
}
|
||||
};
|
||||
|
||||
for (timeline_id, timeline_state) in timeline_states {
|
||||
repo.set_timeline_state(timeline_id, timeline_state)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to update timeline {} state to {:?}",
|
||||
timeline_id, timeline_state
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Check this flag in the thread loops to know when to exit
|
||||
@@ -204,37 +152,24 @@ pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
}
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
let repo = Some(branches::create_repo(conf, tenantid, wal_redo_manager)?);
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
match access_tenants().entry(tenantid) {
|
||||
hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If tenant is not found in the repository, return CloudOnly state
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
|
||||
let m = access_tenants();
|
||||
match m.get(&tenantid) {
|
||||
Some(tenant) => tenant.state,
|
||||
None => TenantState::CloudOnly,
|
||||
}
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
|
||||
Some(access_tenants().get(&tenantid)?.state)
|
||||
}
|
||||
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
|
||||
@@ -251,7 +186,7 @@ pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<Te
|
||||
tenant.state = newstate;
|
||||
Ok(tenant.state)
|
||||
}
|
||||
None => bail!("Tenant not found for tenant {}", tenantid),
|
||||
None => bail!("Tenant not found for id {}", tenantid),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,13 +207,14 @@ pub fn get_timeline_for_tenant(
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
get_repository_for_tenant(tenantid)?
|
||||
.get_timeline(timelineid)
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
.get_timeline(timelineid)?
|
||||
.local_timeline()
|
||||
.ok_or_else(|| anyhow!("cannot fetch timeline {}", timelineid))
|
||||
}
|
||||
|
||||
fn list_tenantids() -> Result<Vec<ZTenantId>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
access_tenants()
|
||||
.iter()
|
||||
.map(|v| {
|
||||
let (tenantid, _) = v;
|
||||
Ok(*tenantid)
|
||||
@@ -294,8 +230,8 @@ pub struct TenantInfo {
|
||||
}
|
||||
|
||||
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
access_tenants()
|
||||
.iter()
|
||||
.map(|v| {
|
||||
let (id, tenant) = v;
|
||||
Ok(TenantInfo {
|
||||
|
||||
@@ -88,7 +88,7 @@ fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result
|
||||
}
|
||||
|
||||
trace!(
|
||||
"checkpointer thread stopped for tenant {} state is {}",
|
||||
"checkpointer thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
@@ -120,7 +120,7 @@ fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -135,13 +135,14 @@ fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
// TODO Rewrite this in a more adequate way using
|
||||
// condvar.wait_timeout() or something
|
||||
let mut sleep_time = conf.gc_period.as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
|
||||
{
|
||||
sleep_time -= 1;
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
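The TODO above asks for condvar.wait_timeout() instead of the one-second polling loop. A rough sketch of that approach; the Shutdown primitive and its wiring are hypothetical, not part of this commit:

use std::sync::{Condvar, Mutex};
use std::time::Duration;

struct Shutdown {
    requested: Mutex<bool>,
    cond: Condvar,
}

impl Shutdown {
    // Sleep for up to `period`; a notifier that sets `requested` and signals
    // the condvar wakes the sleeper early. Returns whether shutdown was requested.
    fn sleep_or_shutdown(&self, period: Duration) -> bool {
        let guard = self.requested.lock().unwrap();
        let (guard, _timed_out) = self.cond.wait_timeout(guard, period).unwrap();
        *guard
    }
}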
||||
trace!(
|
||||
"GC thread stopped for tenant {} state is {}",
|
||||
"GC thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
|
||||
@@ -10,13 +10,14 @@ use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant_threads;
|
||||
use crate::waldecoder::*;
|
||||
use crate::walrecord::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
use anyhow::{bail, Context, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
@@ -204,7 +205,13 @@ fn walreceiver_main(
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline =
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| {
|
||||
format!(
|
||||
"Can not start the walrecever for a remote tenant {}, timeline {}",
|
||||
tenantid, timelineid,
|
||||
)
|
||||
})?;
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
|
||||
@@ -1,220 +1,15 @@
|
||||
//!
|
||||
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, so that they can be stored in repository.
|
||||
//! Functions for parsing WAL records.
|
||||
//!
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::XLogLongPageHeaderData;
|
||||
use postgres_ffi::XLogPageHeaderData;
|
||||
use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
use postgres_ffi::XLogRecord;
|
||||
use postgres_ffi::{BlockNumber, OffsetNumber};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use tracing::*;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
|
||||
startlsn: Lsn, // LSN where this record starts
|
||||
contlen: u32,
|
||||
padlen: u32,
|
||||
|
||||
inputbuf: BytesMut,
|
||||
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
|
||||
startlsn: Lsn(0),
|
||||
contlen: 0,
|
||||
padlen: 0,
|
||||
|
||||
inputbuf: BytesMut::new(),
|
||||
recordbuf: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
self.lsn += n as u64;
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
/// DecodedBkpBlock represents per-page data contained in a WAL record.
|
||||
#[derive(Default)]
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
@@ -43,8 +43,8 @@ use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlMultiXactCreate;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::walrecord::XlMultiXactCreate;
|
||||
use crate::walrecord::XlXactParsedRecord;
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
|
||||
|
||||
@@ -13,6 +13,7 @@ pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod waldecoder;
|
||||
pub mod xlog_utils;
|
||||
|
||||
// See TransactionIdIsNormal in transam.h
|
||||
|
||||
postgres_ffi/src/waldecoder.rs (new file, 219 lines)
@@ -0,0 +1,219 @@
|
||||
//!
|
||||
//! Basic WAL stream decoding.
|
||||
//!
|
||||
//! This understands the WAL page and record format, enough to figure out where the WAL record
|
||||
//! boundaries are, and to reassemble WAL records that cross page boundaries.
|
||||
//!
|
||||
//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify; the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
use super::pg_constants;
|
||||
use super::xlog_utils::*;
|
||||
use super::XLogLongPageHeaderData;
|
||||
use super::XLogPageHeaderData;
|
||||
use super::XLogRecord;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
|
||||
startlsn: Lsn, // LSN where this record starts
|
||||
contlen: u32,
|
||||
padlen: u32,
|
||||
|
||||
inputbuf: BytesMut,
|
||||
|
||||
/// buffer used to reassemble records that cross page boundaries.
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
|
||||
startlsn: Lsn(0),
|
||||
contlen: 0,
|
||||
padlen: 0,
|
||||
|
||||
inputbuf: BytesMut::new(),
|
||||
recordbuf: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
self.lsn += n as u64;
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
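A minimal usage sketch of the decoder added above: feed raw WAL bytes, then poll until the buffered input runs dry. The decode_all wrapper and wal_chunk input are hypothetical; only the WalStreamDecoder API comes from this file:

use postgres_ffi::waldecoder::WalStreamDecoder;
use zenith_utils::lsn::Lsn;

fn decode_all(start_lsn: Lsn, wal_chunk: &[u8]) {
    let mut decoder = WalStreamDecoder::new(start_lsn);
    decoder.feed_bytes(wal_chunk);
    loop {
        match decoder.poll_decode() {
            // A complete record, plus the (aligned) LSN right after it.
            Ok(Some((end_lsn, rec))) => println!("{}-byte record ending at {}", rec.len(), end_lsn),
            // Not enough input buffered; the caller should feed_bytes() more and retry.
            Ok(None) => break,
            // Invalid page header, bad CRC, etc.
            Err(e) => {
                eprintln!("decode error: {}", e);
                break;
            }
        }
    }
}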
@@ -18,6 +18,6 @@ tokio = "1.11"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
clap = "2.33.0"
|
||||
rustls = "0.19.1"
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -96,14 +96,14 @@ def do_copy(args):
|
||||
src = args.src
|
||||
dst = args.dst
|
||||
|
||||
try:
|
||||
if src.is_dir():
|
||||
shutil.copytree(src, dst)
|
||||
else:
|
||||
shutil.copy(src, dst)
|
||||
except FileExistsError:
|
||||
if args.forbid_overwrite:
|
||||
raise
|
||||
if args.forbid_overwrite and dst.exists():
|
||||
raise FileExistsError(f"File exists: '{dst}'")
|
||||
|
||||
if src.is_dir():
|
||||
shutil.rmtree(dst, ignore_errors=True)
|
||||
shutil.copytree(src, dst)
|
||||
else:
|
||||
shutil.copy(src, dst)
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -70,7 +70,8 @@ def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
|
||||
cur = conn.cursor()
|
||||
|
||||
# check same tenant cannot be created twice
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
|
||||
with pytest.raises(psycopg2.DatabaseError,
|
||||
match=f'repo for {env.initial_tenant} already exists'):
|
||||
cur.execute(f'tenant_create {env.initial_tenant}')
|
||||
|
||||
# create one more tenant
|
||||
|
||||
@@ -340,6 +340,7 @@ class ProposerPostgres:
|
||||
"synchronous_standby_names = 'walproposer'\n",
|
||||
f"zenith.zenith_timeline = '{self.timeline_id}'\n",
|
||||
f"zenith.zenith_tenant = '{self.tenant_id}'\n",
|
||||
f"zenith.page_server_connstring = ''\n",
|
||||
f"wal_acceptors = '{wal_acceptors}'\n",
|
||||
])
|
||||
|
||||
|
||||
@@ -1170,6 +1170,40 @@ def test_output_dir(request: Any) -> str:
|
||||
return test_dir
|
||||
|
||||
|
||||
SKIP_DIRS = frozenset(('pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical'))
|
||||
|
||||
SKIP_FILES = frozenset(('pg_internal.init',
|
||||
'pg.log',
|
||||
'zenith.signal',
|
||||
'postgresql.conf',
|
||||
'postmaster.opts',
|
||||
'postmaster.pid',
|
||||
'pg_control'))
|
||||
|
||||
|
||||
def should_skip_dir(dirname: str) -> bool:
|
||||
return dirname in SKIP_DIRS
|
||||
|
||||
|
||||
def should_skip_file(filename: str) -> bool:
|
||||
if filename in SKIP_FILES:
|
||||
return True
|
||||
# check for temp table files according to https://www.postgresql.org/docs/current/storage-file-layout.html
|
||||
# i.e. "tBBB_FFF"
|
||||
if not filename.startswith('t'):
|
||||
return False
|
||||
|
||||
tmp_name = filename[1:].split('_')
|
||||
if len(tmp_name) != 2:
|
||||
return False
|
||||
|
||||
try:
|
||||
list(map(int, tmp_name))
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
#
|
||||
# Test helpers
|
||||
#
|
||||
@@ -1179,19 +1213,10 @@ def list_files_to_compare(pgdata_dir: str):
|
||||
for filename in filenames:
|
||||
rel_dir = os.path.relpath(root, pgdata_dir)
|
||||
# Skip some dirs and files we don't want to compare
|
||||
skip_dirs = ['pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical']
|
||||
skip_files = [
|
||||
'pg_internal.init',
|
||||
'pg.log',
|
||||
'zenith.signal',
|
||||
'postgresql.conf',
|
||||
'postmaster.opts',
|
||||
'postmaster.pid',
|
||||
'pg_control'
|
||||
]
|
||||
if rel_dir not in skip_dirs and filename not in skip_files:
|
||||
rel_file = os.path.join(rel_dir, filename)
|
||||
pgdata_files.append(rel_file)
|
||||
if should_skip_dir(rel_dir) or should_skip_file(filename):
|
||||
continue
|
||||
rel_file = os.path.join(rel_dir, filename)
|
||||
pgdata_files.append(rel_file)
|
||||
|
||||
pgdata_files.sort()
|
||||
log.info(pgdata_files)
|
||||
|
||||
Submodule vendor/postgres updated: be8bdba074...a97cfe8ed7
@@ -8,7 +8,6 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
regex = "1.4.5"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
hyper = "0.14"
|
||||
@@ -16,12 +15,11 @@ routerify = "2"
|
||||
fs2 = "0.4.3"
|
||||
lazy_static = "1.4.0"
|
||||
serde_json = "1"
|
||||
log = "0.4.14"
|
||||
tracing = "0.1.27"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
|
||||
tokio = "1.11"
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
anyhow = "1.0"
|
||||
@@ -32,9 +30,8 @@ signal-hook = "0.3.10"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -5,18 +5,21 @@ use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use const_format::formatcp;
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use tracing::*;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::{logging, tcp_listener, GIT_VERSION};
|
||||
|
||||
use tokio::sync::mpsc;
|
||||
use walkeeper::callmemaybe;
|
||||
use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
||||
use walkeeper::http;
|
||||
use walkeeper::s3_offload;
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::SafeKeeperConf;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals;
|
||||
use zenith_utils::{logging, tcp_listener, GIT_VERSION};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("safekeeper");
|
||||
@@ -44,6 +47,9 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
// FIXME this argument is no longer needed since pageserver address is forwarded from compute.
|
||||
// However, because this argument is in use by console's e2e tests, let's keep it for now and remove it separately.
|
||||
// So currently it is a noop.
|
||||
.arg(
|
||||
Arg::with_name("pageserver")
|
||||
.short("p")
|
||||
@@ -101,16 +107,12 @@ fn main() -> Result<()> {
|
||||
conf.listen_http_addr = addr.to_owned();
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("pageserver") {
|
||||
conf.pageserver_addr = Some(addr.to_owned());
|
||||
}
|
||||
|
||||
if let Some(ttl) = arg_matches.value_of("ttl") {
|
||||
conf.ttl = Some(humantime::parse_duration(ttl)?);
|
||||
}
|
||||
|
||||
if let Some(recall) = arg_matches.value_of("recall") {
|
||||
conf.recall_period = Some(humantime::parse_duration(recall)?);
|
||||
conf.recall_period = humantime::parse_duration(recall)?;
|
||||
}
|
||||
|
||||
start_safekeeper(conf)
|
||||
@@ -182,16 +184,35 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
);
|
||||
}
|
||||
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
let thread_result = wal_service::thread_main(conf, pg_listener);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
})?,
|
||||
);
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let conf_cloned = conf.clone();
|
||||
let wal_acceptor_thread = thread::Builder::new()
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
threads.push(wal_acceptor_thread);
|
||||
|
||||
let callmemaybe_thread = thread::Builder::new()
|
||||
.name("callmemaybe thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = callmemaybe::thread_main(conf, rx);
|
||||
if let Err(e) = thread_result {
|
||||
error!("callmemaybe thread terminated: {}", e);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(callmemaybe_thread);
|
||||
|
||||
// TODO: put more thoughts into handling of failed threads
|
||||
// We probably should restart them.
|
||||
|
||||
// NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
|
||||
signals.handle(|signal| {
|
||||
|
||||
walkeeper/src/callmemaybe.rs (new file, 249 lines)
@@ -0,0 +1,249 @@
|
||||
//!
|
||||
//! Callmemaybe module is responsible for periodically requesting
|
||||
//! pageserver to initiate wal streaming.
|
||||
//!
|
||||
//! Other threads can use CallmeEvent messages to subscribe or unsubscribe
|
||||
//! from the call list.
|
||||
//!
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::runtime;
|
||||
use tokio::sync::mpsc::UnboundedReceiver;
|
||||
use tokio::task;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::*;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
async fn request_callback(
|
||||
pageserver_connstr: String,
|
||||
listen_pg_addr_str: String,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
debug!(
|
||||
"callmemaybe request_callback Connecting to pageserver {}",
|
||||
&pageserver_connstr
|
||||
);
|
||||
let (client, connection) = tokio_postgres::connect(&pageserver_connstr, NoTls).await?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// use Config parsing because SockAddr parsing doesn't allow using host names instead of IP addresses
|
||||
let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str);
|
||||
let me_conf: postgres::config::Config = me_connstr.parse().unwrap();
|
||||
let (host, port) = connection_host_port(&me_conf);
|
||||
|
||||
let callme = format!(
|
||||
"callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'",
|
||||
tenantid, timelineid, host, port, timelineid, tenantid
|
||||
);
|
||||
|
||||
let _ = client.simple_query(&callme).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn thread_main(conf: SafeKeeperConf, rx: UnboundedReceiver<CallmeEvent>) -> Result<()> {
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
runtime.block_on(main_loop(conf, rx))
|
||||
}
|
||||
|
||||
/// Messages to the callmemaybe thread
|
||||
#[derive(Debug)]
|
||||
pub enum CallmeEvent {
|
||||
// add new subscription to the list
|
||||
Subscribe(ZTenantId, ZTimelineId, String),
|
||||
// remove the subscription from the list
|
||||
Unsubscribe(ZTenantId, ZTimelineId),
|
||||
// don't serve this subscription, but keep it in the list
|
||||
Pause(ZTenantId, ZTimelineId),
|
||||
// resume this subscription, if it exists,
|
||||
// but don't create a new one if it is gone
|
||||
Resume(ZTenantId, ZTimelineId),
|
||||
}
|
||||
|
||||
struct SubscriptionState {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
pageserver_connstr: String,
|
||||
handle: Option<task::JoinHandle<()>>,
|
||||
last_call_time: Instant,
|
||||
paused: bool,
|
||||
}
|
||||
|
||||
impl SubscriptionState {
|
||||
fn new(
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
pageserver_connstr: String,
|
||||
) -> SubscriptionState {
|
||||
SubscriptionState {
|
||||
tenantid,
|
||||
timelineid,
|
||||
pageserver_connstr,
|
||||
handle: None,
|
||||
last_call_time: Instant::now(),
|
||||
paused: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn pause(&mut self) {
|
||||
self.paused = true;
|
||||
self.abort_handle();
|
||||
}
|
||||
|
||||
fn resume(&mut self) {
|
||||
self.paused = false;
|
||||
}
|
||||
|
||||
// Most likely, the task has already successfully completed
|
||||
// and abort() won't have any effect.
|
||||
fn abort_handle(&mut self) {
|
||||
if let Some(handle) = self.handle.take() {
|
||||
handle.abort();
|
||||
|
||||
let timelineid = self.timelineid;
|
||||
let tenantid = self.tenantid;
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = handle.await {
|
||||
if err.is_cancelled() {
|
||||
warn!("callback task for timelineid={} tenantid={} was cancelled before spawning a new one",
|
||||
timelineid, tenantid);
|
||||
} else {
|
||||
error!(
|
||||
"callback task for timelineid={} tenantid={} failed: {}",
|
||||
timelineid, tenantid, err
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn call(&mut self, recall_period: Duration, listen_pg_addr: String) {
|
||||
// Ignore call request if this subscription is paused
|
||||
if self.paused {
|
||||
debug!(
|
||||
"ignore call request for paused subscription
|
||||
tenantid: {}, timelineid: {}",
|
||||
self.tenantid, self.timelineid
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if it is too early to recall
|
||||
if self.handle.is_some() && self.last_call_time.elapsed() < recall_period {
|
||||
debug!(
|
||||
"too early to recall. self.last_call_time.elapsed: {:?}, recall_period: {:?}
|
||||
tenantid: {}, timelineid: {}",
|
||||
self.last_call_time.elapsed(), recall_period, self.tenantid, self.timelineid
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// If previous task didn't complete in recall_period, it must be hanging,
|
||||
// so don't wait for it forever, just abort it and try again.
|
||||
self.abort_handle();
|
||||
|
||||
let timelineid = self.timelineid;
|
||||
let tenantid = self.tenantid;
|
||||
let pageserver_connstr = self.pageserver_connstr.clone();
|
||||
self.handle = Some(tokio::spawn(async move {
|
||||
request_callback(pageserver_connstr, listen_pg_addr, timelineid, tenantid)
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
error!(
|
||||
"callback task for timelineid={} tenantid={} failed: {}",
|
||||
timelineid, tenantid, e
|
||||
)
|
||||
});
|
||||
}));
|
||||
|
||||
// Update last_call_time
|
||||
self.last_call_time = Instant::now();
|
||||
debug!(
|
||||
"new call spawned. time {:?}
|
||||
tenantid: {}, timelineid: {}",
|
||||
self.last_call_time, self.tenantid, self.timelineid
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SubscriptionState {
|
||||
fn drop(&mut self) {
|
||||
self.abort_handle();
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver<CallmeEvent>) -> Result<()> {
|
||||
let subscriptions: Mutex<HashMap<(ZTenantId, ZTimelineId), SubscriptionState>> =
|
||||
Mutex::new(HashMap::new());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
request = rx.recv() =>
|
||||
{
|
||||
match request.ok_or_else(|| anyhow!("done"))?
|
||||
{
|
||||
CallmeEvent::Subscribe(tenantid, timelineid, pageserver_connstr) =>
|
||||
{
|
||||
let mut subscriptions = subscriptions.lock().unwrap();
|
||||
if let Some(mut sub) = subscriptions.insert((tenantid, timelineid),
|
||||
SubscriptionState::new(tenantid, timelineid, pageserver_connstr))
|
||||
{
|
||||
sub.call(conf.recall_period, conf.listen_pg_addr.clone());
|
||||
}
|
||||
debug!("callmemaybe. thread_main. subscribe callback request for timelineid={} tenantid={}",
|
||||
timelineid, tenantid);
|
||||
},
|
||||
CallmeEvent::Unsubscribe(tenantid, timelineid) => {
|
||||
let mut subscriptions = subscriptions.lock().unwrap();
|
||||
subscriptions.remove(&(tenantid, timelineid));
|
||||
debug!("callmemaybe. thread_main. unsubscribe callback request for timelineid={} tenantid={}",
|
||||
timelineid, tenantid);
|
||||
},
|
||||
CallmeEvent::Pause(tenantid, timelineid) => {
|
||||
let mut subscriptions = subscriptions.lock().unwrap();
|
||||
if let Some(sub) = subscriptions.get_mut(&(tenantid, timelineid))
|
||||
{
|
||||
sub.pause();
|
||||
};
|
||||
debug!("callmemaybe. thread_main. pause callback request for timelineid={} tenantid={}",
|
||||
timelineid, tenantid);
|
||||
},
|
||||
CallmeEvent::Resume(tenantid, timelineid) => {
|
||||
let mut subscriptions = subscriptions.lock().unwrap();
|
||||
if let Some(sub) = subscriptions.get_mut(&(tenantid, timelineid))
|
||||
{
|
||||
sub.resume();
|
||||
sub.call(conf.recall_period, conf.listen_pg_addr.clone());
|
||||
};
|
||||
|
||||
debug!("callmemaybe. thread_main. resume callback request for timelineid={} tenantid={}",
|
||||
timelineid, tenantid);
|
||||
},
|
||||
}
|
||||
},
|
||||
_ = tokio::time::sleep(conf.recall_period) => {
|
||||
let mut subscriptions = subscriptions.lock().unwrap();
|
||||
|
||||
for (&(_tenantid, _timelineid), state) in subscriptions.iter_mut() {
|
||||
state.call(conf.recall_period, conf.listen_pg_addr.clone());
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
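A sketch of the producer side of the channel that feeds main_loop above. The connection string and the spawning location are placeholders (the real wiring lives in the safekeeper's start_safekeeper); only the CallmeEvent and thread_main API come from this file:

use tokio::sync::mpsc;
use walkeeper::callmemaybe::{self, CallmeEvent};
use walkeeper::SafeKeeperConf;
use zenith_utils::zid::{ZTenantId, ZTimelineId};

fn wire_up(conf: SafeKeeperConf, tenantid: ZTenantId, timelineid: ZTimelineId) {
    let (tx, rx) = mpsc::unbounded_channel();

    // Consumer: periodically re-sends callmemaybe requests for every live subscription.
    std::thread::spawn(move || {
        let _ = callmemaybe::thread_main(conf, rx);
    });

    // Producer: subscribe when a WAL push starts...
    tx.send(CallmeEvent::Subscribe(
        tenantid,
        timelineid,
        "postgresql://no_user@127.0.0.1:6400/no_db".to_string(), // placeholder connstr
    ))
    .unwrap();

    // ...and unsubscribe (or Pause/Resume) when the stream goes away.
    tx.send(CallmeEvent::Unsubscribe(tenantid, timelineid)).unwrap();
}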
||||
@@ -9,8 +9,8 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use crc32c::crc32c_append;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
|
||||
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse};
|
||||
use crate::safekeeper::{
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use std::env;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
pub mod callmemaybe;
|
||||
pub mod http;
|
||||
pub mod json_ctrl;
|
||||
pub mod receive_wal;
|
||||
@@ -13,16 +13,19 @@ pub mod s3_offload;
|
||||
pub mod safekeeper;
|
||||
pub mod send_wal;
|
||||
pub mod timeline;
|
||||
pub mod upgrade;
|
||||
pub mod wal_service;
|
||||
|
||||
pub mod defaults {
|
||||
use const_format::formatcp;
|
||||
use std::time::Duration;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(1);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -39,11 +42,8 @@ pub struct SafeKeeperConf {
|
||||
pub no_sync: bool,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub pageserver_addr: Option<String>,
|
||||
// TODO (create issue) this is temporary, until protocol between PG<->SK<->PS rework
|
||||
pub pageserver_auth_token: Option<String>,
|
||||
pub ttl: Option<Duration>,
|
||||
pub recall_period: Option<Duration>,
|
||||
pub recall_period: Duration,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -61,12 +61,10 @@ impl Default for SafeKeeperConf {
|
||||
workdir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
ttl: None,
|
||||
recall_period: None,
|
||||
pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
|
||||
recall_period: defaults::DEFAULT_RECALL_PERIOD,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,80 +5,43 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
use postgres::{Client, Config, NoTls};
|
||||
use tracing::*;
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
|
||||
use crate::safekeeper::AcceptorProposerMessage;
|
||||
use crate::safekeeper::ProposerAcceptorMessage;
|
||||
|
||||
use crate::send_wal::SendWalHandler;
|
||||
use crate::timeline::TimelineTools;
|
||||
use crate::SafeKeeperConf;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::pq_proto::{BeMessage, FeMessage};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::callmemaybe::CallmeEvent;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
|
||||
pub struct ReceiveWalConn<'pg> {
|
||||
/// Postgres connection
|
||||
pg_backend: &'pg mut PostgresBackend,
|
||||
/// The cached result of `pg_backend.socket().peer_addr()` (roughly)
|
||||
peer_addr: SocketAddr,
|
||||
}
|
||||
|
||||
///
|
||||
/// Periodically request pageserver to call back.
|
||||
/// If pageserver already has replication channel, it will just ignore this request
|
||||
///
|
||||
fn request_callback(conf: SafeKeeperConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let ps_addr = conf.pageserver_addr.unwrap();
|
||||
let ps_connstr = format!(
|
||||
"postgresql://no_user:{}@{}/no_db",
|
||||
&conf.pageserver_auth_token.unwrap_or_default(),
|
||||
ps_addr
|
||||
);
|
||||
|
||||
// use Config parsing because SockAddr parsing doesn't allow using host names instead of IP addresses
|
||||
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_pg_addr);
|
||||
let me_conf: Config = me_connstr.parse().unwrap();
|
||||
let (host, port) = connection_host_port(&me_conf);
|
||||
let callme = format!(
|
||||
"callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'",
|
||||
tenantid, timelineid, host, port, timelineid, tenantid,
|
||||
);
|
||||
|
||||
loop {
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
);
|
||||
match Client::connect(&ps_connstr, NoTls) {
|
||||
Ok(mut client) => {
|
||||
if let Err(e) = client.simple_query(&callme) {
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
}
|
||||
}
|
||||
Err(e) => error!("Failed to connect to pageserver {}: {}", &ps_connstr, e),
|
||||
}
|
||||
|
||||
if let Some(period) = conf.recall_period {
|
||||
sleep(period);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
/// Pageserver connection string forwarded from compute
|
||||
/// NOTE that it is allowed to operate without a pageserver.
|
||||
/// So if the compute has no pageserver configured, do not use it.
|
||||
pageserver_connstr: Option<String>,
|
||||
}
|
||||
|
||||
impl<'pg> ReceiveWalConn<'pg> {
|
||||
pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> {
|
||||
pub fn new(
|
||||
pg: &'pg mut PostgresBackend,
|
||||
pageserver_connstr: Option<String>,
|
||||
) -> ReceiveWalConn<'pg> {
|
||||
let peer_addr = *pg.get_peer_addr();
|
||||
ReceiveWalConn {
|
||||
pg_backend: pg,
|
||||
peer_addr,
|
||||
pageserver_connstr,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +70,8 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
|
||||
/// Receive WAL from wal_proposer
|
||||
pub fn run(&mut self, swh: &mut SendWalHandler) -> Result<()> {
|
||||
let _enter = info_span!("WAL acceptor", timeline = %swh.timelineid.unwrap()).entered();
|
||||
|
||||
// Notify the libpq client that it's allowed to send `CopyData` messages
|
||||
self.pg_backend
|
||||
.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -128,19 +93,38 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
}
|
||||
|
||||
// if requested, ask pageserver to fetch wal from us
|
||||
// xxx: this place seems not really fitting
|
||||
if swh.conf.pageserver_addr.is_some() {
|
||||
// Need to establish replication channel with page server.
|
||||
// Since replication in postgres is initiated by the receiver, we should use the callmemaybe mechanism
|
||||
let conf = swh.conf.clone();
|
||||
let timelineid = swh.timeline.get().timelineid;
|
||||
let _ = thread::Builder::new()
|
||||
.name("request_callback thread".into())
|
||||
.spawn(move || {
|
||||
request_callback(conf, timelineid, tenant_id);
|
||||
// as long as this wal_stream is alive, callmemaybe thread
|
||||
// will send requests to pageserver
|
||||
let _guard = match self.pageserver_connstr {
|
||||
Some(ref pageserver_connstr) => {
|
||||
// Need to establish replication channel with page server.
|
||||
// Since replication in postgres is initiated by the receiver,
|
||||
// we should use the callmemaybe mechanism.
|
||||
let timelineid = swh.timeline.get().timelineid;
|
||||
let tx_clone = swh.tx.clone();
|
||||
let pageserver_connstr = pageserver_connstr.to_owned();
|
||||
swh.tx
|
||||
.send(CallmeEvent::Subscribe(
|
||||
tenant_id,
|
||||
timelineid,
|
||||
pageserver_connstr,
|
||||
))
|
||||
.unwrap_or_else(|e| {
|
||||
error!(
|
||||
"failed to send Subscribe request to callmemaybe thread {}",
|
||||
e
|
||||
);
|
||||
});
|
||||
|
||||
// create a guard to unsubscribe the callback when this wal_stream exits
|
||||
Some(SendWalHandlerGuard {
|
||||
tx: tx_clone,
|
||||
tenant_id,
|
||||
timelineid,
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
loop {
|
||||
let reply = swh
|
||||
@@ -155,3 +139,22 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SendWalHandlerGuard {
|
||||
tx: UnboundedSender<CallmeEvent>,
|
||||
tenant_id: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl Drop for SendWalHandlerGuard {
|
||||
fn drop(&mut self) {
|
||||
self.tx
|
||||
.send(CallmeEvent::Unsubscribe(self.tenant_id, self.timelineid))
|
||||
.unwrap_or_else(|e| {
|
||||
error!(
|
||||
"failed to send Unsubscribe request to callmemaybe thread {}",
|
||||
e
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ use crate::send_wal::SendWalHandler;
|
||||
use crate::timeline::{ReplicaState, Timeline, TimelineTools};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::{
|
||||
get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE, PG_TLI,
|
||||
};
|
||||
@@ -20,12 +19,17 @@ use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
use std::{str, thread};
|
||||
use tracing::*;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use zenith_utils::sock_split::ReadStream;
|
||||
|
||||
use crate::callmemaybe::CallmeEvent;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
|
||||
|
||||
// See: https://www.postgresql.org/docs/13/protocol-replication.html
|
||||
@@ -81,6 +85,31 @@ impl Drop for ReplicationConnGuard {
|
||||
}
|
||||
}
|
||||
|
||||
// XXX: Naming is a bit messy here.
|
||||
// This ReplicationStreamGuard lives as long as ReplicationConn
|
||||
// and current ReplicationConnGuard is tied to the background thread
|
||||
// that receives feedback.
|
||||
struct ReplicationStreamGuard {
|
||||
tx: UnboundedSender<CallmeEvent>,
|
||||
tenant_id: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl Drop for ReplicationStreamGuard {
|
||||
fn drop(&mut self) {
|
||||
// the connection with pageserver is lost,
|
||||
// resume callback subscription
|
||||
info!("Connection to pageserver is gone. Subscribe to callmemeybe again. tenantid {} timelineid {}",
|
||||
self.tenant_id, self.timelineid);
|
||||
|
||||
self.tx
|
||||
.send(CallmeEvent::Resume(self.tenant_id, self.timelineid))
|
||||
.unwrap_or_else(|e| {
|
||||
error!("failed to send Resume request to callmemaybe thread {}", e);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl ReplicationConn {
|
||||
/// Create a new `ReplicationConn`
|
||||
pub fn new(pgb: &mut PostgresBackend) -> Self {
|
||||
@@ -177,6 +206,8 @@ impl ReplicationConn {
|
||||
pgb: &mut PostgresBackend,
|
||||
cmd: &Bytes,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("WAL sender", timeline = %swh.timelineid.unwrap()).entered();
|
||||
|
||||
// spawn the background thread which receives HotStandbyFeedback messages.
|
||||
let bg_timeline = Arc::clone(swh.timeline.get());
|
||||
let bg_stream_in = self.stream_in.take().unwrap();
|
||||
@@ -221,6 +252,30 @@ impl ReplicationConn {
|
||||
};
|
||||
info!("Start replication from {:?} till {:?}", start_pos, stop_pos);
|
||||
|
||||
// Don't spam pageserver with callmemaybe queries
|
||||
// when replication connection with pageserver is already established.
|
||||
let _guard = {
|
||||
if swh.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
None
|
||||
} else {
|
||||
let timelineid = swh.timeline.get().timelineid;
|
||||
let tenant_id = swh.tenantid.unwrap();
|
||||
let tx_clone = swh.tx.clone();
|
||||
swh.tx
|
||||
.send(CallmeEvent::Pause(tenant_id, timelineid))
|
||||
.unwrap_or_else(|e| {
|
||||
error!("failed to send Pause request to callmemaybe thread {}", e);
|
||||
});
|
||||
|
||||
// create a guard to subscribe to the callback again when this connection exits
|
||||
Some(ReplicationStreamGuard {
|
||||
tx: tx_clone,
|
||||
tenant_id,
|
||||
timelineid,
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
// switch to copy
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
use anyhow::Result;
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use s3::bucket::Bucket;
|
||||
use s3::creds::Credentials;
|
||||
@@ -16,6 +15,7 @@ use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
use tokio::runtime;
|
||||
use tokio::time::sleep;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
@@ -8,13 +8,13 @@ use bytes::Buf;
|
||||
use bytes::BufMut;
|
||||
use bytes::Bytes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
use pageserver::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::xlog_utils::TimeLineID;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::min;
|
||||
use std::fmt;
|
||||
use std::io::Read;
|
||||
use tracing::*;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
@@ -30,7 +30,7 @@ use zenith_utils::pq_proto::SystemId;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 1;
|
||||
pub const SK_FORMAT_VERSION: u32 = 2;
|
||||
const SK_PROTOCOL_VERSION: u32 = 1;
|
||||
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
|
||||
@@ -102,7 +102,7 @@ impl fmt::Debug for TermHistory {
|
||||
}
|
||||
|
||||
/// Unique id of proposer. Not needed for correctness, used for monitoring.
|
||||
type PgUuid = [u8; 16];
|
||||
pub type PgUuid = [u8; 16];
|
||||
|
||||
/// Persistent consensus state of the acceptor.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -140,12 +140,9 @@ pub struct ServerInfo {
|
||||
}
|
||||
|
||||
/// Persistent information stored on safekeeper node
|
||||
/// On disk data is prefixed by magic and format version and followed by checksum.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SafeKeeperState {
|
||||
/// magic for verifying content the control file
|
||||
pub magic: u32,
|
||||
/// safekeeper format version
|
||||
pub format_version: u32,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
@@ -166,8 +163,6 @@ pub struct SafeKeeperState {
|
||||
impl SafeKeeperState {
|
||||
pub fn new() -> SafeKeeperState {
|
||||
SafeKeeperState {
|
||||
magic: SK_MAGIC,
|
||||
format_version: SK_FORMAT_VERSION,
|
||||
acceptor_state: AcceptorState {
|
||||
term: 0,
|
||||
term_history: TermHistory::empty(),
|
||||
@@ -414,8 +409,8 @@ impl AcceptorProposerMessage {
|
||||
}
|
||||
|
||||
pub trait Storage {
|
||||
/// Persist safekeeper state on disk, optionally syncing it.
|
||||
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
|
||||
/// Persist safekeeper state on disk.
|
||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
|
||||
/// Write piece of wal in buf to disk and sync it.
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
// Truncate WAL at specified LSN
|
||||
@@ -568,7 +563,7 @@ where
|
||||
self.s.server.ztli = msg.ztli;
|
||||
self.s.server.wal_seg_size = msg.wal_seg_size;
|
||||
self.storage
|
||||
.persist(&self.s, true)
|
||||
.persist(&self.s)
|
||||
.with_context(|| "failed to persist shared state")?;
|
||||
|
||||
self.metrics = SafeKeeperMetrics::new(self.s.server.ztli);
|
||||
@@ -598,7 +593,7 @@ where
|
||||
if self.s.acceptor_state.term < msg.term {
|
||||
self.s.acceptor_state.term = msg.term;
|
||||
// persist vote before sending it out
|
||||
self.storage.persist(&self.s, true)?;
|
||||
self.storage.persist(&self.s)?;
|
||||
resp.term = self.s.acceptor_state.term;
|
||||
resp.vote_given = true as u64;
|
||||
}
|
||||
@@ -610,7 +605,7 @@ where
|
||||
fn bump_if_higher(&mut self, term: Term) -> Result<()> {
|
||||
if self.s.acceptor_state.term < term {
|
||||
self.s.acceptor_state.term = term;
|
||||
self.storage.persist(&self.s, true)?;
|
||||
self.storage.persist(&self.s)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -649,7 +644,7 @@ where
|
||||
self.flush_lsn = msg.start_streaming_at;
|
||||
// and now adopt term history from proposer
|
||||
self.s.acceptor_state.term_history = msg.term_history.clone();
|
||||
self.storage.persist(&self.s, true)?;
|
||||
self.storage.persist(&self.s)?;
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
|
||||
@@ -753,7 +748,10 @@ where
|
||||
self.s.commit_lsn = self.commit_lsn;
|
||||
self.s.truncate_lsn = self.truncate_lsn;
|
||||
}
|
||||
self.storage.persist(&self.s, sync_control_file)?;
|
||||
|
||||
if sync_control_file {
|
||||
self.storage.persist(&self.s)?;
|
||||
}
|
||||
|
||||
let resp = self.append_response();
|
||||
info!(
|
||||
@@ -778,7 +776,7 @@ mod tests {
|
||||
}
|
||||
|
||||
impl Storage for InMemoryStorage {
|
||||
fn persist(&mut self, s: &SafeKeeperState, _sync: bool) -> Result<()> {
|
||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
||||
self.persisted_state = s.clone();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -17,7 +17,9 @@ use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::pq_proto::{BeMessage, FeStartupMessage, RowDescriptor, INT4_OID, TEXT_OID};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::callmemaybe::CallmeEvent;
|
||||
use crate::timeline::CreateControlFile;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
|
||||
/// Handler for streaming WAL from acceptor
|
||||
pub struct SendWalHandler {
|
||||
@@ -27,6 +29,8 @@ pub struct SendWalHandler {
|
||||
pub tenantid: Option<ZTenantId>,
|
||||
pub timelineid: Option<ZTimelineId>,
|
||||
pub timeline: Option<Arc<Timeline>>,
|
||||
//sender to communicate with callmemaybe thread
|
||||
pub tx: UnboundedSender<CallmeEvent>,
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for SendWalHandler {
|
||||
@@ -46,6 +50,7 @@ impl postgres_backend::Handler for SendWalHandler {
|
||||
if let Some(app_name) = sm.params.get("application_name") {
|
||||
self.appname = Some(app_name.clone());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -75,7 +80,17 @@ impl postgres_backend::Handler for SendWalHandler {
|
||||
} else if query_string.starts_with(b"START_REPLICATION") {
|
||||
ReplicationConn::new(pgb).run(self, pgb, &query_string)?;
|
||||
} else if query_string.starts_with(b"START_WAL_PUSH") {
|
||||
ReceiveWalConn::new(pgb)
|
||||
// TODO: this repeats query decoding logic from page_service so it is probably
|
||||
// a good idea to refactor it into pgbackend and pass a string to process_query instead of bytes
|
||||
let decoded_query_string = match query_string.last() {
|
||||
Some(0) => std::str::from_utf8(&query_string[..query_string.len() - 1])?,
|
||||
_ => std::str::from_utf8(&query_string)?,
|
||||
};
|
||||
let pageserver_connstr = decoded_query_string
|
||||
.split_whitespace()
|
||||
.nth(1)
|
||||
.map(|s| s.to_owned());
|
||||
ReceiveWalConn::new(pgb, pageserver_connstr)
|
||||
.run(self)
|
||||
.with_context(|| "failed to run ReceiveWalConn")?;
|
||||
} else if query_string.starts_with(b"JSON_CTRL") {
|
||||
@@ -88,13 +103,14 @@ impl postgres_backend::Handler for SendWalHandler {
|
||||
}
|
||||
|
||||
impl SendWalHandler {
|
||||
pub fn new(conf: SafeKeeperConf) -> Self {
|
||||
pub fn new(conf: SafeKeeperConf, tx: UnboundedSender<CallmeEvent>) -> Self {
|
||||
SendWalHandler {
|
||||
conf,
|
||||
appname: None,
|
||||
tenantid: None,
|
||||
timelineid: None,
|
||||
timeline: None,
|
||||
tx,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
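Note: the new START_WAL_PUSH branch pulls an optional pageserver connection string out of the raw query bytes. A minimal sketch of that decoding as a standalone helper; the function name and the test input are illustrative, not part of the patch.

// Hypothetical standalone helper mirroring the START_WAL_PUSH decoding above:
// strip the trailing NUL that libpq sends, then treat the second
// whitespace-separated token as an optional pageserver connection string.
fn parse_start_wal_push(query: &[u8]) -> anyhow::Result<Option<String>> {
    let decoded = match query.last() {
        Some(0) => std::str::from_utf8(&query[..query.len() - 1])?,
        _ => std::str::from_utf8(query)?,
    };
    Ok(decoded.split_whitespace().nth(1).map(|s| s.to_owned()))
}

#[cfg(test)]
mod start_wal_push_tests {
    use super::parse_start_wal_push;

    #[test]
    fn second_token_is_connstr() {
        let q = b"START_WAL_PUSH host=127.0.0.1 port=6400\0";
        assert_eq!(
            parse_start_wal_push(q).unwrap(),
            Some("host=127.0.0.1".to_owned())
        );
    }
}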
@@ -2,9 +2,9 @@
//! persistence and support for interaction between sending and receiving wal.

use anyhow::{anyhow, bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fs2::FileExt;
use lazy_static::lazy_static;
use log::*;
use postgres_ffi::xlog_utils::{find_end_of_wal, PG_TLI};
use std::cmp::{max, min};
use std::collections::HashMap;
@@ -13,6 +13,7 @@ use std::io::{Read, Seek, SeekFrom, Write};
use std::path::PathBuf;
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use tracing::*;
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
@@ -23,6 +24,7 @@ use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
    Storage, SK_FORMAT_VERSION, SK_MAGIC,
};
use crate::upgrade::upgrade_control_file;
use crate::SafeKeeperConf;
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
use std::convert::TryInto;
@@ -34,7 +36,7 @@ const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
// dedicated lockfile to prevent running several safekeepers on the same data
const LOCK_FILE_NAME: &str = "safekeeper.lock";
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();

/// Replica status: host standby feedback + disk consistent lsn
#[derive(Debug, Clone, Copy)]
@@ -83,20 +85,13 @@ pub enum CreateControlFile {
}

lazy_static! {
    static ref PERSIST_SYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
        "safekeeper_persist_sync_control_file_seconds",
    static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
        "safekeeper_persist_control_file_seconds",
        "Seconds to persist and sync control file, grouped by timeline",
        &["timeline_id"],
        DISK_WRITE_SECONDS_BUCKETS.to_vec()
    )
    .expect("Failed to register safekeeper_persist_sync_control_file_seconds histogram vec");
    static ref PERSIST_NOSYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
        "safekeeper_persist_nosync_control_file_seconds",
        "Seconds to persist and sync control file, grouped by timeline",
        &["timeline_id"],
        DISK_WRITE_SECONDS_BUCKETS.to_vec()
    )
    .expect("Failed to register safekeeper_persist_nosync_control_file_seconds histogram vec");
    .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec");
}

impl SharedState {
@@ -134,7 +129,7 @@ impl SharedState {
        timelineid: ZTimelineId,
        create: CreateControlFile,
    ) -> Result<Self> {
        let (file_storage, state) = SharedState::load_from_control_file(conf, timelineid, create)
        let (file_storage, state) = FileStorage::load_from_control_file(conf, timelineid, create)
            .with_context(|| "failed to load from control file")?;
        let flush_lsn = if state.server.wal_seg_size != 0 {
            let wal_dir = conf.timeline_dir(&timelineid);
@@ -155,111 +150,6 @@ impl SharedState {
            replicas: Vec::new(),
        })
    }

    /// Fetch and lock control file (prevent running more than one instance of safekeeper)
    /// If create=false and file doesn't exist, bails out.
    fn load_from_control_file(
        conf: &SafeKeeperConf,
        timelineid: ZTimelineId,
        create: CreateControlFile,
    ) -> Result<(FileStorage, SafeKeeperState)> {
        let timeline_dir = conf.timeline_dir(&timelineid);

        let control_file_path = timeline_dir.join(CONTROL_FILE_NAME);
        let lock_file_path = timeline_dir.join(LOCK_FILE_NAME);

        info!(
            "loading control file {}, create={:?} lock file {:?}",
            control_file_path.display(),
            create,
            lock_file_path.display(),
        );

        let lock_file = File::create(&lock_file_path).with_context(|| "failed to open lockfile")?;

        // Lock file to prevent two or more active safekeepers
        lock_file.try_lock_exclusive().map_err(|e| {
            anyhow!(
                "control file {:?} is locked by some other process: {}",
                &control_file_path,
                e
            )
        })?;

        let mut control_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(matches!(create, CreateControlFile::True))
            .open(&control_file_path)
            .with_context(|| {
                format!(
                    "failed to open control file at {}",
                    control_file_path.display(),
                )
            })?;

        // Empty file is legit on 'create', don't try to deser from it.
        let state = if control_file.metadata().unwrap().len() == 0 {
            if let CreateControlFile::False = create {
                bail!("control file is empty");
            }
            SafeKeeperState::new()
        } else {
            let mut buf = Vec::new();
            control_file
                .read_to_end(&mut buf)
                .with_context(|| "failed to read control file")?;

            let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);

            let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
                buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
            let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);

            ensure!(
                calculated_checksum == expected_checksum,
                format!(
                    "safe keeper state checksum mismatch expected {} got {}",
                    expected_checksum, calculated_checksum
                )
            );

            let state =
                SafeKeeperState::des(&buf[..buf.len() - CHECKSUM_SIZE]).with_context(|| {
                    format!(
                        "failed to deserialize safe keeper state from control file at {}",
                        control_file_path.display(),
                    )
                })?;

            if state.magic != SK_MAGIC {
                bail!("bad control file magic: {}", state.magic);
            }
            if state.format_version != SK_FORMAT_VERSION {
                bail!(
                    "Got incompatible format version, expected {}, got {}",
                    SK_FORMAT_VERSION,
                    state.format_version,
                );
            }
            state
        };

        let timelineid_str = format!("{}", timelineid);

        Ok((
            FileStorage {
                lock_file,
                timeline_dir,
                conf: conf.clone(),
                persist_sync_control_file_seconds: PERSIST_SYNC_CONTROL_FILE_SECONDS
                    .with_label_values(&[&timelineid_str]),
                persist_nosync_control_file_seconds: PERSIST_NOSYNC_CONTROL_FILE_SECONDS
                    .with_label_values(&[&timelineid_str]),
            },
            state,
        ))
    }
}

/// Database instance (tenant)
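Note: the sync/nosync histogram pair collapses into one metric because persist() now always syncs. A rough sketch of the labeled-timer pattern this metric relies on, written against the prometheus crate directly (zenith_metrics wraps equivalent macros, so the names and the missing bucket argument here are simplifying assumptions):

use prometheus::{register_histogram_vec, HistogramVec};

lazy_static::lazy_static! {
    // Illustrative stand-in for PERSIST_CONTROL_FILE_SECONDS.
    static ref PERSIST_SECONDS: HistogramVec = register_histogram_vec!(
        "example_persist_control_file_seconds",
        "Seconds to persist and sync control file, grouped by timeline",
        &["timeline_id"]
    )
    .expect("failed to register histogram");
}

fn persist_with_timing(timeline_id: &str) {
    // The timer observes the elapsed time into the labeled histogram
    // when it is dropped at the end of the scope.
    let _timer = PERSIST_SECONDS
        .with_label_values(&[timeline_id])
        .start_timer();
    // ... write and fsync the control file here ...
}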
@@ -441,20 +331,123 @@ struct FileStorage {
    // save timeline dir to avoid reconstructing it every time
    timeline_dir: PathBuf,
    conf: SafeKeeperConf,
    persist_sync_control_file_seconds: Histogram,
    persist_nosync_control_file_seconds: Histogram,
    persist_control_file_seconds: Histogram,
}

impl FileStorage {
    // Check the magic/version in the on-disk data and deserialize it, if possible.
    fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
        // Read the version independent part
        let magic = buf.read_u32::<LittleEndian>()?;
        if magic != SK_MAGIC {
            bail!(
                "bad control file magic: {:X}, expected {:X}",
                magic,
                SK_MAGIC
            );
        }
        let version = buf.read_u32::<LittleEndian>()?;
        if version == SK_FORMAT_VERSION {
            let res = SafeKeeperState::des(buf)?;
            return Ok(res);
        }
        // try to upgrade
        upgrade_control_file(buf, version)
    }

    /// Fetch and lock control file (prevent running more than one instance of safekeeper)
    /// If create=false and file doesn't exist, bails out.
    fn load_from_control_file(
        conf: &SafeKeeperConf,
        timelineid: ZTimelineId,
        create: CreateControlFile,
    ) -> Result<(FileStorage, SafeKeeperState)> {
        let timeline_dir = conf.timeline_dir(&timelineid);

        let control_file_path = timeline_dir.join(CONTROL_FILE_NAME);
        let lock_file_path = timeline_dir.join(LOCK_FILE_NAME);

        info!(
            "loading control file {}, create={:?} lock file {:?}",
            control_file_path.display(),
            create,
            lock_file_path.display(),
        );

        let lock_file = File::create(&lock_file_path).with_context(|| "failed to open lockfile")?;

        // Lock file to prevent two or more active safekeepers
        lock_file.try_lock_exclusive().map_err(|e| {
            anyhow!(
                "control file {:?} is locked by some other process: {}",
                &control_file_path,
                e
            )
        })?;

        let mut control_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(matches!(create, CreateControlFile::True))
            .open(&control_file_path)
            .with_context(|| {
                format!(
                    "failed to open control file at {}",
                    control_file_path.display(),
                )
            })?;

        // Empty file is legit on 'create', don't try to deser from it.
        let state = if control_file.metadata().unwrap().len() == 0 {
            if let CreateControlFile::False = create {
                bail!("control file is empty");
            }
            SafeKeeperState::new()
        } else {
            let mut buf = Vec::new();
            control_file
                .read_to_end(&mut buf)
                .with_context(|| "failed to read control file")?;

            let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);

            let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
                buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
            let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);

            ensure!(
                calculated_checksum == expected_checksum,
                format!(
                    "safekeeper control file checksum mismatch: expected {} got {}",
                    expected_checksum, calculated_checksum
                )
            );

            FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context(
                || format!("while reading control file {}", control_file_path.display(),),
            )?
        };

        let timelineid_str = format!("{}", timelineid);

        Ok((
            FileStorage {
                lock_file,
                timeline_dir,
                conf: conf.clone(),
                persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS
                    .with_label_values(&[&timelineid_str]),
            },
            state,
        ))
    }
}

impl Storage for FileStorage {
    // persists state durably to underlying storage
    // for description see https://lwn.net/Articles/457667/
    fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()> {
        let _timer = if sync {
            &self.persist_sync_control_file_seconds
        } else {
            &self.persist_nosync_control_file_seconds
        }
        .start_timer();
    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        let _timer = &self.persist_control_file_seconds.start_timer();

        // write data to safekeeper.control.partial
        let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
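Note: the framing deser_sk_state expects is a little-endian u32 magic, a little-endian u32 format version, the serialized state, and a trailing crc32c over everything before the checksum. A minimal round-trip sketch of that framing; the constants and the opaque payload are placeholders, not the crate's SK_MAGIC / SK_FORMAT_VERSION or SafeKeeperState.

use anyhow::{bail, ensure, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

const EXAMPLE_MAGIC: u32 = 0xCAFE_CAFE; // placeholder, not the crate's SK_MAGIC
const EXAMPLE_VERSION: u32 = 2; // placeholder, not the crate's SK_FORMAT_VERSION

// Frame an opaque payload the same way the control file is framed:
// magic | version | payload | crc32c(everything before the checksum).
fn frame(payload: &[u8]) -> Result<Vec<u8>> {
    let mut buf = Vec::new();
    buf.write_u32::<LittleEndian>(EXAMPLE_MAGIC)?;
    buf.write_u32::<LittleEndian>(EXAMPLE_VERSION)?;
    buf.extend_from_slice(payload);
    let checksum = crc32c::crc32c(&buf);
    buf.extend_from_slice(&checksum.to_le_bytes());
    Ok(buf)
}

// Verify the checksum first, then check magic/version and return the payload.
fn unframe(buf: &[u8]) -> Result<Vec<u8>> {
    let body = &buf[..buf.len() - 4];
    let mut csum_bytes = [0u8; 4];
    csum_bytes.copy_from_slice(&buf[buf.len() - 4..]);
    ensure!(crc32c::crc32c(body) == u32::from_le_bytes(csum_bytes), "checksum mismatch");

    let mut rdr = body;
    if rdr.read_u32::<LittleEndian>()? != EXAMPLE_MAGIC {
        bail!("bad magic");
    }
    if rdr.read_u32::<LittleEndian>()? != EXAMPLE_VERSION {
        bail!("unexpected version");
    }
    Ok(rdr.to_vec())
}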
@@ -464,7 +457,11 @@ impl Storage for FileStorage {
                &control_partial_path.display()
            )
        })?;
        let mut buf = s.ser().with_context(|| "failed to serialize state")?;
        let mut buf: Vec<u8> = Vec::new();
        buf.write_u32::<LittleEndian>(SK_MAGIC)?;
        buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
        s.ser_into(&mut buf)?;

        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
        buf.extend_from_slice(&checksum.to_le_bytes());
@@ -476,36 +473,32 @@ impl Storage for FileStorage {
            )
        })?;

        if sync {
            // fsync the file
            control_partial.sync_all().with_context(|| {
                format!(
                    "failed to sync partial control file at {}",
                    control_partial_path.display()
                )
            })?;
        }
        // fsync the file
        control_partial.sync_all().with_context(|| {
            format!(
                "failed to sync partial control file at {}",
                control_partial_path.display()
            )
        })?;

        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);

        // rename should be atomic
        fs::rename(&control_partial_path, &control_path)?;
        if sync {
            // this sync is not required by any standard but postgres does this (see durable_rename)
            File::open(&control_path)
                .and_then(|f| f.sync_all())
                .with_context(|| {
                    format!(
                        "failed to sync control file at: {}",
                        &control_path.display()
                    )
                })?;
        // this sync is not required by any standard but postgres does this (see durable_rename)
        File::open(&control_path)
            .and_then(|f| f.sync_all())
            .with_context(|| {
                format!(
                    "failed to sync control file at: {}",
                    &control_path.display()
                )
            })?;

            // fsync the directory (linux specific)
            File::open(&self.timeline_dir)
                .and_then(|f| f.sync_all())
                .with_context(|| "failed to sync control file directory")?;
        }
        // fsync the directory (linux specific)
        File::open(&self.timeline_dir)
            .and_then(|f| f.sync_all())
            .with_context(|| "failed to sync control file directory")?;
        Ok(())
    }

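Note: persist() now always runs the full durable-write sequence: write to a .partial file, fsync it, rename over the real file, then fsync the renamed file and the containing directory. A condensed, self-contained sketch of that sequence; paths and the helper name are illustrative.

use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

// Write `data` to `final_path` so that a crash at any point leaves either
// the old or the new contents, never a torn file. Mirrors the sequence in
// FileStorage::persist; the directory fsync is the Linux-specific last step.
fn durable_overwrite(dir: &Path, final_path: &Path, data: &[u8]) -> std::io::Result<()> {
    let partial_path = final_path.with_extension("partial");

    let mut partial = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(&partial_path)?;
    partial.write_all(data)?;
    partial.sync_all()?; // fsync the temp file before the rename

    fs::rename(&partial_path, final_path)?; // atomic replace

    File::open(final_path)?.sync_all()?; // not required by POSIX; postgres does it (durable_rename)
    File::open(dir)?.sync_all()?; // fsync the directory entry

    Ok(())
}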
@@ -682,7 +675,7 @@ mod test {
    use super::FileStorage;
    use crate::{
        safekeeper::{SafeKeeperState, Storage},
        timeline::{CreateControlFile, SharedState, CONTROL_FILE_NAME},
        timeline::{CreateControlFile, CONTROL_FILE_NAME},
        SafeKeeperConf,
    };
    use anyhow::Result;
@@ -704,7 +697,7 @@ mod test {
    ) -> Result<(FileStorage, SafeKeeperState)> {
        fs::create_dir_all(&conf.timeline_dir(&timeline_id))
            .expect("failed to create timeline dir");
        SharedState::load_from_control_file(conf, timeline_id, create)
        FileStorage::load_from_control_file(conf, timeline_id, create)
    }

    #[test]
@@ -717,9 +710,7 @@ mod test {
            .expect("failed to read state");
        // change something
        state.wal_start_lsn = Lsn(42);
        storage
            .persist(&state, true)
            .expect("failed to persist state");
        storage.persist(&state).expect("failed to persist state");
    }

    let (_, state) = load_from_control_file(&conf, timeline_id, CreateControlFile::False)
@@ -737,9 +728,7 @@ mod test {
            .expect("failed to read state");
        // change something
        state.wal_start_lsn = Lsn(42);
        storage
            .persist(&state, true)
            .expect("failed to persist state");
        storage.persist(&state).expect("failed to persist state");
    }
    let control_path = conf.timeline_dir(&timeline_id).join(CONTROL_FILE_NAME);
    let mut data = fs::read(&control_path).unwrap();
@@ -749,7 +738,7 @@ mod test {
    match load_from_control_file(&conf, timeline_id, CreateControlFile::False) {
        Err(err) => assert!(err
            .to_string()
            .contains("safe keeper state checksum mismatch")),
            .contains("safekeeper control file checksum mismatch")),
        Ok(_) => panic!("expected error"),
    }
}

walkeeper/src/upgrade.rs (new file, 60 lines)
@@ -0,0 +1,60 @@
//! Code to deal with safekeeper control file upgrades
use crate::safekeeper::{
    AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
};
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
use tracing::*;
use zenith_utils::{bin_ser::LeSer, lsn::Lsn};

/// Persistent consensus state of the acceptor.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AcceptorStateV1 {
    /// acceptor's last term it voted for (advanced in 1 phase)
    term: Term,
    /// acceptor's epoch (advanced, i.e. bumped to 'term' when VCL is reached).
    epoch: Term,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
struct SafeKeeperStateV1 {
    /// persistent acceptor state
    acceptor_state: AcceptorStateV1,
    /// information about server
    server: ServerInfo,
    /// Unique id of the last *elected* proposer we dealed with. Not needed
    /// for correctness, exists for monitoring purposes.
    proposer_uuid: PgUuid,
    /// part of WAL acknowledged by quorum and available locally
    commit_lsn: Lsn,
    /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn
    /// of last record streamed to everyone)
    truncate_lsn: Lsn,
    // Safekeeper starts receiving WAL from this LSN, zeros before it ought to
    // be skipped during decoding.
    wal_start_lsn: Lsn,
}

pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
    // migrate to storing full term history
    if version == 1 {
        info!("reading safekeeper control file version {}", version);
        let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?;
        let ac = AcceptorState {
            term: oldstate.acceptor_state.term,
            term_history: TermHistory(vec![TermSwitchEntry {
                term: oldstate.acceptor_state.epoch,
                lsn: Lsn(0),
            }]),
        };
        return Ok(SafeKeeperState {
            acceptor_state: ac,
            server: oldstate.server.clone(),
            proposer_uuid: oldstate.proposer_uuid,
            commit_lsn: oldstate.commit_lsn,
            truncate_lsn: oldstate.truncate_lsn,
            wal_start_lsn: oldstate.wal_start_lsn,
        });
    }
    bail!("unsupported safekeeper control file version {}", version)
}
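Note: the upgrade path is a pure struct-to-struct mapping: deserialize the old layout, then rebuild the current one, filling fields the old version did not have (here the old epoch becomes a single synthetic term-history entry). A stripped-down sketch of the same pattern with placeholder types; the names and version numbers are illustrative, not the crate's.

// Old on-disk layout: only a flat epoch, no term history.
struct StateV1 {
    term: u64,
    epoch: u64,
}

// Current layout: full term history.
struct TermEntry {
    term: u64,
    start_lsn: u64,
}

struct StateV2 {
    term: u64,
    term_history: Vec<TermEntry>,
}

// Version-dispatching upgrade, mirroring upgrade_control_file: each old
// version gets one arm that maps it onto the current struct.
fn upgrade(old: StateV1, version: u32) -> Result<StateV2, String> {
    match version {
        1 => Ok(StateV2 {
            term: old.term,
            // the old epoch becomes a single synthetic history entry
            term_history: vec![TermEntry {
                term: old.epoch,
                start_lsn: 0,
            }],
        }),
        v => Err(format!("unsupported control file version {}", v)),
    }
}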
@@ -3,26 +3,34 @@
//! receive WAL from wal_proposer and send it to WAL receivers
//!
use anyhow::Result;
use log::*;
use regex::Regex;
use std::net::{TcpListener, TcpStream};
use std::thread;
use tracing::*;

use crate::callmemaybe::CallmeEvent;
use crate::send_wal::SendWalHandler;
use crate::SafeKeeperConf;
use tokio::sync::mpsc::UnboundedSender;
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};

/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
pub fn thread_main(
    conf: SafeKeeperConf,
    listener: TcpListener,
    tx: UnboundedSender<CallmeEvent>,
) -> Result<()> {
    loop {
        match listener.accept() {
            Ok((socket, peer_addr)) => {
                debug!("accepted connection from {}", peer_addr);
                let conf = conf.clone();

                let tx_clone = tx.clone();
                let _ = thread::Builder::new()
                    .name("WAL service thread".into())
                    .spawn(move || {
                        if let Err(err) = handle_socket(socket, conf) {
                        if let Err(err) = handle_socket(socket, conf, tx_clone) {
                            error!("connection handler exited: {}", err);
                        }
                    })
@@ -33,12 +41,26 @@ pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
    }
}

// Get unique thread id (Rust internal), with ThreadId removed for shorter printing
fn get_tid() -> u64 {
    let tids = format!("{:?}", thread::current().id());
    let r = Regex::new(r"ThreadId\((\d+)\)").unwrap();
    let caps = r.captures(&tids).unwrap();
    caps.get(1).unwrap().as_str().parse().unwrap()
}

/// This is run by `thread_main` above, inside a background thread.
///
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
fn handle_socket(
    socket: TcpStream,
    conf: SafeKeeperConf,
    tx: UnboundedSender<CallmeEvent>,
) -> Result<()> {
    let _enter = info_span!("", tid = ?get_tid()).entered();

    socket.set_nodelay(true)?;

    let mut conn_handler = SendWalHandler::new(conf);
    let mut conn_handler = SendWalHandler::new(conf, tx);
    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?;
    // libpq replication protocol between safekeeper and replicas/pagers
    pgbackend.run(&mut conn_handler)?;

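Note: the UnboundedSender is cheap to clone, so every accepted connection gets its own handle onto the same callmemaybe queue and sends never block the connection thread. A minimal sketch of that fan-in pattern; the event type, messages, and runtime setup are placeholders, not the crate's wiring.

use std::thread;
use tokio::sync::mpsc;

// Placeholder for the crate's CallmeEvent.
#[derive(Debug)]
enum ExampleEvent {
    Connected(u32),
}

fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<ExampleEvent>();

    // Each connection thread gets a cloned sender, like tx_clone above.
    for conn_id in 0..3 {
        let tx_clone = tx.clone();
        thread::spawn(move || {
            tx_clone
                .send(ExampleEvent::Connected(conn_id))
                .expect("receiver dropped");
        });
    }
    drop(tx); // close the channel once all senders are gone

    // Drain events on a small runtime, standing in for the callmemaybe task.
    let rt = tokio::runtime::Runtime::new().expect("failed to build runtime");
    rt.block_on(async {
        while let Some(event) = rx.recv().await {
            println!("got {:?}", event);
        }
    });
}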
@@ -154,3 +154,84 @@ zid_newtype!(ZTimelineId);
pub struct ZTenantId(ZId);

zid_newtype!(ZTenantId);

/// Serde routines for Option<T> (de)serialization, using `T:Display` representations for inner values.
/// Useful for Option<ZTenantId> and Option<ZTimelineId> to get their hex representations into serialized string and deserialize them back.
pub mod opt_display_serde {
    use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
    use std::{fmt::Display, str::FromStr};

    pub fn serialize<S, Id>(id: &Option<Id>, ser: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
        Id: Display,
    {
        id.as_ref().map(ToString::to_string).serialize(ser)
    }

    pub fn deserialize<'de, D, Id>(des: D) -> Result<Option<Id>, D::Error>
    where
        D: Deserializer<'de>,
        Id: FromStr,
        <Id as FromStr>::Err: Display,
    {
        Ok(if let Some(s) = Option::<String>::deserialize(des)? {
            Some(Id::from_str(&s).map_err(de::Error::custom)?)
        } else {
            None
        })
    }
}

#[cfg(test)]
mod tests {
    use std::fmt::Display;

    use super::*;
    use hex::FromHexError;
    use hex_literal::hex;

    #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
    struct TestStruct<E: Display, T: FromStr<Err = E> + Display> {
        #[serde(with = "opt_display_serde")]
        field: Option<T>,
    }

    #[test]
    fn test_hex_serializations_tenant_id() {
        let original_struct = TestStruct {
            field: Some(ZTenantId::from_array(hex!(
                "11223344556677881122334455667788"
            ))),
        };

        let serialized_string = serde_json::to_string(&original_struct).unwrap();
        assert_eq!(
            serialized_string,
            r#"{"field":"11223344556677881122334455667788"}"#
        );

        let deserialized_struct: TestStruct<FromHexError, ZTenantId> =
            serde_json::from_str(&serialized_string).unwrap();
        assert_eq!(original_struct, deserialized_struct);
    }

    #[test]
    fn test_hex_serializations_timeline_id() {
        let original_struct = TestStruct {
            field: Some(ZTimelineId::from_array(hex!(
                "AA223344556677881122334455667788"
            ))),
        };

        let serialized_string = serde_json::to_string(&original_struct).unwrap();
        assert_eq!(
            serialized_string,
            r#"{"field":"aa223344556677881122334455667788"}"#
        );

        let deserialized_struct: TestStruct<FromHexError, ZTimelineId> =
            serde_json::from_str(&serialized_string).unwrap();
        assert_eq!(original_struct, deserialized_struct);
    }
}
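Note: with the `#[serde(with = "opt_display_serde")]` attribute, a hex string round-trips to Some(id) and JSON null to None. A small usage sketch assuming a caller-side request struct (the struct and JSON payloads are illustrative, not from the patch):

use serde::{Deserialize, Serialize};
use zenith_utils::zid::{opt_display_serde, ZTenantId};

// Hypothetical struct showing the attribute in context.
#[derive(Debug, Serialize, Deserialize)]
struct ExampleRequest {
    #[serde(with = "opt_display_serde")]
    tenant_id: Option<ZTenantId>,
}

fn main() -> anyhow::Result<()> {
    // Hex string becomes Some(ZTenantId), JSON null becomes None.
    let present: ExampleRequest =
        serde_json::from_str(r#"{"tenant_id":"11223344556677881122334455667788"}"#)?;
    let absent: ExampleRequest = serde_json::from_str(r#"{"tenant_id":null}"#)?;

    assert!(present.tenant_id.is_some());
    assert!(absent.tenant_id.is_none());
    Ok(())
}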