diff --git a/.circleci/ansible/.gitignore b/.circleci/ansible/.gitignore index 14a1c155ae..441d9a8b82 100644 --- a/.circleci/ansible/.gitignore +++ b/.circleci/ansible/.gitignore @@ -1,2 +1,4 @@ zenith_install.tar.gz .zenith_current_version +neon_install.tar.gz +.neon_current_version diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh index a4b4372d9f..c613213a75 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.circleci/ansible/get_binaries.sh @@ -7,7 +7,7 @@ RELEASE=${RELEASE:-false} # look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then echo "search latest relase tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." exit 1 @@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then fi else echo "search latest dev tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." exit 1 diff --git a/.circleci/config.yml b/.circleci/config.yml index 3397bcc7b7..85654b5d45 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -121,7 +121,7 @@ jobs: export RUSTC_WRAPPER=cachepot export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests + "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests cachepot -s - save_cache: @@ -579,13 +579,13 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://neondatabase.github.io/helm-charts + helm repo add neondatabase https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - + helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait deploy-release: docker: diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml new file mode 100644 index 0000000000..91422e754a --- /dev/null +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for zenith-proxy. +# This is a YAML-formatted file. 
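The `get_binaries.sh` change above stops relying on the registry's own tag ordering: after stripping the `release-` prefix, only purely numeric tags are kept and they are sorted numerically, so the highest build number wins even if the API returns tags alphabetically. A minimal Rust sketch of the same selection rule (illustration only; the tag values are made up):

```rust
/// Pick the newest build tag from a list of Docker tag names, mirroring the
/// `grep -E '^[0-9]+$' | sort -n | tail -1` pipeline: only purely numeric tags
/// are considered, and they are compared numerically rather than lexically.
fn latest_numeric_tag<'a>(tags: impl IntoIterator<Item = &'a str>) -> Option<u64> {
    tags.into_iter()
        .filter_map(|tag| tag.trim().parse::<u64>().ok())
        .max()
}

fn main() {
    // The `release-` prefix is already stripped by the shell pipeline before
    // this step, so only bare build numbers and leftover named tags remain.
    let tags = ["latest", "982", "1043", "995"];
    assert_eq!(latest_numeric_tag(tags), Some(1043));
}
```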
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-staging.local/management/api/v2" + domain: "*.cloud.stage.neon.tech" + +# -- Additional labels for zenith-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: staging + zenith_region: us-east-1 + zenith_region_slug: virginia + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.gitignore b/.gitignore index 2ecdaa2053..adb1b41503 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ test_output/ # Coverage *.profraw *.profdata + +*.key +*.crt diff --git a/Cargo.lock b/Cargo.lock index 7c6470bd0e..1949babd98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,6 +113,49 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "axum" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a" +dependencies = [ + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa 1.0.1", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", +] + [[package]] name = "backtrace" version = "0.3.64" @@ -323,6 +366,15 @@ dependencies = [ "textwrap 0.14.2", ] +[[package]] +name = "cmake" +version = "0.1.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" +dependencies = [ + "cc", +] + [[package]] name = "combine" version = "4.6.3" @@ -333,6 +385,18 @@ dependencies = [ "memchr", ] +[[package]] +name = "comfy-table" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" +dependencies = [ + "crossterm", + "strum", + "strum_macros", + "unicode-width", +] + [[package]] name = "compute_tools" version = "0.1.0" @@ -529,6 +593,31 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossterm" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "mio", + "parking_lot 0.12.0", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +dependencies = [ + "winapi", +] + [[package]] name = "crypto-common" version = "0.1.3" @@ -696,9 +785,9 @@ 
dependencies = [ [[package]] name = "etcd-client" -version = "0.8.4" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118" +checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954" dependencies = [ "http", "prost", @@ -706,9 +795,26 @@ dependencies = [ "tokio-stream", "tonic", "tonic-build", + "tower", "tower-service", ] +[[package]] +name = "etcd_broker" +version = "0.1.0" +dependencies = [ + "etcd-client", + "regex", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "fail" version = "0.5.0" @@ -993,6 +1099,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1058,6 +1170,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + [[package]] name = "httparse" version = "1.6.0" @@ -1332,6 +1450,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" +[[package]] +name = "matchit" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" + [[package]] name = "md-5" version = "0.9.1" @@ -1462,6 +1586,23 @@ dependencies = [ "tempfile", ] +[[package]] +name = "neon_local" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 3.0.14", + "comfy-table", + "control_plane", + "pageserver", + "postgres", + "postgres_ffi", + "safekeeper", + "serde_json", + "utils", + "workspace_hack", +] + [[package]] name = "nix" version = "0.23.1" @@ -1634,7 +1775,6 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "byteorder", "bytes", "chrono", @@ -1662,8 +1802,7 @@ dependencies = [ "pprof", "rand", "regex", - "rusoto_core", - "rusoto_s3", + "remote_storage", "scopeguard", "serde", "serde_json", @@ -1675,7 +1814,6 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-stream", - "tokio-util 0.7.0", "toml_edit", "tracing", "url", @@ -1935,6 +2073,16 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "prettyplease" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -1966,9 +2114,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.9.0" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f" dependencies = [ "bytes", "prost-derive", @@ -1976,12 +2124,14 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.9.0" +version = "0.10.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846" dependencies = [ "bytes", - "heck", + "cfg-if", + "cmake", + "heck 0.4.0", "itertools", "lazy_static", "log", @@ -1996,9 +2146,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc" dependencies = [ "anyhow", "itertools", @@ -2009,9 +2159,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" dependencies = [ "bytes", "prost", @@ -2053,6 +2203,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", + "url", "utils", "workspace_hack", ] @@ -2182,9 +2333,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" dependencies = [ "aho-corasick", "memchr", @@ -2206,6 +2357,23 @@ version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +[[package]] +name = "remote_storage" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "rusoto_core", + "rusoto_s3", + "serde", + "serde_json", + "tempfile", + "tokio", + "tokio-util 0.7.0", + "tracing", + "workspace_hack", +] + [[package]] name = "remove_dir_all" version = "0.5.3" @@ -2305,9 +2473,9 @@ dependencies = [ [[package]] name = "rusoto_core" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc" +checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2" dependencies = [ "async-trait", "base64", @@ -2330,9 +2498,9 @@ dependencies = [ [[package]] name = "rusoto_credential" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f" +checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05" dependencies = [ "async-trait", "chrono", @@ -2348,9 +2516,9 @@ dependencies = [ [[package]] name = "rusoto_s3" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027" +checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d" dependencies = [ "async-trait", "bytes", @@ -2361,9 +2529,9 @@ dependencies = [ [[package]] name = "rusoto_signature" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc" +checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272" dependencies = [ 
"base64", "bytes", @@ -2459,7 +2627,7 @@ dependencies = [ "const_format", "crc32c", "daemonize", - "etcd-client", + "etcd_broker", "fs2", "hex", "humantime", @@ -2470,8 +2638,7 @@ dependencies = [ "postgres-protocol", "postgres_ffi", "regex", - "rusoto_core", - "rusoto_s3", + "remote_storage", "serde", "serde_json", "serde_with", @@ -2676,6 +2843,17 @@ dependencies = [ "signal-hook-registry", ] +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -2765,6 +2943,25 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" + +[[package]] +name = "strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck 0.3.3", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.4.1" @@ -2796,15 +2993,21 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52" dependencies = [ "proc-macro2", "quote", "unicode-xid", ] +[[package]] +name = "sync_wrapper" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" + [[package]] name = "tar" version = "0.4.38" @@ -3098,12 +3301,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.6.2" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" +checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb" dependencies = [ "async-stream", "async-trait", + "axum", "base64", "bytes", "futures-core", @@ -3119,7 +3323,7 @@ dependencies = [ "prost-derive", "tokio", "tokio-stream", - "tokio-util 0.6.9", + "tokio-util 0.7.0", "tower", "tower-layer", "tower-service", @@ -3129,10 +3333,11 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.6.2" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" +checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1" dependencies = [ + "prettyplease", "proc-macro2", "prost-build", "quote", @@ -3159,6 +3364,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" 
version = "0.3.1" @@ -3600,13 +3824,22 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "bytes", "chrono", "clap 2.34.0", "either", + "fail", + "futures-channel", + "futures-task", + "futures-util", + "generic-array", "hashbrown", + "hex", + "hyper", "indexmap", + "itoa 0.4.8", "libc", "log", "memchr", @@ -3620,6 +3853,7 @@ dependencies = [ "serde", "syn", "tokio", + "tokio-util 0.7.0", "tracing", "tracing-core", ] @@ -3648,22 +3882,6 @@ dependencies = [ "chrono", ] -[[package]] -name = "zenith" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.0.14", - "control_plane", - "pageserver", - "postgres", - "postgres_ffi", - "safekeeper", - "serde_json", - "utils", - "workspace_hack", -] - [[package]] name = "zeroize" version = "1.5.2" diff --git a/Cargo.toml b/Cargo.toml index 3838637d37..f0934853f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ "proxy", "safekeeper", "workspace_hack", - "zenith", + "neon_local", "libs/*", ] diff --git a/README.md b/README.md index 03f86887a7..af384d2672 100644 --- a/README.md +++ b/README.md @@ -49,32 +49,30 @@ make -j5 ```sh # Create repository in .zenith with proper paths to binaries and data # Later that would be responsibility of a package install script -> ./target/debug/zenith init -initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229 -created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8 -created main branch +> ./target/debug/neon_local init +initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c +created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50 +initial timeline de200bd42b49cc1814412c7e592dd6e9 created pageserver init succeeded # start pageserver and safekeeper -> ./target/debug/zenith start -Starting pageserver at 'localhost:64000' in '.zenith' +> ./target/debug/neon_local start +Starting pageserver at '127.0.0.1:64000' in '.zenith' Pageserver started -initializing for single for 7676 -Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' +initializing for sk 1 for 7676 +Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1' Safekeeper started # start postgres compute node -> ./target/debug/zenith pg start main -Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 +> ./target/debug/neon_local pg start main +Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' -waiting for server to start.... done -server started # check list of running postgres instances -> ./target/debug/zenith pg list -NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS -main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running +> ./target/debug/neon_local pg list + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` 4. Now it is possible to connect to postgres and run some queries: @@ -94,18 +92,25 @@ postgres=# select * from t; 5. 
And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith timeline branch --branch-name migration_check -Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main' +> ./target/debug/neon_local timeline branch --branch-name migration_check +Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith timeline list - main [5b014a9e41b4b63ce1a1febc04503636] - ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] +> ./target/debug/neon_local timeline list +(L) main [de200bd42b49cc1814412c7e592dd6e9] +(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/zenith pg start migration_check -Starting postgres node at 'host=127.0.0.1 port=55433 user=stas' -waiting for server to start.... done +> ./target/debug/neon_local pg start migration_check --branch-name migration_check +Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 +Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres' + +# check the new list of running postgres instances +> ./target/debug/neon_local pg list + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running + migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres @@ -118,12 +123,20 @@ postgres=# select * from t; postgres=# insert into t values(2,2); INSERT 0 1 + +# check that the new change doesn't affect the 'main' postgres +> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) ``` 6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances you have just started. You can stop them all with one command: ```sh -> ./target/debug/zenith stop +> ./target/debug/neon_local stop ``` ## Running tests diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 12ee88cdc9..5aeff505b6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -63,6 +63,10 @@ pub struct LocalEnv { #[serde(default)] pub broker_endpoints: Option, + /// A prefix to all to any key when pushing/polling etcd from a node. 
+ #[serde(default)] + pub broker_etcd_prefix: Option, + pub pageserver: PageServerConf, #[serde(default)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index b094016131..074ee72f69 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -77,6 +77,7 @@ pub struct SafekeeperNode { pub pageserver: Arc, broker_endpoints: Option, + broker_etcd_prefix: Option, } impl SafekeeperNode { @@ -94,6 +95,7 @@ impl SafekeeperNode { http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), pageserver, broker_endpoints: env.broker_endpoints.clone(), + broker_etcd_prefix: env.broker_etcd_prefix.clone(), } } @@ -143,6 +145,9 @@ impl SafekeeperNode { if let Some(ref ep) = self.broker_endpoints { cmd.args(&["--broker-endpoints", ep]); } + if let Some(prefix) = self.broker_etcd_prefix.as_deref() { + cmd.args(&["--broker-etcd-prefix", prefix]); + } if !cmd.status()?.success() { bail!( diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3a63bf6960..d2e63a22de 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -167,6 +167,9 @@ impl PageServerNode { ); } + // echo the captured output of the init command + println!("{}", String::from_utf8_lossy(&init_output.stdout)); + Ok(initial_timeline_id) } @@ -186,8 +189,6 @@ impl PageServerNode { ); io::stdout().flush().unwrap(); - let mut cmd = Command::new(self.env.pageserver_bin()?); - let repo_path = self.repo_path(); let mut args = vec!["-D", repo_path.to_str().unwrap()]; @@ -195,9 +196,11 @@ impl PageServerNode { args.extend(["-c", config_override]); } - fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); + let mut cmd = Command::new(self.env.pageserver_bin()?); + let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); + filled_cmd = fill_aws_secrets_vars(filled_cmd); - if !cmd.status()?.success() { + if !filled_cmd.status()?.success() { bail!( "Pageserver failed to start. See '{}' for details.", self.repo_path().join("pageserver.log").display() @@ -457,3 +460,12 @@ impl PageServerNode { Ok(timeline_info_response) } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/docs/rfcs/016-connection-routing.md b/docs/rfcs/016-connection-routing.md new file mode 100644 index 0000000000..603a0725d6 --- /dev/null +++ b/docs/rfcs/016-connection-routing.md @@ -0,0 +1,151 @@ +# Dispatching a connection + +For each client connection, Neon service needs to authenticate the +connection, and route it to the right PostgreSQL instance. + +## Authentication + +There are three different ways to authenticate: + +- anonymous; no authentication needed +- PostgreSQL authentication +- github single sign-on using browser + +In anonymous access, the user doesn't need to perform any +authentication at all. This can be used e.g. in interactive PostgreSQL +documentation, allowing you to run the examples very quickly. Similar +to sqlfiddle.com. + +PostgreSQL authentication works the same as always. All the different +PostgreSQL authentication options like SCRAM, kerberos, etc. are +available. [1] + +The third option is to authenticate with github single sign-on. When +you open the connection in psql, you get a link that you open with +your browser. Opening the link redirects you to github authentication, +and lets the connection to proceed. 
This is also known as "Link auth" [2]. + + +## Routing the connection + +When a client starts a connection, it needs to be routed to the +correct PostgreSQL instance. Routing can be done by the proxy, acting +as a man-in-the-middle, or the connection can be routed at the network +level based on the hostname or IP address. + +Either way, Neon needs to identify which PostgreSQL instance the +connection should be routed to. If the instance is not already +running, it needs to be started. Some connections always require a new +PostgreSQL instance to be created, e.g. if you want to run a one-off +query against a particular point-in-time. + +The PostgreSQL instance is identified by: +- Neon account (possibly anonymous) +- cluster (known as tenant in the storage?) +- branch or snapshot name +- timestamp (PITR) +- primary or read-replica +- one-off read replica +- one-off writeable branch + +When you are using regular PostgreSQL authentication or anonymous +access, the connection URL needs to contain all the information needed +for the routing. With github single sign-on, the browser is involved +and some details - the Neon account in particular - can be deduced +from the authentication exchange. + +There are three methods for identifying the PostgreSQL instance: + +- Browser interaction (link auth) +- Options in the connection URL and the domain name +- A pre-defined endpoint, identified by domain name or IP address + +### Link Auth + + postgres://@start.neon.tech/ + +This gives you a link that you open in browser. Clicking the link +performs github authentication, and the Neon account name is +provided to the proxy behind the scenes. The proxy routes the +connection to the primary PostgreSQL instance in cluster called +"main", branch "main". + +Further ideas: +- You could pre-define a different target for link auth + connections in the UI. +- You could have a drop-down in the browser, allowing you to connect + to any cluster you want. Link Auth can be like Teleport. + +### Connection URL + +The connection URL looks like this: + + postgres://@.db.neon.tech/ + +By default, this connects you to the primary PostgreSQL instance +running on the "main" branch in the named cluster [3]. However, you can +change that by specifying options in the connection URL. The following +options are supported: + +| option name | Description | Examples | +| --- | --- | --- | +| cluster | Cluster name | cluster:myproject | +| branch | Branch name | branch:main | +| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z | +| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 | +| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new | + +For example, to read branch 'testing' as it was on Mar 31, 2022, you could +specify a timestamp in the connection URL [4]: + + postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31 + +Connecting with cluster name and options can be disabled in the UI. If +disabled, you can only connect using a pre-defined endpoint. + +### Pre-defined Endpoint + +Instead of providing the cluster name, branch, and all those options +in the connection URL, you can define a named endpoint with the same +options. + +In the UI, click "create endpoint". Fill in the details: + +- Cluster name +- Branch +- timestamp or LSN +- is this for the primary or for a read replica +- etc. 
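Whether the routing details arrive as options in the connection URL or as a pre-defined endpoint, the proxy ultimately reduces them to the same set of fields (cluster, branch, timestamp, LSN, read-replica). A rough Rust sketch of that reduction, assuming the percent-decoded `options` value reaches the proxy as the comma-separated list shown in the table above; all type and function names here are invented for illustration, not taken from any proxy code:

```rust
/// A hypothetical routing spec holding the fields the RFC lists for both the
/// connection-URL options and a pre-defined endpoint.
#[derive(Debug, Default)]
struct RoutingOptions {
    cluster: Option<String>,
    branch: Option<String>,
    timestamp: Option<String>,
    lsn: Option<String>,
    read_replica: Option<String>,
}

/// Parse a comma-separated options string, e.g. "branch:testing,timestamp:2022-03-31".
/// A bare "read-replica" (no value) is also accepted, as in the table above.
fn parse_routing_options(raw: &str) -> RoutingOptions {
    let mut opts = RoutingOptions::default();
    for item in raw.split(',').filter(|s| !s.is_empty()) {
        let (key, value) = match item.split_once(':') {
            Some((k, v)) => (k, Some(v.to_string())),
            None => (item, None),
        };
        match key {
            "cluster" => opts.cluster = value,
            "branch" => opts.branch = value,
            "timestamp" => opts.timestamp = value,
            "lsn" => opts.lsn = value,
            // bare "read-replica" means "attach to a replica"; "read-replica:new" asks for a fresh one
            "read-replica" => opts.read_replica = Some(value.unwrap_or_default()),
            _ => {} // unknown options are ignored in this sketch
        }
    }
    opts
}

fn main() {
    let opts = parse_routing_options("branch:testing,timestamp:2022-03-31");
    println!("{opts:?}");
}
```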
+ +When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect: + + postgres://@.endpoint.neon.tech/ + + +An endpoint can be assigned a static or dynamic IP address, so that +you can connect to it with clients that don't support TLS SNI. Maybe +bypass the proxy altogether, but that ought to be invisible to the +user. + +You can limit the range of source IP addresses that are allowed to +connect to an endpoint. An endpoint can also be exposed in an Amazon +VPC, allowing direct connections from applications. + + +# Footnotes + +[1] I'm not sure how feasible it is to set up configure like Kerberos +or LDAP in a cloud environment. But in principle I think we should +allow customers to have the full power of PostgreSQL, including all +authentication options. However, it's up to the customer to configure +it correctly. + +[2] Link is a way to both authenticate and to route the connection + +[3] This assumes that cluster-ids are globally unique, across all +Neon accounts. + +[4] The syntax accepted in the connection URL is limited by libpq. The +only way to pass arbitrary options to the server (or our proxy) is +with the "options" keyword, and the options must be percent-encoded. I +think the above would work but i haven't tested it diff --git a/docs/settings.md b/docs/settings.md index b3925528cd..017d349bb6 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override the values in the config file, if any are specified for the same key and get into the final config during init phase. - ### Config example ```toml @@ -35,9 +34,9 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` -* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` +- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` ### Config values @@ -57,7 +56,7 @@ but it will trigger a checkpoint operation to get it back below the limit. `checkpoint_distance` also determines how much WAL needs to be kept -durable in the safekeeper. The safekeeper must have capacity to hold +durable in the safekeeper. The safekeeper must have capacity to hold this much WAL, with some headroom, otherwise you can get stuck in a situation where the safekeeper is full and stops accepting new WAL, but the pageserver is not flushing out and releasing the space in the @@ -72,7 +71,7 @@ The unit is # of bytes. Every `compaction_period` seconds, the page server checks if maintenance operations, like compaction, are needed on the layer -files. Default is 1 s, which should be fine. +files. Default is 1 s, which should be fine. #### compaction_target_size @@ -163,16 +162,12 @@ bucket_region = 'eu-north-1' # Optional, pageserver uses entire bucket if the prefix is not specified. 
prefix_in_bucket = '/some/prefix/' -# Access key to connect to the bucket ("login" part of the credentials) -access_key_id = 'SOMEKEYAAAAASADSAH*#' - -# Secret access key to connect to the bucket ("password" part of the credentials) -secret_access_key = 'SOMEsEcReTsd292v' - # S3 API query limit to avoid getting errors/throttling from AWS. concurrency_limit = 100 ``` +If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials. + ###### General remote storage configuration Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. @@ -183,13 +178,12 @@ Besides, there are parameters common for all types of remote storage that can be ```toml [remote_storage] # Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time. -max_concurrent_timelines_sync = 50 +max_concurrent_syncs = 50 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore. max_sync_errors = 10 ``` - ## safekeeper TODO diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml new file mode 100644 index 0000000000..65bd406131 --- /dev/null +++ b/libs/etcd_broker/Cargo.toml @@ -0,0 +1,17 @@ +[package] + name = "etcd_broker" + version = "0.1.0" + edition = "2021" + + [dependencies] + etcd-client = "0.9.0" + regex = "1.4.5" + serde = { version = "1.0", features = ["derive"] } + serde_json = "1" + serde_with = "1.12.0" + + utils = { path = "../utils" } + workspace_hack = { version = "0.1", path = "../../workspace_hack" } + tokio = "1" + tracing = "0.1" + thiserror = "1" diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs new file mode 100644 index 0000000000..01cc0cf162 --- /dev/null +++ b/libs/etcd_broker/src/lib.rs @@ -0,0 +1,335 @@ +//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). +//! Intended to connect services to each other, not to store their data. +use std::{ + collections::{hash_map, HashMap}, + fmt::Display, + str::FromStr, +}; + +use regex::{Captures, Regex}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; + +pub use etcd_client::*; + +use tokio::{sync::mpsc, task::JoinHandle}; +use tracing::*; +use utils::{ + lsn::Lsn, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, +}; + +#[derive(Debug, Deserialize, Serialize)] +struct SafekeeperTimeline { + safekeeper_id: ZNodeId, + info: SkTimelineInfo, +} + +/// Published data about safekeeper's timeline. Fields made optional for easy migrations. +#[serde_as] +#[derive(Debug, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "Option")] + #[serde(default)] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + #[serde(default)] + pub commit_lsn: Option, + /// LSN up to which safekeeper offloaded WAL to s3. + #[serde_as(as = "Option")] + #[serde(default)] + pub s3_wal_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. 
+ #[serde_as(as = "Option")] + #[serde(default)] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub peer_horizon_lsn: Option, + #[serde(default)] + pub wal_stream_connection_string: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum BrokerError { + #[error("Etcd client error: {0}. Context: {1}")] + EtcdClient(etcd_client::Error, String), + #[error("Error during parsing etcd data: {0}")] + ParsingError(String), + #[error("Internal error: {0}")] + InternalError(String), +} + +/// A way to control the data retrieval from a certain subscription. +pub struct SkTimelineSubscription { + safekeeper_timeline_updates: + mpsc::UnboundedReceiver>>, + kind: SkTimelineSubscriptionKind, + watcher_handle: JoinHandle>, + watcher: Watcher, +} + +impl SkTimelineSubscription { + /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. + pub async fn fetch_data( + &mut self, + ) -> Option>> { + self.safekeeper_timeline_updates.recv().await + } + + /// Cancels the subscription, stopping the data poller and waiting for it to shut down. + pub async fn cancel(mut self) -> Result<(), BrokerError> { + self.watcher.cancel().await.map_err(|e| { + BrokerError::EtcdClient( + e, + format!( + "Failed to cancel timeline subscription, kind: {:?}", + self.kind + ), + ) + })?; + self.watcher_handle.await.map_err(|e| { + BrokerError::InternalError(format!( + "Failed to join the timeline updates task, kind: {:?}, error: {e}", + self.kind + )) + })? + } +} + +/// The subscription kind to the timeline updates from safekeeper. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SkTimelineSubscriptionKind { + broker_prefix: String, + kind: SubscriptionKind, +} + +impl SkTimelineSubscriptionKind { + pub fn all(broker_prefix: String) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::All, + } + } + + pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::Tenant(tenant), + } + } + + pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::Timeline(timeline), + } + } + + fn watch_regex(&self) -> Regex { + match self.kind { + SubscriptionKind::All => Regex::new(&format!( + r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'everything' subscription"), + SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( + r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'tenant' subscription"), + SubscriptionKind::Timeline(ZTenantTimelineId { + tenant_id, + timeline_id, + }) => Regex::new(&format!( + r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'timeline' subscription"), + } + } + + /// Etcd key to use for watching a certain timeline updates from safekeepers. + pub fn watch_key(&self) -> String { + match self.kind { + SubscriptionKind::All => self.broker_prefix.to_string(), + SubscriptionKind::Tenant(tenant_id) => { + format!("{}/{tenant_id}/safekeeper", self.broker_prefix) + } + SubscriptionKind::Timeline(ZTenantTimelineId { + tenant_id, + timeline_id, + }) => format!( + "{}/{tenant_id}/{timeline_id}/safekeeper", + self.broker_prefix + ), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum SubscriptionKind { + /// Get every timeline update. 
+ All, + /// Get certain tenant timelines' updates. + Tenant(ZTenantId), + /// Get certain timeline updates. + Timeline(ZTenantTimelineId), +} + +/// Creates a background task to poll etcd for timeline updates from safekeepers. +/// Stops and returns `Err` on any error during etcd communication. +/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, +/// exiting normally in such cases. +pub async fn subscribe_to_safekeeper_timeline_updates( + client: &mut Client, + subscription: SkTimelineSubscriptionKind, +) -> Result { + info!("Subscribing to timeline updates, subscription kind: {subscription:?}"); + + let (watcher, mut stream) = client + .watch( + subscription.watch_key(), + Some(WatchOptions::new().with_prefix()), + ) + .await + .map_err(|e| { + BrokerError::EtcdClient( + e, + format!("Failed to init the watch for subscription {subscription:?}"), + ) + })?; + + let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel(); + + let subscription_kind = subscription.kind; + let regex = subscription.watch_regex(); + let watcher_handle = tokio::spawn(async move { + while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( + "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}" + )))? { + if resp.canceled() { + info!("Watch for timeline updates subscription was canceled, exiting"); + break; + } + + let mut timeline_updates: HashMap> = + HashMap::new(); + + let events = resp.events(); + debug!("Processing {} events", events.len()); + + for event in events { + if EventType::Put == event.event_type() { + if let Some(kv) = event.kv() { + match parse_etcd_key_value(subscription_kind, ®ex, kv) { + Ok(Some((zttid, timeline))) => { + match timeline_updates + .entry(zttid) + .or_default() + .entry(timeline.safekeeper_id) + { + hash_map::Entry::Occupied(mut o) => { + if o.get().flush_lsn < timeline.info.flush_lsn { + o.insert(timeline.info); + } + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline.info); + } + } + } + Ok(None) => {} + Err(e) => error!("Failed to parse timeline update: {e}"), + }; + } + } + } + + if let Err(e) = timeline_updates_sender.send(timeline_updates) { + info!("Timeline updates sender got dropped, exiting: {e}"); + break; + } + } + + Ok(()) + }); + + Ok(SkTimelineSubscription { + kind: subscription, + safekeeper_timeline_updates, + watcher_handle, + watcher, + }) +} + +fn parse_etcd_key_value( + subscription_kind: SubscriptionKind, + regex: &Regex, + kv: &KeyValue, +) -> Result, BrokerError> { + let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| { + BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str")) + })?) 
{ + caps + } else { + return Ok(None); + }; + + let (zttid, safekeeper_id) = match subscription_kind { + SubscriptionKind::All => ( + ZTenantTimelineId::new( + parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, + parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, + ), + ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + ), + SubscriptionKind::Tenant(tenant_id) => ( + ZTenantTimelineId::new( + tenant_id, + parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, + ), + ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + ), + SubscriptionKind::Timeline(zttid) => ( + zttid, + ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + ), + }; + + let info_str = kv.value_str().map_err(|e| { + BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str")) + })?; + Ok(Some(( + zttid, + SafekeeperTimeline { + safekeeper_id, + info: serde_json::from_str(info_str).map_err(|e| { + BrokerError::ParsingError(format!( + "Failed to parse '{info_str}' as safekeeper timeline info: {e}" + )) + })?, + }, + ))) +} + +fn parse_capture(caps: &Captures, index: usize) -> Result +where + T: FromStr, + ::Err: Display, +{ + let capture_match = caps + .get(index) + .ok_or_else(|| format!("Failed to get capture match at index {index}"))? + .as_str(); + capture_match.parse().map_err(|e| { + format!( + "Failed to parse {} from {capture_match}: {e}", + std::any::type_name::() + ) + }) +} diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 9d1089ed46..95ea9660e8 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -89,7 +89,12 @@ impl WalStreamDecoder { return Ok(None); } - let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf); + let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("long header deserialization failed {}", e), + lsn: self.lsn, + } + })?; if hdr.std.xlp_pageaddr != self.lsn.0 { return Err(WalDecodeError { @@ -106,7 +111,12 @@ impl WalStreamDecoder { return Ok(None); } - let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf); + let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("header deserialization failed {}", e), + lsn: self.lsn, + } + })?; if hdr.xlp_pageaddr != self.lsn.0 { return Err(WalDecodeError { @@ -188,7 +198,13 @@ impl WalStreamDecoder { } // We now have a record in the 'recordbuf' local variable. 
- let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]); + let xlogrec = + XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { + WalDecodeError { + msg: format!("xlog record deserialization failed {}", e), + lsn: self.lsn, + } + })?; let mut crc = 0; crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 1645c44de5..7882058868 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,7 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; -use anyhow::{bail, Result}; +use anyhow::bail; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -28,6 +28,8 @@ use std::io::prelude::*; use std::io::SeekFrom; use std::path::{Path, PathBuf}; use std::time::SystemTime; +use utils::bin_ser::DeserializeError; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; @@ -118,11 +120,15 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } pub fn get_current_timestamp() -> TimestampTz { + to_pg_timestamp(SystemTime::now()) +} + +pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz { const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ const SECS_PER_DAY: u64 = 86400; const USECS_PER_SEC: u64 = 1000000; - match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { + match time.duration_since(SystemTime::UNIX_EPOCH) { Ok(n) => { ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) * USECS_PER_SEC @@ -140,7 +146,7 @@ fn find_end_of_wal_segment( tli: TimeLineID, wal_seg_size: usize, start_offset: usize, // start reading at this point -) -> Result { +) -> anyhow::Result { // step back to the beginning of the page to read it in... let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; let mut contlen: usize = 0; @@ -268,7 +274,7 @@ pub fn find_end_of_wal( wal_seg_size: usize, precise: bool, start_lsn: Lsn, // start reading WAL at this point or later -) -> Result<(XLogRecPtr, TimeLineID)> { +) -> anyhow::Result<(XLogRecPtr, TimeLineID)> { let mut high_segno: XLogSegNo = 0; let mut high_tli: TimeLineID = 0; let mut high_ispartial = false; @@ -350,19 +356,19 @@ pub fn main() { } impl XLogRecord { - pub fn from_slice(buf: &[u8]) -> XLogRecord { + pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; - XLogRecord::des(buf).unwrap() + XLogRecord::des(buf) } - pub fn from_bytes(buf: &mut B) -> XLogRecord { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogRecord::des_from(&mut buf.reader()).unwrap() + XLogRecord::des_from(&mut buf.reader()) } - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + Ok(self.ser()?.into()) } // Is this record an XLOG_SWITCH record? 
They need some special processing, @@ -372,35 +378,35 @@ impl XLogRecord { } impl XLogPageHeaderData { - pub fn from_bytes(buf: &mut B) -> XLogPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogPageHeaderData::des_from(&mut buf.reader()).unwrap() + XLogPageHeaderData::des_from(&mut buf.reader()) } } impl XLogLongPageHeaderData { - pub fn from_bytes(buf: &mut B) -> XLogLongPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap() + XLogLongPageHeaderData::des_from(&mut buf.reader()) } - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + self.ser().map(|b| b.into()) } } pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); impl CheckPoint { - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + Ok(self.ser()?.into()) } - pub fn decode(buf: &[u8]) -> Result { + pub fn decode(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; - Ok(CheckPoint::des(buf)?) + CheckPoint::des(buf) } /// Update next XID based on provided new_xid and stored epoch. @@ -438,7 +444,7 @@ impl CheckPoint { // Generate new, empty WAL segment. // We need this segment to start compute node. // -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { +pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); @@ -458,12 +464,12 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { xlp_xlog_blcksz: XLOG_BLCKSZ as u32, }; - let hdr_bytes = hdr.encode(); + let hdr_bytes = hdr.encode()?; seg_buf.extend_from_slice(&hdr_bytes); //zero out the rest of the file seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0); - seg_buf.freeze() + Ok(seg_buf.freeze()) } #[cfg(test)] diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml new file mode 100644 index 0000000000..291f6e50ac --- /dev/null +++ b/libs/remote_storage/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "remote_storage" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { version = "1.0", features = ["backtrace"] } +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +tracing = "0.1.27" +rusoto_core = "0.48" +rusoto_s3 = "0.48" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +async-trait = "0.1" + +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] +tempfile = "3.2" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs new file mode 100644 index 0000000000..9bbb855dd5 --- /dev/null +++ b/libs/remote_storage/src/lib.rs @@ -0,0 +1,232 @@ +//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. +//! No other modules from this tree are supposed to be used directly by the external code. +//! +//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: +//! * [`local_fs`] allows to use local file system as an external storage +//! * [`s3_bucket`] uses AWS S3 bucket as an external storage +//! 
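The new `remote_storage` crate factors the S3 and local-FS backends out of the pageserver behind a single `RemoteStorage` trait. As a rough usage sketch (not taken from the patch itself), constructing the generic wrapper with the local-FS backend and the default limits defined below might look like this; the paths are placeholders:

```rust
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf;

use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};

fn main() -> anyhow::Result<()> {
    // Local-FS flavour of the config, using the default sync limits (50/10)
    // declared as constants below.
    let config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(50).unwrap(),
        max_sync_errors: NonZeroU32::new(10).unwrap(),
        storage: RemoteStorageKind::LocalFs(PathBuf::from("/tmp/remote_storage_root")),
    };

    // The working directory is what local paths are resolved against
    // (the pageserver workdir at the original call sites).
    let _storage = GenericRemoteStorage::new(PathBuf::from("/tmp/pageserver_workdir"), &config)?;
    Ok(())
}
```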
+mod local_fs; +mod s3_bucket; + +use std::{ + borrow::Cow, + collections::HashMap, + ffi::OsStr, + num::{NonZeroU32, NonZeroUsize}, + path::{Path, PathBuf}, +}; + +use anyhow::Context; +use tokio::io; +use tracing::info; + +pub use self::{ + local_fs::LocalFs, + s3_bucket::{S3Bucket, S3ObjectKey}, +}; + +/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. +/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency +/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. +/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. +pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50; +pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; +/// Currently, sync happens with AWS S3, that has two limits on requests per second: +/// ~200 RPS for IAM services +/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html +/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests +/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ +pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; + +/// Storage (potentially remote) API to manage its state. +/// This storage tries to be unaware of any layered repository context, +/// providing basic CRUD operations for storage files. +#[async_trait::async_trait] +pub trait RemoteStorage: Send + Sync { + /// A way to uniquely reference a file in the remote storage. + type RemoteObjectId; + + /// Attempts to derive the storage path out of the local path, if the latter is correct. + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; + + /// Gets the download path of the given storage file. + fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result; + + /// Lists all items the storage has right now. + async fn list(&self) -> anyhow::Result>; + + /// Streams the local file contents into remote into the remote storage entry. + async fn upload( + &self, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + from_size_bytes: usize, + to: &Self::RemoteObjectId, + metadata: Option, + ) -> anyhow::Result<()>; + + /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. + async fn download( + &self, + from: &Self::RemoteObjectId, + to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), + ) -> anyhow::Result>; + + /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. 
+ async fn download_byte_range( + &self, + from: &Self::RemoteObjectId, + start_inclusive: u64, + end_exclusive: Option, + to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), + ) -> anyhow::Result>; + + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; +} + +/// TODO kb +pub enum GenericRemoteStorage { + Local(LocalFs), + S3(S3Bucket), +} + +impl GenericRemoteStorage { + pub fn new( + working_directory: PathBuf, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + match &storage_config.storage { + RemoteStorageKind::LocalFs(root) => { + info!("Using fs root '{}' as a remote storage", root.display()); + LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local) + } + RemoteStorageKind::AwsS3(s3_config) => { + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3) + } + } + } +} + +/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. +/// Immutable, cannot be changed once the file is created. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StorageMetadata(HashMap); + +fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { + if prefix == path { + anyhow::bail!( + "Prefix and the path are equal, cannot strip: '{}'", + prefix.display() + ) + } else { + path.strip_prefix(prefix).with_context(|| { + format!( + "Path '{}' is not prefixed with '{}'", + path.display(), + prefix.display(), + ) + }) + } +} + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RemoteStorageConfig { + /// Max allowed number of concurrent sync operations between the API user and the remote storage. + pub max_concurrent_syncs: NonZeroUsize, + /// Max allowed errors before the sync task is considered failed and evicted. + pub max_sync_errors: NonZeroU32, + /// The storage connection configuration. + pub storage: RemoteStorageKind, +} + +/// A kind of a remote storage to connect to, with its connection configuration. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), +} + +/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq)] +pub struct S3Config { + /// Name of the bucket to connect to. + pub bucket_name: String, + /// The region where the bucket is located at. + pub bucket_region: String, + /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. + pub prefix_in_bucket: Option, + /// A base URL to send S3 requests to. + /// By default, the endpoint is derived from a region name, assuming it's + /// an AWS S3 region name, erroring on wrong region name. + /// Endpoint provides a way to support other S3 flavors and their regions. + /// + /// Example: `http://127.0.0.1:5000` + pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. 
+ pub concurrency_limit: NonZeroUsize, +} + +impl std::fmt::Debug for S3Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("S3Config") + .field("bucket_name", &self.bucket_name) + .field("bucket_region", &self.bucket_region) + .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) + .finish() + } +} + +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + } +} diff --git a/pageserver/src/remote_storage/local_fs.rs b/libs/remote_storage/src/local_fs.rs similarity index 81% rename from pageserver/src/remote_storage/local_fs.rs rename to libs/remote_storage/src/local_fs.rs index 6772a4fbd6..50243352ee 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -1,7 +1,7 @@ //! Local filesystem acting as a remote storage. -//! Multiple pageservers can use the same "storage" of this kind by using different storage roots. +//! Multiple API users can use the same "storage" of this kind by using different storage roots. //! -//! This storage used in pageserver tests, but can also be used in cases when a certain persistent +//! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. use std::{ @@ -17,18 +17,18 @@ use tokio::{ }; use tracing::*; -use crate::remote_storage::storage_sync::path_with_suffix_extension; +use crate::path_with_suffix_extension; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; pub struct LocalFs { - pageserver_workdir: &'static Path, - root: PathBuf, + working_directory: PathBuf, + storage_root: PathBuf, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. 
- pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result { + pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result { if !root.exists() { std::fs::create_dir_all(&root).with_context(|| { format!( @@ -38,15 +38,15 @@ impl LocalFs { })?; } Ok(Self { - pageserver_workdir, - root, + working_directory, + storage_root: root, }) } fn resolve_in_storage(&self, path: &Path) -> anyhow::Result { if path.is_relative() { - Ok(self.root.join(path)) - } else if path.starts_with(&self.root) { + Ok(self.storage_root.join(path)) + } else if path.starts_with(&self.storage_root) { Ok(path.to_path_buf()) } else { bail!( @@ -85,30 +85,30 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - type StoragePath = PathBuf; + type RemoteObjectId = PathBuf; - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - Ok(self.root.join( - strip_path_prefix(self.pageserver_workdir, local_path) + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + Ok(self.storage_root.join( + strip_path_prefix(&self.working_directory, local_path) .context("local path does not belong to this storage")?, )) } - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.root, storage_path) + fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { + let relative_path = strip_path_prefix(&self.storage_root, storage_path) .context("local path does not belong to this storage")?; - Ok(self.pageserver_workdir.join(relative_path)) + Ok(self.working_directory.join(relative_path)) } - async fn list(&self) -> anyhow::Result> { - get_all_files(&self.root).await + async fn list(&self) -> anyhow::Result> { + get_all_files(&self.storage_root).await } async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, - to: &Self::StoragePath, + to: &Self::RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; @@ -194,7 +194,7 @@ impl RemoteStorage for LocalFs { async fn download( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result> { let file_path = self.resolve_in_storage(from)?; @@ -229,9 +229,9 @@ impl RemoteStorage for LocalFs { } } - async fn download_range( + async fn download_byte_range( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), @@ -288,7 +288,7 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { let file_path = self.resolve_in_storage(path)?; if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) 
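(Aside: the hunks above boil down to a symmetric path translation between the working directory and the storage root. Below is a minimal standalone sketch of that mapping, with made-up paths and helper names that are not part of the crate; the real `remote_object_id`/`local_path` also wrap errors with context.)

```rust
use std::path::{Path, PathBuf};

/// Map a file under `workdir` to its location under `storage_root`
/// (roughly what `remote_object_id` does for `LocalFs`).
fn to_remote(workdir: &Path, storage_root: &Path, local: &Path) -> Option<PathBuf> {
    local.strip_prefix(workdir).ok().map(|rel| storage_root.join(rel))
}

/// Map a stored file back to its download destination under `workdir`
/// (roughly what `local_path` does for `LocalFs`).
fn to_local(workdir: &Path, storage_root: &Path, remote: &Path) -> Option<PathBuf> {
    remote.strip_prefix(storage_root).ok().map(|rel| workdir.join(rel))
}

fn main() {
    let workdir = Path::new("/data/pageserver");
    let storage_root = Path::new("/mnt/backup");
    let local = workdir.join("timelines/some_timeline/layer_file");

    let remote = to_remote(workdir, storage_root, &local).unwrap();
    assert_eq!(remote, Path::new("/mnt/backup/timelines/some_timeline/layer_file"));

    // The round trip restores the original local path, which is what the
    // `download_destination_matches_original_path` tests below assert.
    assert_eq!(to_local(workdir, storage_root, &remote).unwrap(), local);
}
```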
@@ -354,29 +354,30 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> #[cfg(test)] mod pure_tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; + use tempfile::tempdir; use super::*; #[test] fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; + let workdir = tempdir()?.path().to_owned(); + let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), + working_directory: workdir.clone(), + storage_root: storage_root.clone(), }; - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name"); - let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?); + let local_path = workdir + .join("timelines") + .join("some_timeline") + .join("file_name"); + let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); assert_eq!( expected_path, - storage.storage_path(&local_path).expect("Matching path should map to storage path normally"), - "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir" + storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"), + "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" ); Ok(()) @@ -386,7 +387,7 @@ mod pure_tests { fn storage_path_negatives() -> anyhow::Result<()> { #[track_caller] fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { - match storage.storage_path(mismatching_path) { + match storage.remote_object_id(mismatching_path) { Ok(wrong_path) => panic!( "Expected path '{}' to error, but got storage path: {:?}", mismatching_path.display(), @@ -396,16 +397,16 @@ mod pure_tests { } } - let repo_harness = RepoHarness::create("storage_path_negatives")?; + let workdir = tempdir()?.path().to_owned(); let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: workdir.clone(), + storage_root, }; - let error_string = storage_path_error(&storage, &repo_harness.conf.workdir); + let error_string = storage_path_error(&storage, &workdir); assert!(error_string.contains("does not belong to this storage")); - assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap())); + assert!(error_string.contains(workdir.to_str().unwrap())); let mismatching_path_str = "/something/else"; let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); @@ -414,7 +415,7 @@ mod pure_tests { "Error should mention wrong path" ); assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), + error_message.contains(workdir.to_str().unwrap()), "Error should mention server workdir" ); assert!(error_message.contains("does not belong to this storage")); @@ -424,29 +425,28 @@ mod pure_tests { #[test] fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; + let workdir = tempdir()?.path().to_owned(); let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), + working_directory: workdir.clone(), + storage_root: storage_root.clone(), 
}; let name = "not a metadata"; - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name); + let local_path = workdir.join("timelines").join("some_timeline").join(name); assert_eq!( local_path, storage - .local_path( - &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?) - ) + .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?)) .expect("For a valid input, valid local path should be parsed"), "Should be able to parse metadata out of the correctly named remote delta file" ); - let local_metadata_path = repo_harness - .timeline_path(&TIMELINE_ID) - .join(METADATA_FILE_NAME); - let remote_metadata_path = storage.storage_path(&local_metadata_path)?; + let local_metadata_path = workdir + .join("timelines") + .join("some_timeline") + .join("metadata"); + let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?; assert_eq!( local_metadata_path, storage @@ -472,11 +472,10 @@ mod pure_tests { } } - let repo_harness = RepoHarness::create("local_path_negatives")?; let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: tempdir()?.path().to_owned(), + storage_root, }; let totally_wrong_path = "wrong_wrong_wrong"; @@ -488,16 +487,19 @@ mod pure_tests { #[test] fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); let storage_root = PathBuf::from("somewhere").join("else"); let dummy_storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: workdir, + storage_root, }; - let storage_path = dummy_storage.storage_path(&original_path)?; + let storage_path = dummy_storage.remote_object_id(&original_path)?; let download_destination = dummy_storage.local_path(&storage_path)?; assert_eq!( @@ -512,18 +514,17 @@ mod pure_tests { #[cfg(test)] mod fs_tests { use super::*; - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; use std::{collections::HashMap, io::Write}; use tempfile::tempdir; #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("upload_file")?; + let workdir = tempdir()?.path().to_owned(); let storage = create_storage()?; let (file, size) = create_file_for_upload( - &storage.pageserver_workdir.join("whatever"), + &storage.working_directory.join("whatever"), "whatever_contents", ) .await?; @@ -538,14 +539,14 @@ mod fs_tests { } assert!(storage.list().await?.is_empty()); - let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?; assert_eq!( storage.list().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -556,17 +557,16 @@ mod fs_tests { } fn create_storage() -> anyhow::Result { - let pageserver_workdir = 
Box::leak(Box::new(tempdir()?.path().to_owned())); - let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?; - Ok(storage) + LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage.download(&upload_target, &mut content_bytes).await?; @@ -597,14 +597,15 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_positive")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range(&upload_target, 0, None, &mut full_range_bytes) + .download_byte_range(&upload_target, 0, None, &mut full_range_bytes) .await?; assert!( metadata.is_none(), @@ -620,7 +621,7 @@ mod fs_tests { let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let same_byte = 1_000_000_000; let metadata = storage - .download_range( + .download_byte_range( &upload_target, same_byte, Some(same_byte + 1), // exclusive end @@ -642,7 +643,7 @@ mod fs_tests { let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range( + .download_byte_range( &upload_target, 0, Some(first_part_local.len() as u64), @@ -664,7 +665,7 @@ mod fs_tests { let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range( + .download_byte_range( &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), @@ -689,16 +690,17 @@ mod fs_tests { #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_negative")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let start = 10000; let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_range(&upload_target, start, Some(end), &mut io::sink()) + .download_byte_range(&upload_target, start, Some(end), &mut io::sink()) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -712,7 +714,7 @@ mod fs_tests { let non_existing_path = PathBuf::from("somewhere").join("else"); match storage - .download_range(&non_existing_path, 1, Some(3), &mut io::sink()) + .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink()) .await { Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"), @@ -727,10 +729,11 @@ mod fs_tests { 
#[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("delete_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); @@ -748,7 +751,8 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ @@ -756,7 +760,7 @@ mod fs_tests { ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?; @@ -780,7 +784,7 @@ mod fs_tests { let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let partial_download_metadata = storage - .download_range( + .download_byte_range( &upload_target, 0, Some(first_part_local.len() as u64), @@ -805,16 +809,16 @@ mod fs_tests { } async fn upload_dummy_file( - harness: &RepoHarness<'_>, + workdir: &Path, storage: &LocalFs, name: &str, metadata: Option, ) -> anyhow::Result { - let timeline_path = harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?; - let storage_path = storage.root.join(relative_timeline_path).join(name); + let timeline_path = workdir.join("timelines").join("some_timeline"); + let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; + let storage_path = storage.storage_root.join(relative_timeline_path).join(name); - let from_path = storage.pageserver_workdir.join(name); + let from_path = storage.working_directory.join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; storage.upload(file, size, &storage_path, metadata).await?; Ok(storage_path) diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs similarity index 74% rename from pageserver/src/remote_storage/s3_bucket.rs rename to libs/remote_storage/src/s3_bucket.rs index 73d828d150..01aaf7ca7e 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -1,7 +1,7 @@ //! AWS S3 storage wrapper around `rusoto` library. //! //! Respects `prefix_in_bucket` property from [`S3Config`], -//! allowing multiple pageservers to independently work with the same S3 bucket, if +//! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
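(Aside: a small self-contained sketch of the prefix handling the module doc comment above describes: trimming separators from `prefix_in_bucket` and joining workdir-relative segments into an object key. The helper names and values are illustrative only; the crate code trims the separators in a loop and builds keys per path segment.)

```rust
const S3_PREFIX_SEPARATOR: char = '/';

/// Normalize a configured prefix: drop leading and trailing separators
/// so keys never contain empty segments.
fn normalize_prefix(prefix: &str) -> String {
    prefix.trim_matches(S3_PREFIX_SEPARATOR).to_string()
}

/// Build an object key from a normalized prefix and workdir-relative segments.
fn object_key(prefix: &str, segments: &[&str]) -> String {
    let mut key = prefix.to_string();
    for segment in segments {
        key.push(S3_PREFIX_SEPARATOR);
        key.push_str(segment);
    }
    key
}

fn main() {
    let prefix = normalize_prefix("/test_prefix/");
    let key = object_key(&prefix, &["timelines", "some_timeline", "layer_file"]);
    assert_eq!(key, "test_prefix/timelines/some_timeline/layer_file");
}
```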
use std::path::{Path, PathBuf}; @@ -19,16 +19,13 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{ - config::S3Config, - remote_storage::{strip_path_prefix, RemoteStorage}, -}; +use crate::{strip_path_prefix, RemoteStorage, S3Config}; use super::StorageMetadata; -const S3_FILE_SEPARATOR: char = '/'; +const S3_PREFIX_SEPARATOR: char = '/'; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct S3ObjectKey(String); impl S3ObjectKey { @@ -36,11 +33,7 @@ impl S3ObjectKey { &self.0 } - fn download_destination( - &self, - pageserver_workdir: &Path, - prefix_to_strip: Option<&str>, - ) -> PathBuf { + fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf { let path_without_prefix = match prefix_to_strip { Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| { panic!( @@ -51,9 +44,9 @@ impl S3ObjectKey { None => &self.0, }; - pageserver_workdir.join( + workdir.join( path_without_prefix - .split(S3_FILE_SEPARATOR) + .split(S3_PREFIX_SEPARATOR) .collect::(), ) } @@ -61,7 +54,7 @@ impl S3ObjectKey { /// AWS S3 storage. pub struct S3Bucket { - pageserver_workdir: &'static Path, + workdir: PathBuf, client: S3Client, bucket_name: String, prefix_in_bucket: Option, @@ -73,7 +66,7 @@ pub struct S3Bucket { impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result { + pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result { debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -89,8 +82,11 @@ impl S3Bucket { .context("Failed to parse the s3 region from config")?, }; let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?; - let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none() - { + + let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok(); + let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok(); + + let client = if access_key_id.is_none() && secret_access_key.is_none() { debug!("Using IAM-based AWS access"); S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) } else { @@ -98,8 +94,8 @@ impl S3Bucket { S3Client::new_with( request_dispatcher, StaticProvider::new_minimal( - aws_config.access_key_id.clone().unwrap_or_default(), - aws_config.secret_access_key.clone().unwrap_or_default(), + access_key_id.unwrap_or_default(), + secret_access_key.unwrap_or_default(), ), region, ) @@ -107,12 +103,12 @@ impl S3Bucket { let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { let mut prefix = prefix; - while prefix.starts_with(S3_FILE_SEPARATOR) { + while prefix.starts_with(S3_PREFIX_SEPARATOR) { prefix = &prefix[1..] 
} let mut prefix = prefix.to_string(); - while prefix.ends_with(S3_FILE_SEPARATOR) { + while prefix.ends_with(S3_PREFIX_SEPARATOR) { prefix.pop(); } prefix @@ -120,7 +116,7 @@ impl S3Bucket { Ok(Self { client, - pageserver_workdir, + workdir, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), @@ -130,24 +126,23 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - type StoragePath = S3ObjectKey; + type RemoteObjectId = S3ObjectKey; - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?; + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let relative_path = strip_path_prefix(&self.workdir, local_path)?; let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); for segment in relative_path { - key.push(S3_FILE_SEPARATOR); + key.push(S3_PREFIX_SEPARATOR); key.push_str(&segment.to_string_lossy()); } Ok(S3ObjectKey(key)) } - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - Ok(storage_path - .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref())) + fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { + Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref())) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -187,7 +182,7 @@ impl RemoteStorage for S3Bucket { &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, - to: &Self::StoragePath, + to: &Self::RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -212,7 +207,7 @@ impl RemoteStorage for S3Bucket { async fn download( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result> { let _guard = self @@ -237,9 +232,9 @@ impl RemoteStorage for S3Bucket { Ok(object_output.metadata.map(StorageMetadata)) } - async fn download_range( + async fn download_byte_range( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), @@ -274,7 +269,7 @@ impl RemoteStorage for S3Bucket { Ok(object_output.metadata.map(StorageMetadata)) } - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -293,34 +288,30 @@ impl RemoteStorage for S3Bucket { #[cfg(test)] mod tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; + use tempfile::tempdir; use super::*; #[test] fn download_destination() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination")?; - - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name"); - let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?; + let workdir = tempdir()?.path().to_owned(); + let local_path = workdir.join("one").join("two").join("test_name"); + let relative_path = local_path.strip_prefix(&workdir)?; let key = S3ObjectKey(format!( "{}{}", - S3_FILE_SEPARATOR, + S3_PREFIX_SEPARATOR, relative_path .iter() .map(|segment| segment.to_str().unwrap()) 
.collect::>() - .join(&S3_FILE_SEPARATOR.to_string()), + .join(&S3_PREFIX_SEPARATOR.to_string()), )); assert_eq!( local_path, - key.download_destination(&repo_harness.conf.workdir, None), - "Download destination should consist of s3 path joined with the pageserver workdir prefix" + key.download_destination(&workdir, None), + "Download destination should consist of s3 path joined with the workdir prefix" ); Ok(()) @@ -328,24 +319,21 @@ mod tests { #[test] fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; + let workdir = tempdir()?.path().to_owned(); let segment_1 = "matching"; let segment_2 = "file"; - let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2); + let local_path = &workdir.join(segment_1).join(segment_2); - let storage = dummy_storage(&repo_harness.conf.workdir); + let storage = dummy_storage(workdir); let expected_key = S3ObjectKey(format!( - "{}{SEPARATOR}{}{SEPARATOR}{}", + "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}", storage.prefix_in_bucket.as_deref().unwrap_or_default(), - segment_1, - segment_2, - SEPARATOR = S3_FILE_SEPARATOR, )); let actual_key = storage - .storage_path(local_path) + .remote_object_id(local_path) .expect("Matching path should map to S3 path normally"); assert_eq!( expected_key, @@ -360,7 +348,7 @@ mod tests { fn storage_path_negatives() -> anyhow::Result<()> { #[track_caller] fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { - match storage.storage_path(mismatching_path) { + match storage.remote_object_id(mismatching_path) { Ok(wrong_key) => panic!( "Expected path '{}' to error, but got S3 key: {:?}", mismatching_path.display(), @@ -370,10 +358,10 @@ mod tests { } } - let repo_harness = RepoHarness::create("storage_path_negatives")?; - let storage = dummy_storage(&repo_harness.conf.workdir); + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); - let error_message = storage_path_error(&storage, &repo_harness.conf.workdir); + let error_message = storage_path_error(&storage, &workdir); assert!( error_message.contains("Prefix and the path are equal"), "Message '{}' does not contain the required string", @@ -387,7 +375,7 @@ mod tests { "Error should mention wrong path" ); assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), + error_message.contains(workdir.to_str().unwrap()), "Error should mention server workdir" ); assert!( @@ -401,20 +389,17 @@ mod tests { #[test] fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; - let storage = dummy_storage(&repo_harness.conf.workdir); - let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?; + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); + let timeline_dir = workdir.join("timelines").join("test_timeline"); + let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?; let s3_key = create_s3_key( &relative_timeline_path.join("not a metadata"), storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), + s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -422,14 +407,11 @@ mod tests { ); let 
s3_key = create_s3_key( - &relative_timeline_path.join(METADATA_FILE_NAME), + &relative_timeline_path.join("metadata"), storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), + s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -441,12 +423,15 @@ mod tests { #[test] fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); - let dummy_storage = dummy_storage(&repo_harness.conf.workdir); + let dummy_storage = dummy_storage(workdir); - let key = dummy_storage.storage_path(&original_path)?; + let key = dummy_storage.remote_object_id(&original_path)?; let download_destination = dummy_storage.local_path(&key)?; assert_eq!( @@ -457,9 +442,9 @@ mod tests { Ok(()) } - fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket { + fn dummy_storage(workdir: PathBuf) -> S3Bucket { S3Bucket { - pageserver_workdir, + workdir, client: S3Client::new("us-east-1".parse().unwrap()), bucket_name: "dummy-bucket".to_string(), prefix_in_bucket: Some("dummy_prefix/".to_string()), @@ -471,7 +456,7 @@ mod tests { S3ObjectKey(relative_file_path.iter().fold( prefix.unwrap_or_default().to_string(), |mut path_string, segment| { - path_string.push(S3_FILE_SEPARATOR); + path_string.push(S3_PREFIX_SEPARATOR); path_string.push_str(segment.to_str().unwrap()); path_string }, diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index fab3c388b1..857df0ec84 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -433,7 +433,12 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); + if query_string.starts_with("callmemaybe") { + // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed + error!("query handler for '{}' failed: {}", query_string, e); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index e1677f4311..ce86cf8c91 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -503,6 +503,18 @@ impl RowDescriptor<'_> { formatcode: 0, } } + + pub const fn text_col(name: &[u8]) -> RowDescriptor { + RowDescriptor { + name, + tableoid: 0, + attnum: 0, + typoid: TEXT_OID, + typlen: -1, + typmod: 0, + formatcode: 0, + } + } } #[derive(Debug)] diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index fce5ed97c1..44d81cda50 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -224,7 +224,7 @@ impl fmt::Display for ZTenantTimelineId { // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued // by the console. 
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] pub struct ZNodeId(pub u64); diff --git a/zenith/Cargo.toml b/neon_local/Cargo.toml similarity index 92% rename from zenith/Cargo.toml rename to neon_local/Cargo.toml index 0f72051f74..78d339789f 100644 --- a/zenith/Cargo.toml +++ b/neon_local/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "zenith" +name = "neon_local" version = "0.1.0" edition = "2021" @@ -7,6 +7,7 @@ edition = "2021" clap = "3.0" anyhow = "1.0" serde_json = "1" +comfy-table = "5.0.1" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } # FIXME: 'pageserver' is needed for BranchInfo. Refactor diff --git a/zenith/src/main.rs b/neon_local/src/main.rs similarity index 94% rename from zenith/src/main.rs rename to neon_local/src/main.rs index cd0cf470e8..8b54054080 100644 --- a/zenith/src/main.rs +++ b/neon_local/src/main.rs @@ -62,15 +62,15 @@ http_port = {safekeeper_http_port} struct TimelineTreeEl { /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. pub info: TimelineInfo, - /// Name, recovered from zenith config mappings + /// Name, recovered from neon config mappings pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. pub children: BTreeSet, } -// Main entry point for the 'zenith' CLI utility +// Main entry point for the 'neon_local' CLI utility // -// This utility helps to manage zenith installation. That includes following: +// This utility helps to manage neon installation. That includes following: // * Management of local postgres installations running on top of the // pageserver. // * Providing CLI api to the pageserver @@ -125,12 +125,12 @@ fn main() -> Result<()> { .takes_value(true) .required(false); - let matches = App::new("Zenith CLI") + let matches = App::new("Neon CLI") .setting(AppSettings::ArgRequiredElseHelp) .version(GIT_VERSION) .subcommand( App::new("init") - .about("Initialize a new Zenith repository") + .about("Initialize a new Neon repository") .arg(pageserver_config_args.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( @@ -258,7 +258,7 @@ fn main() -> Result<()> { None => bail!("no subcommand provided"), }; - // Check for 'zenith init' command first. + // Check for 'neon init' command first. 
let subcommand_result = if sub_name == "init" { handle_init(sub_args).map(Some) } else { @@ -481,9 +481,8 @@ fn handle_init(init_match: &ArgMatches) -> Result { }; let mut env = - LocalEnv::create_config(&toml_file).context("Failed to create zenith configuration")?; - env.init() - .context("Failed to initialize zenith repository")?; + LocalEnv::create_config(&toml_file).context("Failed to create neon configuration")?; + env.init().context("Failed to initialize neon repository")?; // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); @@ -518,7 +517,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { let pageserver = PageServerNode::from_env(env); match tenant_match.subcommand() { Some(("list", _)) => { @@ -551,17 +550,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re pageserver .tenant_config(tenant_id, tenant_conf) - .unwrap_or_else(|e| { - anyhow!( - "Tenant config failed for tenant with id {} : {}", - tenant_id, - e - ); - }); - println!( - "tenant {} successfully configured on the pageserver", - tenant_id - ); + .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; + println!("tenant {tenant_id} successfully configured on the pageserver"); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -665,35 +655,56 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let timeline_name_mappings = env.timeline_name_mappings(); - println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS"); + let mut table = comfy_table::Table::new(); + + table.load_preset(comfy_table::presets::NOTHING); + + table.set_header(&[ + "NODE", + "ADDRESS", + "TIMELINE", + "BRANCH NAME", + "LSN", + "STATUS", + ]); + for ((_, node_name), node) in cplane .nodes .iter() .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) { - // FIXME: This shows the LSN at the end of the timeline. It's not the - // right thing to do for read-only nodes that might be anchored at an - // older point in time, or following but lagging behind the primary. - let lsn_str = timeline_infos - .get(&node.timeline_id) - .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) - .unwrap_or_else(|| "?".to_string()); + let lsn_str = match node.lsn { + None => { + // -> primary node + // Use the LSN at the end of the timeline. + timeline_infos + .get(&node.timeline_id) + .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) + .unwrap_or_else(|| "?".to_string()) + } + Some(lsn) => { + // -> read-only node + // Use the node's LSN. 
+ lsn.to_string() + } + }; let branch_name = timeline_name_mappings .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); - println!( - "{}\t{}\t{}\t{}\t{}\t{}", - node_name, - node.address, - node.timeline_id, + table.add_row(&[ + node_name.as_str(), + &node.address.to_string(), + &node.timeline_id.to_string(), branch_name, - lsn_str, + lsn_str.as_str(), node.status(), - ); + ]); } + + println!("{table}"); } "create" => { let branch_name = sub_args diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 943e724c70..fe692f4304 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] # It is simpler infra-wise to have failpoints enabled by default -# It shouldnt affect perf in any way because failpoints +# It shouldn't affect perf in any way because failpoints # are not placed in hot code paths default = ["failpoints"] profiling = ["pprof"] @@ -25,7 +25,6 @@ lazy_static = "1.4.0" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-util = { version = "0.7", features = ["io"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -54,16 +53,13 @@ once_cell = "1.8.0" crossbeam-utils = "0.8.5" fail = "0.5.0" -rusoto_core = "0.47" -rusoto_s3 = "0.47" -async-trait = "0.1" - # 'experimental' is needed for the `zstd::bulk::Decompressor::upper_bound` function. zstd = { version = "0.11.1", features = ["experimental"] } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } +remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/pageserver/README.md b/pageserver/README.md index 1fd627785c..cf841d1e46 100644 --- a/pageserver/README.md +++ b/pageserver/README.md @@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a CLI examples: * Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` -* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"` +* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. For local S3 installations, refer to the their documentation for name format and credentials. 
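(Aside: with the config fields gone, credentials come only from the environment. Below is a rough sketch of the fallback the patched `S3Bucket::new` performs, assuming the rusoto 0.47 crates and `anyhow` already used in this diff; `make_client` is an illustrative helper, not part of the codebase.)

```rust
use rusoto_core::credential::{InstanceMetadataProvider, StaticProvider};
use rusoto_core::{HttpClient, Region};
use rusoto_s3::S3Client;

/// Pick credentials the way the patched constructor does: explicit env vars win,
/// otherwise fall back to IAM instance metadata (no secrets in the pageserver config).
fn make_client(region: Region) -> anyhow::Result<S3Client> {
    let dispatcher = HttpClient::new()?;
    let key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
    let secret = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
    Ok(match (key_id, secret) {
        // No env credentials at all: rely on the instance's IAM role.
        (None, None) => S3Client::new_with(dispatcher, InstanceMetadataProvider::new(), region),
        // At least one env credential set: use a static provider.
        (key_id, secret) => S3Client::new_with(
            dispatcher,
            StaticProvider::new_minimal(
                key_id.unwrap_or_default(),
                secret.unwrap_or_default(),
            ),
            region,
        ),
    })
}
```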
@@ -155,11 +155,9 @@ or bucket_name = 'some-sample-bucket' bucket_region = 'eu-north-1' prefix_in_bucket = '/test_prefix/' -access_key_id = 'SOMEKEYAAAAASADSAH*#' -secret_access_key = 'SOMEsEcReTsd292v' ``` -Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above. +`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. TODO: Sharding -------------------- diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 78a27e460f..92d35130d8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{ensure, Context, Result}; +use anyhow::{anyhow, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; use std::fmt::Write as FmtWrite; use std::io; @@ -154,9 +154,17 @@ impl<'a> Basebackup<'a> { let img = self .timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; - ensure!(img.len() == pg_constants::BLCKSZ as usize); - slru_buf.extend_from_slice(&img); + if slru == SlruKind::Clog { + ensure!( + img.len() == pg_constants::BLCKSZ as usize + || img.len() == pg_constants::BLCKSZ as usize + 8 + ); + } else { + ensure!(img.len() == pg_constants::BLCKSZ as usize); + } + + slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); @@ -315,7 +323,8 @@ impl<'a> Basebackup<'a> { let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier); + let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 01fcc1224f..9cb7e6f13d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; +use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, @@ -84,8 +85,23 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg( + Arg::new("enabled-features") + .long("enabled-features") + .takes_value(false) + .help("Show enabled compile time features"), + ) .get_matches(); + if arg_matches.is_present("enabled-features") { + let features: &[&str] = &[ + #[cfg(feature = "failpoints")] + "failpoints", + ]; + println!("{{\"features\": {features:?} }}"); + return Ok(()); + } + let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); let workdir = workdir .canonicalize() @@ -166,6 +182,14 @@ fn main() -> anyhow::Result<()> { // as a ref. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + // If failpoints are used, terminate the whole pageserver process if they are hit. 
+ let scenario = FailScenario::setup(); + if fail::has_failpoints() { + std::panic::set_hook(Box::new(|_| { + std::process::exit(1); + })); + } + // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); page_cache::init(conf.page_cache_size); @@ -181,10 +205,12 @@ fn main() -> anyhow::Result<()> { cfg_file_path.display() ) })?; - Ok(()) } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver") + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; } + + scenario.teardown(); + Ok(()) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { @@ -261,7 +287,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "http_endpoint_thread", - false, + true, move || { let router = http::make_router(conf, auth_cloned, remote_index)?; endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) @@ -275,7 +301,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "libpq endpoint thread", - false, + true, move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), )?; @@ -295,7 +321,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(); + pageserver::shutdown_pageserver(0); unreachable!() } }) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4ed2cd3842..9ee2ca468e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,6 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; +use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; use std::env; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::{Path, PathBuf}; @@ -38,18 +39,6 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; - /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. - /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency - /// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. - /// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. - pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50; - pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; - /// Currently, sync happens with AWS S3, that has two limits on requests per second: - /// ~200 RPS for IAM services - /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html - /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ - pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -320,67 +309,6 @@ impl PageServerConfigBuilder { } } -/// External backup storage configuration, enough for creating a client for that storage. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RemoteStorageConfig { - /// Max allowed number of concurrent sync operations between pageserver and the remote storage. - pub max_concurrent_timelines_sync: NonZeroUsize, - /// Max allowed errors before the sync task is considered failed and evicted. - pub max_sync_errors: NonZeroU32, - /// The storage connection configuration. - pub storage: RemoteStorageKind, -} - -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored files into. - LocalFs(PathBuf), - /// AWS S3 based storage, storing all files in the S3 bucket - /// specified by the config - AwsS3(S3Config), -} - -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once. - pub prefix_in_bucket: Option, - /// "Login" to use when connecting to bucket. - /// Can be empty for cases like AWS k8s IAM - /// where we can allow certain pods to connect - /// to the bucket directly without any credentials. - pub access_key_id: Option, - /// "Password" to use when connecting to bucket. - pub secret_access_key: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, - /// AWS S3 has various limits on its API calls, we need not to exceed those. - /// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. - pub concurrency_limit: NonZeroUsize, -} - -impl std::fmt::Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .field("concurrency_limit", &self.concurrency_limit) - .finish() - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -528,21 +456,21 @@ impl PageServerConf { let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); - let max_concurrent_timelines_sync = NonZeroUsize::new( - parse_optional_integer("max_concurrent_timelines_sync", toml)? - .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC), + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), ) - .context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?; + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; let max_sync_errors = NonZeroU32::new( parse_optional_integer("max_sync_errors", toml)? 
- .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), ) .context("Failed to parse 'max_sync_errors' as a positive integer")?; let concurrency_limit = NonZeroUsize::new( parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; @@ -557,16 +485,6 @@ impl PageServerConf { (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { bucket_name: parse_toml_string("bucket_name", bucket_name)?, bucket_region: parse_toml_string("bucket_region", bucket_region)?, - access_key_id: toml - .get("access_key_id") - .map(|access_key_id| parse_toml_string("access_key_id", access_key_id)) - .transpose()?, - secret_access_key: toml - .get("secret_access_key") - .map(|secret_access_key| { - parse_toml_string("secret_access_key", secret_access_key) - }) - .transpose()?, prefix_in_bucket: toml .get("prefix_in_bucket") .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) @@ -584,7 +502,7 @@ impl PageServerConf { }; Ok(RemoteStorageConfig { - max_concurrent_timelines_sync, + max_concurrent_syncs, max_sync_errors, storage, }) @@ -812,11 +730,11 @@ pg_distrib_dir='{}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_timelines_sync: NonZeroUsize::new( - defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC + max_concurrent_syncs: NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS ) .unwrap(), - max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) + max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), }, @@ -834,29 +752,25 @@ pg_distrib_dir='{}' let bucket_name = "some-sample-bucket".to_string(); let bucket_region = "eu-north-1".to_string(); let prefix_in_bucket = "test_prefix".to_string(); - let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string(); - let secret_access_key = "SOMEsEcReTsd292v".to_string(); let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap(); + let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); let identical_toml_declarations = &[ format!( r#"[remote_storage] -max_concurrent_timelines_sync = {max_concurrent_timelines_sync} +max_concurrent_syncs = {max_concurrent_syncs} max_sync_errors = {max_sync_errors} bucket_name = '{bucket_name}' bucket_region = '{bucket_region}' prefix_in_bucket = '{prefix_in_bucket}' -access_key_id = '{access_key_id}' -secret_access_key = '{secret_access_key}' endpoint = '{endpoint}' concurrency_limit = {s3_concurrency_limit}"# ), format!( - "remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", + "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ + 
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", ), ]; @@ -879,13 +793,11 @@ pg_distrib_dir='{}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_timelines_sync, + max_concurrent_syncs, max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), bucket_region: bucket_region.clone(), - access_key_id: Some(access_key_id.clone()), - secret_access_key: Some(secret_access_key.clone()), prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5903dea372..0104df826e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,17 +3,16 @@ use std::sync::Arc; use anyhow::{Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; +use remote_storage::GenericRemoteStorage; use tracing::*; use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, }; -use crate::config::RemoteStorageKind; -use crate::remote_storage::{ - download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, -}; use crate::repository::Repository; +use crate::storage_sync; +use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines}; @@ -37,11 +36,6 @@ struct State { remote_storage: Option, } -enum GenericRemoteStorage { - Local(LocalFs), - S3(S3Bucket), -} - impl State { fn new( conf: &'static PageServerConf, @@ -57,14 +51,7 @@ impl State { let remote_storage = conf .remote_storage_config .as_ref() - .map(|storage_config| match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local) - } - RemoteStorageKind::AwsS3(s3_config) => { - S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3) - } - }) + .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) .transpose() .context("Failed to init generic remote storage")?; @@ -273,14 +260,14 @@ async fn timeline_attach_handler(request: Request) -> Result { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await @@ -309,35 +296,32 @@ async fn timeline_attach_handler(request: Request) -> Result index_accessor.add_timeline_entry(sync_id, new_timeline), } - schedule_timeline_download(tenant_id, timeline_id); + storage_sync::schedule_layer_download(tenant_id, timeline_id); json_response(StatusCode::ACCEPTED, ()) } -async fn try_download_shard_data( +async fn try_download_index_part_data( state: &State, sync_id: ZTenantTimelineId, ) -> anyhow::Result> { - let shard = match state.remote_storage.as_ref() { + let index_part = match state.remote_storage.as_ref() { Some(GenericRemoteStorage::Local(local_storage)) => { - download_index_part(state.conf, local_storage, sync_id).await + storage_sync::download_index_part(state.conf, local_storage, sync_id).await } Some(GenericRemoteStorage::S3(s3_storage)) => { - download_index_part(state.conf, s3_storage, sync_id).await + storage_sync::download_index_part(state.conf, s3_storage, sync_id).await } None => return Ok(None), } - .with_context(|| format!("Failed to download 
index shard for timeline {}", sync_id))?; + .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?; let timeline_path = state .conf .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - RemoteTimeline::from_index_part(&timeline_path, shard) + RemoteTimeline::from_index_part(&timeline_path, index_part) .map(Some) .with_context(|| { - format!( - "Failed to convert index shard into remote timeline for timeline {}", - sync_id - ) + format!("Failed to convert index part into remote timeline for timeline {sync_id}") }) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 8f49903e6c..703ee8f1b1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -274,7 +274,7 @@ fn import_control_file( // Extract the checkpoint record and import it separately. let pg_control = ControlFileData::decode(&buffer)?; - let checkpoint_bytes = pg_control.checkPointCopy.encode(); + let checkpoint_bytes = pg_control.checkPointCopy.encode()?; modification.put_checkpoint(checkpoint_bytes)?; Ok(pg_control) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 0d1d91d139..facc484c0a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -21,8 +21,8 @@ use utils::bin_ser::BeSer; use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; +use std::collections::{BTreeSet, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; @@ -36,10 +36,9 @@ use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config; use crate::config::PageServerConf; use crate::keyspace::KeySpace; +use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::page_cache; -use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, }; @@ -50,6 +49,7 @@ use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; +use crate::{page_cache, storage_sync}; use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, @@ -395,9 +395,22 @@ impl Repository for LayeredRepository { fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { let mut timelines = self.timelines.lock().unwrap(); + // check no child timelines, because detach will remove files, which will brake child branches + // FIXME this can still be violated because we do not guarantee + // that all ancestors are downloaded/attached to the same pageserver + let num_children = timelines + .iter() + .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) + .count(); + + ensure!( + num_children == 0, + "Cannot detach timeline which has child timelines" + ); + ensure!( timelines.remove(&timeline_id).is_some(), - "cannot detach timeline {timeline_id} that is not available locally" + "Cannot detach timeline {timeline_id} that is not available locally" ); Ok(()) } @@ -417,7 +430,7 @@ impl Repository for LayeredRepository { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), Entry::Vacant(entry) => { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = Self::load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) }, @@ -444,7 +457,7 @@ enum LayeredTimelineEntry { impl LayeredTimelineEntry { fn timeline_id(&self) -> ZTimelineId { match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid, + LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, LayeredTimelineEntry::Unloaded { id, .. } => *id, } } @@ -604,21 +617,17 @@ impl LayeredRepository { fn load_local_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result> { - let metadata = Self::load_metadata(self.conf, timelineid, self.tenant_id) + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); let ancestor = metadata .ancestor_timeline() .map(|ancestor_timeline_id| { - trace!( - "loading {}'s ancestor {}", - timelineid, - &ancestor_timeline_id - ); + trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id); self.get_timeline_load_internal(ancestor_timeline_id, timelines) }) .transpose() @@ -632,7 +641,7 @@ impl LayeredRepository { Arc::clone(&self.tenant_conf), metadata, ancestor, - timelineid, + timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, @@ -765,17 +774,6 @@ impl LayeredRepository { Ok(()) } - fn load_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - ) -> Result { - let path = metadata_path(conf, timelineid, tenantid); - info!("loading metadata from {}", path.display()); - let metadata_bytes = std::fs::read(&path)?; - TimelineMetadata::from_bytes(&metadata_bytes) - } - // // How garbage collection works: // @@ -902,8 +900,8 @@ pub struct LayeredTimeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, layers: RwLock, @@ -1177,50 +1175,50 @@ impl LayeredTimeline { tenant_conf: Arc>, metadata: TimelineMetadata, ancestor: Option, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let flush_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "layer flush", - &tenantid.to_string(), - &timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) .unwrap(); let compact_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "compact", - &tenantid.to_string(), - &timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) 
.unwrap(); let create_images_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "create images", - &tenantid.to_string(), - &timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) .unwrap(); let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); LayeredTimeline { conf, tenant_conf, - timelineid, - tenantid, + timeline_id, + tenant_id, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1272,7 +1270,7 @@ impl LayeredTimeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid); + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for direntry in fs::read_dir(timeline_path)? { let direntry = direntry?; @@ -1284,7 +1282,7 @@ impl LayeredTimeline { if imgfilename.lsn > disk_consistent_lsn { warn!( "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timelineid, disk_consistent_lsn + imgfilename, self.timeline_id, disk_consistent_lsn ); rename_to_backup(direntry.path())?; @@ -1292,7 +1290,7 @@ impl LayeredTimeline { } let layer = - ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename); + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); trace!("found layer {}", layer.filename().display()); layers.insert_historic(Arc::new(layer)); @@ -1307,7 +1305,7 @@ impl LayeredTimeline { if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { warn!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timelineid, disk_consistent_lsn + deltafilename, self.timeline_id, disk_consistent_lsn ); rename_to_backup(direntry.path())?; @@ -1315,7 +1313,7 @@ impl LayeredTimeline { } let layer = - DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename); + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); trace!("found layer {}", layer.filename().display()); layers.insert_historic(Arc::new(layer)); @@ -1434,7 +1432,8 @@ impl LayeredTimeline { let layers = timeline.layers.read().unwrap(); - // Check the open and frozen in-memory layers first + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { @@ -1452,7 +1451,7 @@ impl LayeredTimeline { continue; } } - for frozen_layer in layers.frozen_layers.iter() { + for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); @@ -1496,7 +1495,7 @@ impl LayeredTimeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
// We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = - cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?; + cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) } @@ -1505,12 +1504,20 @@ impl LayeredTimeline { let ancestor = self .ancestor_timeline .as_ref() - .expect("there should be an ancestor") + .with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })? .ensure_loaded() .with_context(|| { format!( - "Cannot get the whole layer for read locked: timeline {} is not present locally", - self.get_ancestor_timeline_id().unwrap()) + "Ancestor timeline is not is not loaded. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) })?; Ok(Arc::clone(ancestor)) } @@ -1545,12 +1552,12 @@ impl LayeredTimeline { trace!( "creating layer for write at {}/{} for record at {}", - self.timelineid, + self.timeline_id, start_lsn, lsn ); let new_layer = - InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?; + InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; let layer_rc = Arc::new(new_layer); layers.open_layer = Some(Arc::clone(&layer_rc)); @@ -1624,8 +1631,8 @@ impl LayeredTimeline { let self_clone = Arc::clone(self); thread_mgr::spawn( thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenantid), - Some(self.timelineid), + Some(self.tenant_id), + Some(self.timeline_id), "layer flush thread", false, move || self_clone.flush_frozen_layers(false), @@ -1694,10 +1701,13 @@ impl LayeredTimeline { // them all in parallel. par_fsync::par_fsync(&[ new_delta_path.clone(), - self.conf.timeline_path(&self.timelineid, &self.tenantid), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; + fail_point!("checkpoint-before-sync"); - // Finally, replace the frozen in-memory layer with the new on-disk layers + fail_point!("flush-frozen"); + + // Finally, replace the frozen in-memory layer with the new on-disk layer { let mut layers = self.layers.write().unwrap(); let l = layers.frozen_layers.pop_front(); @@ -1718,6 +1728,7 @@ impl LayeredTimeline { // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); + fail_point!("checkpoint-after-sync"); // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. 
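Note on the failpoints introduced above: the new fail_point!("checkpoint-before-sync"), fail_point!("flush-frozen") and fail_point!("checkpoint-after-sync") calls are no-ops unless the failpoints feature is compiled in. As a rough illustration only (the project's real tests may drive failpoints differently, for example through a management endpoint), the fail crate lets a test activate a named failpoint like this:

    // Hypothetical test helper, assuming the `fail` crate with its failpoints
    // feature enabled; "panic" and "off" are standard fail-crate actions.
    fn simulate_crash_before_sync() -> Result<(), String> {
        // Any thread that reaches fail_point!("checkpoint-before-sync") will now panic,
        // emulating a pageserver crash between writing a layer file and fsyncing it.
        fail::cfg("checkpoint-before-sync", "panic")?;
        Ok(())
    }

    // Later, restore normal operation so subsequent checkpoints run through:
    // fail::cfg("checkpoint-before-sync", "off")?;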
@@ -1762,8 +1773,8 @@ impl LayeredTimeline { LayeredRepository::save_metadata( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &metadata, false, )?; @@ -1772,11 +1783,11 @@ impl LayeredTimeline { PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); if self.upload_layers.load(atomic::Ordering::Relaxed) { - schedule_timeline_checkpoint_upload( - self.tenantid, - self.timelineid, - new_delta_path, - metadata, + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from([new_delta_path]), + Some(metadata), ); } @@ -1827,7 +1838,8 @@ impl LayeredTimeline { let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed - if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenantid, self.timelineid) + if let Ok(pgdir) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { let (partitioning, lsn) = pgdir.repartition( self.get_last_record_lsn(), @@ -1836,11 +1848,21 @@ impl LayeredTimeline { let timer = self.create_images_time_histo.start_timer(); // 2. Create new image layers for partitions that have been modified // "enough". + let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len()); for part in partitioning.parts.iter() { if self.time_for_new_image_layer(part, lsn)? { - self.create_image_layer(part, lsn)?; + let new_path = self.create_image_layer(part, lsn)?; + layer_paths_to_upload.insert(new_path); } } + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + None, + ); + } timer.stop_and_record(); // 3. Compact @@ -1861,7 +1883,7 @@ impl LayeredTimeline { for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { - let img_lsn = if let Some(ref last_img) = last_img { + let img_lsn = if let Some(last_img) = last_img { last_img.get_lsn_range().end } else { Lsn(0) @@ -1882,11 +1904,11 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> { + fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { let img_range = partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = - ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; + ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?; for range in &partition.ranges { let mut key = range.start; @@ -1909,16 +1931,17 @@ impl LayeredTimeline { // and fsync them all in parallel. par_fsync::par_fsync(&[ image_layer.path(), - self.conf.timeline_path(&self.timelineid, &self.tenantid), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; // FIXME: Do we need to do something to upload it to remote storage here? 
let mut layers = self.layers.write().unwrap(); + let new_path = image_layer.path(); layers.insert_historic(Arc::new(image_layer)); drop(layers); - Ok(()) + Ok(new_path) } fn compact_level0(&self, target_file_size: u64) -> Result<()> { @@ -2008,8 +2031,8 @@ impl LayeredTimeline { writer = Some(DeltaLayerWriter::new( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, key, lsn_range.clone(), dictionary, @@ -2034,7 +2057,7 @@ impl LayeredTimeline { let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); // Fsync all the layer files and directory using multiple threads to // minimize latency. @@ -2044,18 +2067,38 @@ impl LayeredTimeline { } } let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); for l in new_layers { + new_layer_paths.insert(l.path()); layers.insert_historic(Arc::new(l)); } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len()); for l in level0_deltas { l.delete()?; - layers.remove_historic(l.clone()); + if let Some(path) = l.local_path() { + layer_paths_do_delete.insert(path); + } + layers.remove_historic(l); } drop(layers); + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + new_layer_paths, + None, + ); + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_do_delete, + ); + } + Ok(()) } @@ -2108,7 +2151,7 @@ impl LayeredTimeline { let cutoff = gc_info.cutoff; let pitr = gc_info.pitr; - let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. @@ -2118,7 +2161,7 @@ impl LayeredTimeline { debug!("retain_lsns: {:?}", retain_lsns); - let mut layers_to_remove: Vec> = Vec::new(); + let mut layers_to_remove = Vec::new(); // Scan all on-disk layers in the timeline. // @@ -2229,13 +2272,24 @@ impl LayeredTimeline { // Actually delete the layers from disk and remove them from the map. // (couldn't do this in the loop above, because you cannot modify a collection // while iterating it. 
BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { doomed_layer.delete()?; - layers.remove_historic(doomed_layer.clone()); - + if let Some(path) = doomed_layer.local_path() { + layer_paths_to_delete.insert(path); + } + layers.remove_historic(doomed_layer); result.layers_removed += 1; } + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_to_delete, + ); + } + result.elapsed = now.elapsed()?; Ok(result) } @@ -2299,8 +2353,8 @@ impl LayeredTimeline { if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); cache.memorize_materialized_page( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, key, last_rec_lsn, &img, @@ -2382,6 +2436,26 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } +fn load_metadata( + conf: &'static PageServerConf, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, +) -> anyhow::Result { + let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) +} + /// /// Tests that are specific to the layered storage format. /// @@ -2416,9 +2490,19 @@ pub mod tests { let err = harness.try_load().err().expect("should fail"); assert_eq!(err.to_string(), "failed to load local metadata"); - assert_eq!( - err.source().unwrap().to_string(), - "metadata checksum mismatch" + + let mut found_error_message = false; + let mut err_source = err.source(); + while let Some(source) = err_source { + if source.to_string() == "metadata checksum mismatch" { + found_error_message = true; + break; + } + err_source = source.source(); + } + assert!( + found_error_message, + "didn't find the corrupted metadata error" ); Ok(()) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index b7c4873ce3..575b1df82e 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -222,6 +222,10 @@ impl Layer for DeltaLayer { PathBuf::from(self.layer_name().to_string()) } + fn local_path(&self) -> Option { + Some(self.path()) + } + fn get_value_reconstruct_data( &self, key: Key, @@ -388,17 +392,31 @@ impl Layer for DeltaLayer { // A subroutine to dump a single blob let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; - let val = if let Some(decompressor) = &mut decompressor { + let buf = cursor.read_blob(pos).with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; + let decompressed_buf = if let Some(decompressor) = &mut decompressor { let decompressed_max_len = zstd::bulk::Decompressor::upper_bound(&buf) .ok_or_else(|| anyhow!("could not get decompressed length"))?; decompress_buf.clear(); decompress_buf.reserve(decompressed_max_len); let _ = decompressor.decompress_to_buffer(&buf, &mut decompress_buf); - Value::des(&decompress_buf) + &decompress_buf } else { - Value::des(&buf) - }?; + &buf + }; + + let val = Value::des(decompressed_buf) + 
.with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + file.file.path.display() + ) + })?; + let desc = match val { Value::Image(img) => { format!(" img {} bytes", img.len()) diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 7a9fe6f2b7..e747192d96 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -11,7 +11,6 @@ //! - page-oriented //! //! TODO: -//! - better errors (e.g. with thiserror?) //! - maybe something like an Adaptive Radix Tree would be more efficient? //! - the values stored by image and delta layers are offsets into the file, //! and they are in monotonically increasing order. Prefix compression would @@ -19,11 +18,12 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! -use anyhow; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use hex; -use std::cmp::Ordering; +use std::{cmp::Ordering, io, result}; +use thiserror::Error; +use tracing::error; use crate::layered_repository::block_io::{BlockReader, BlockWriter}; @@ -86,6 +86,23 @@ impl Value { } } +#[derive(Error, Debug)] +pub enum DiskBtreeError { + #[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)] + AppendOverflow(u64), + + #[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")] + UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> }, + + #[error("Could not push to new leaf node")] + FailedToPushToNewLeafNode, + + #[error("IoError: {0}")] + Io(#[from] io::Error), +} + +pub type Result = result::Result; + /// This is the on-disk representation. struct OnDiskNode<'a, const L: usize> { // Fixed-width fields @@ -106,12 +123,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { /// /// Interpret a PAGE_SZ page as a node. /// - fn deparse(buf: &[u8]) -> OnDiskNode { + fn deparse(buf: &[u8]) -> Result> { let mut cursor = std::io::Cursor::new(buf); - let num_children = cursor.read_u16::().unwrap(); - let level = cursor.read_u8().unwrap(); - let prefix_len = cursor.read_u8().unwrap(); - let suffix_len = cursor.read_u8().unwrap(); + let num_children = cursor.read_u16::()?; + let level = cursor.read_u8()?; + let prefix_len = cursor.read_u8()?; + let suffix_len = cursor.read_u8()?; let mut off = cursor.position(); let prefix_off = off as usize; @@ -129,7 +146,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { let keys = &buf[keys_off..keys_off + keys_len]; let values = &buf[values_off..values_off + values_len]; - OnDiskNode { + Ok(OnDiskNode { num_children, level, prefix_len, @@ -137,7 +154,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { prefix, keys, values, - } + }) } /// @@ -149,7 +166,11 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { Value::from_slice(value_slice) } - fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result { + fn binary_search( + &self, + search_key: &[u8; L], + keybuf: &mut [u8], + ) -> result::Result { let mut size = self.num_children as usize; let mut low = 0; let mut high = size; @@ -209,7 +230,7 @@ where /// /// Read the value for given key. Returns the value, or None if it doesn't exist. 
/// - pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result> { + pub fn get(&self, search_key: &[u8; L]) -> Result> { let mut result: Option = None; self.visit(search_key, VisitDirection::Forwards, |key, value| { if key == search_key { @@ -230,7 +251,7 @@ where search_key: &[u8; L], dir: VisitDirection, mut visitor: V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { @@ -243,7 +264,7 @@ where search_key: &[u8; L], dir: VisitDirection, visitor: &mut V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { @@ -260,11 +281,11 @@ where search_key: &[u8; L], dir: VisitDirection, visitor: &mut V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { - let node = OnDiskNode::deparse(node_buf); + let node = OnDiskNode::deparse(node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -369,15 +390,15 @@ where } #[allow(dead_code)] - pub fn dump(&self) -> anyhow::Result<()> { + pub fn dump(&self) -> Result<()> { self.dump_recurse(self.root_blk, &[], 0) } - fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> { + fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> { let blk = self.reader.read_blk(self.start_blk + blknum)?; let buf: &[u8] = blk.as_ref(); - let node = OnDiskNode::::deparse(buf); + let node = OnDiskNode::::deparse(buf)?; print!("{:indent$}", "", indent = depth * 2); println!( @@ -442,17 +463,24 @@ where } } - pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> { - assert!(value <= MAX_VALUE); + pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> { + if value > MAX_VALUE { + return Err(DiskBtreeError::AppendOverflow(value)); + } if let Some(last_key) = &self.last_key { - assert!(key > last_key, "unsorted input"); + if key <= last_key { + return Err(DiskBtreeError::UnsortedInput { + key: key.as_slice().into(), + last_key: last_key.as_slice().into(), + }); + } } self.last_key = Some(*key); - Ok(self.append_internal(key, Value::from_u64(value))?) + self.append_internal(key, Value::from_u64(value)) } - fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> { + fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> { // Try to append to the current leaf buffer let last = self.stack.last_mut().unwrap(); let level = last.level; @@ -476,14 +504,15 @@ where // key to it. let mut last = BuildNode::new(level); if !last.push(key, value) { - panic!("could not push to new leaf node"); + return Err(DiskBtreeError::FailedToPushToNewLeafNode); } + self.stack.push(last); Ok(()) } - fn flush_node(&mut self) -> Result<(), std::io::Error> { + fn flush_node(&mut self) -> Result<()> { let last = self.stack.pop().unwrap(); let buf = last.pack(); let downlink_key = last.first_key(); @@ -505,7 +534,7 @@ where /// (In the image and delta layers, it is stored in the beginning of the file, /// in the summary header) /// - pub fn finish(mut self) -> Result<(u32, W), std::io::Error> { + pub fn finish(mut self) -> Result<(u32, W)> { // flush all levels, except the root. 
while self.stack.len() > 1 { self.flush_node()?; @@ -692,14 +721,14 @@ mod tests { impl BlockReader for TestDisk { type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>; - fn read_blk(&self, blknum: u32) -> Result { + fn read_blk(&self, blknum: u32) -> io::Result { let mut buf = [0u8; PAGE_SZ]; buf.copy_from_slice(&self.blocks[blknum as usize]); Ok(std::rc::Rc::new(buf)) } } impl BlockWriter for &mut TestDisk { - fn write_blk(&mut self, buf: Bytes) -> Result { + fn write_blk(&mut self, buf: Bytes) -> io::Result { let blknum = self.blocks.len(); self.blocks.push(buf); Ok(blknum as u32) @@ -707,7 +736,7 @@ mod tests { } #[test] - fn basic() -> anyhow::Result<()> { + fn basic() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); @@ -788,7 +817,7 @@ mod tests { } #[test] - fn lots_of_keys() -> anyhow::Result<()> { + fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); @@ -882,7 +911,7 @@ mod tests { } #[test] - fn random_data() -> anyhow::Result<()> { + fn random_data() -> Result<()> { // Generate random keys with exponential distribution, to // exercise the prefix compression const NUM_KEYS: usize = 100000; @@ -927,21 +956,27 @@ mod tests { } #[test] - #[should_panic(expected = "unsorted input")] fn unsorted_input() { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk); let _ = writer.append(b"ba", 1); let _ = writer.append(b"bb", 2); - let _ = writer.append(b"aa", 3); + let err = writer.append(b"aa", 3).expect_err("should've failed"); + match err { + DiskBtreeError::UnsortedInput { key, last_key } => { + assert_eq!(key.as_ref(), b"aa".as_slice()); + assert_eq!(last_key.as_ref(), b"bb".as_slice()); + } + _ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"), + } } /// /// This test contains a particular data set, see disk_btree_test_data.rs /// #[test] - fn particular_data() -> anyhow::Result<()> { + fn particular_data() -> Result<()> { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 6b43806589..29b3aee625 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -126,6 +126,10 @@ impl Layer for ImageLayer { PathBuf::from(self.layer_name().to_string()) } + fn local_path(&self) -> Option { + Some(self.path()) + } + fn get_tenant_id(&self) -> ZTenantId { self.tenantid } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index a65c249fbd..20cdb5ec5d 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -86,6 +86,10 @@ impl Layer for InMemoryLayer { )) } + fn local_path(&self) -> Option { + None + } + fn get_tenant_id(&self) -> ZTenantId { self.tenantid } @@ -208,7 +212,7 @@ impl Layer for InMemoryLayer { write!(&mut desc, " img {} bytes", img.len())?; } Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec); + let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); write!( &mut desc, " rec {} bytes will_init: {} {}", diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 03ee8b8ef1..7491294c03 100644 --- 
a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -43,10 +43,13 @@ pub struct LayerMap { pub next_open_layer_at: Option, /// - /// The frozen layer, if any, contains WAL older than the current 'open_layer' - /// or 'next_open_layer_at', but newer than any historic layer. The frozen - /// layer is during checkpointing, when an InMemoryLayer is being written out - /// to disk. + /// Frozen layers, if any. Frozen layers are in-memory layers that + /// are no longer added to, but haven't been written out to disk + /// yet. They contain WAL older than the current 'open_layer' or + /// 'next_open_layer_at', but newer than any historic layer. + /// The frozen layers are in order from oldest to newest, so that + /// the newest one is in the 'back' of the VecDeque, and the oldest + /// in the 'front'. /// pub frozen_layers: VecDeque>, @@ -129,17 +132,15 @@ impl LayerMap { // this layer contains the requested point in the key/lsn space. // No need to search any further trace!( - "found layer {} for request on {} at {}", + "found layer {} for request on {key} at {end_lsn}", l.filename().display(), - key, - end_lsn ); latest_delta.replace(Arc::clone(l)); break; } // this layer's end LSN is smaller than the requested point. If there's // nothing newer, this is what we need to return. Remember this. - if let Some(ref old_candidate) = latest_delta { + if let Some(old_candidate) = &latest_delta { if l.get_lsn_range().end > old_candidate.get_lsn_range().end { latest_delta.replace(Arc::clone(l)); } @@ -149,10 +150,8 @@ impl LayerMap { } if let Some(l) = latest_delta { trace!( - "found (old) layer {} for request on {} at {}", + "found (old) layer {} for request on {key} at {end_lsn}", l.filename().display(), - key, - end_lsn ); let lsn_floor = std::cmp::max( Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), @@ -163,17 +162,13 @@ impl LayerMap { layer: l, })) } else if let Some(l) = latest_img { - trace!( - "found img layer and no deltas for request on {} at {}", - key, - end_lsn - ); + trace!("found img layer and no deltas for request on {key} at {end_lsn}"); Ok(Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, })) } else { - trace!("no layer found for request on {} at {}", key, end_lsn); + trace!("no layer found for request on {key} at {end_lsn}"); Ok(None) } } @@ -191,7 +186,6 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. /// - #[allow(dead_code)] pub fn remove_historic(&mut self, layer: Arc) { let len_before = self.historic_layers.len(); @@ -250,7 +244,7 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> std::slice::Iter> { + pub fn iter_historic_layers(&self) -> impl Iterator> { self.historic_layers.iter() } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index aad631c5c4..aaf765b83d 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -105,6 +105,9 @@ pub trait Layer: Send + Sync { /// log messages, even though they're never not on disk.) fn filename(&self) -> PathBuf; + /// If a layer has a corresponding file on a local filesystem, return its absolute path. + fn local_path(&self) -> Option; + /// /// Return data needed to reconstruct given page at LSN. 
/// diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 94219c7840..83985069ec 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -9,8 +9,8 @@ pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; pub mod reltag; -pub mod remote_storage; pub mod repository; +pub mod storage_sync; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_threads; @@ -67,7 +67,7 @@ pub type RepositoryImpl = LayeredRepository; pub type DatadirTimelineImpl = DatadirTimeline; -pub fn shutdown_pageserver() { +pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); @@ -94,5 +94,5 @@ pub fn shutdown_pageserver() { thread_mgr::shutdown_threads(None, None, None); info!("Shut down successfully completed"); - std::process::exit(0); + std::process::exit(exit_code); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0adafab8ba..da3dedfc84 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -31,7 +31,7 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; -use crate::pgdatadir_mapping::DatadirTimeline; +use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::repository::Repository; @@ -42,12 +42,16 @@ use crate::thread_mgr::ThreadKind; use crate::walreceiver; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; +use postgres_ffi::xlog_utils::to_pg_timestamp; + +use postgres_ffi::pg_constants; // Wrapped in libpq CopyData enum PagestreamFeMessage { Exists(PagestreamExistsRequest), Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), } // Wrapped in libpq CopyData @@ -56,6 +60,7 @@ enum PagestreamBeMessage { Nblocks(PagestreamNblocksResponse), GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), } #[derive(Debug)] @@ -80,6 +85,13 @@ struct PagestreamGetPageRequest { blkno: u32, } +#[derive(Debug)] +struct PagestreamDbSizeRequest { + latest: bool, + lsn: Lsn, + dbnode: u32, +} + #[derive(Debug)] struct PagestreamExistsResponse { exists: bool, @@ -100,6 +112,11 @@ struct PagestreamErrorResponse { message: String, } +#[derive(Debug)] +struct PagestreamDbSizeResponse { + db_size: i64, +} + impl PagestreamFeMessage { fn parse(mut body: Bytes) -> anyhow::Result { // TODO these gets can fail @@ -141,6 +158,11 @@ impl PagestreamFeMessage { }, blkno: body.get_u32(), })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), } } @@ -171,6 +193,10 @@ impl PagestreamBeMessage { bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } } bytes.into() @@ -366,6 +392,11 @@ impl PageServerHandler { .observe_closure_duration(|| { self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) }), + PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) + .observe_closure_duration(|| { + self.handle_db_size_request(timeline.as_ref(), &req) + }), }; let response 
= response.unwrap_or_else(|e| { @@ -486,6 +517,32 @@ impl PageServerHandler { })) } + fn handle_db_size_request( + &self, + timeline: &DatadirTimeline, + req: &PagestreamDbSizeRequest, + ) -> Result { + let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + + let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let mut total_blocks: i64 = 0; + + for rel in all_rels { + if rel.forknum == 0 { + let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0); + total_blocks += n_blocks as i64; + } + } + + let db_size = total_blocks * pg_constants::BLCKSZ as i64; + + Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + db_size, + })) + } + fn handle_get_page_at_lsn_request( &self, timeline: &DatadirTimeline, @@ -805,6 +862,33 @@ impl postgres_backend::Handler for PageServerHandler { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("get_lsn_by_timestamp ") { + // Locate LSN of last transaction with timestamp less or equal than sppecified + // TODO lazy static + let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$") + .unwrap(); + let caps = re + .captures(query_string) + .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; + + let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) + .context("Cannot load local timeline")?; + + let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; + let timestamp_pg = to_pg_timestamp(timestamp); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"lsn", + )]))?; + let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? { + LsnForTimestamp::Present(lsn) => format!("{}", lsn), + LsnForTimestamp::Future(_lsn) => "future".into(), + LsnForTimestamp::Past(_lsn) => "past".into(), + }; + pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 071eccc05d..c052aa3d69 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,6 +13,7 @@ use crate::repository::{Repository, Timeline}; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use postgres_ffi::xlog_utils::TimestampTz; use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; @@ -45,6 +46,13 @@ where current_logical_size: AtomicIsize, } +#[derive(Debug)] +pub enum LsnForTimestamp { + Present(Lsn), + Future(Lsn), + Past(Lsn), +} + impl DatadirTimeline { pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { @@ -202,6 +210,106 @@ impl DatadirTimeline { Ok(exists) } + /// Locate LSN, such that all transactions that committed before + /// 'search_timestamp' are visible, but nothing newer is. + /// + /// This is not exact. 
Commit timestamps are not guaranteed to be ordered, + /// so it's not well defined which LSN you get if there were multiple commits + /// "in flight" at that point in time. + /// + pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn(); + let min_lsn = *gc_cutoff_lsn_guard; + let max_lsn = self.tline.get_last_record_lsn(); + + // LSNs are always 8-byte aligned. low/mid/high represent the + // LSN divided by 8. + let mut low = min_lsn.0 / 8; + let mut high = max_lsn.0 / 8 + 1; + + let mut found_smaller = false; + let mut found_larger = false; + while low < high { + // cannot overflow, high and low are both smaller than u64::MAX / 2 + let mid = (high + low) / 2; + + let cmp = self.is_latest_commit_timestamp_ge_than( + search_timestamp, + Lsn(mid * 8), + &mut found_smaller, + &mut found_larger, + )?; + + if cmp { + high = mid; + } else { + low = mid + 1; + } + } + match (found_smaller, found_larger) { + (false, false) => { + // This can happen if no commit records have been processed yet, e.g. + // just after importing a cluster. + bail!("no commit timestamps found"); + } + (true, false) => { + // Didn't find any commit timestamps larger than the request + Ok(LsnForTimestamp::Future(max_lsn)) + } + (false, true) => { + // Didn't find any commit timestamps smaller than the request + Ok(LsnForTimestamp::Past(max_lsn)) + } + (true, true) => { + // low is the LSN of the first commit record *after* the search_timestamp, + // Back off by one to get to the point just before the commit. + // + // FIXME: it would be better to get the LSN of the previous commit. + // Otherwise, if you restore to the returned LSN, the database will + // include physical changes from later commits that will be marked + // as aborted, and will need to be vacuumed away. + Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + } + } + } + + /// + /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any + /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'. + /// + /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits + /// with a smaller/larger timestamp. + /// + fn is_latest_commit_timestamp_ge_than( + &self, + search_timestamp: TimestampTz, + probe_lsn: Lsn, + found_smaller: &mut bool, + found_larger: &mut bool, + ) -> Result { + for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? { + let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + for blknum in (0..nblocks).rev() { + let clog_page = + self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + + if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + if timestamp >= search_timestamp { + *found_larger = true; + return Ok(true); + } else { + *found_smaller = true; + } + } + } + } + Ok(false) + } + /// Get a list of SLRU segments pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs deleted file mode 100644 index cfa09dce14..0000000000 --- a/pageserver/src/remote_storage.rs +++ /dev/null @@ -1,412 +0,0 @@ -//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. -//! 
This particular module serves as a public API border between pageserver and the internal storage machinery. -//! No other modules from this tree are supposed to be used directly by the external code. -//! -//! There are a few components the storage machinery consists of: -//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: -//! * [`local_fs`] allows to use local file system as an external storage -//! * [`s3_bucket`] uses AWS S3 bucket as an external storage -//! -//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. -//! Synchronization internals are split into submodules -//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files -//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively -//! -//! * public API via to interact with the external world: -//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, -//! to be processed by the async loop -//! -//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: -//! -//! +------------------------+ +--------->-------+ -//! | | - - - (init async loop) - - - -> | | -//! | | | | -//! | | -------------------------------> | async | -//! | pageserver | (enqueue timeline sync task) | upload/download | -//! | | | loop | -//! | | <------------------------------- | | -//! | | (apply new timeline sync states) | | -//! +------------------------+ +---------<-------+ -//! | -//! | -//! CRUD layer file operations | -//! (upload/download/delete/list, etc.) | -//! V -//! +------------------------+ -//! | | -//! | [`RemoteStorage`] impl | -//! | | -//! | pageserver assumes it | -//! | owns exclusive write | -//! | access to this storage | -//! +------------------------+ -//! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. -//! The loop inits the storage connection and checks the remote files stored. -//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). -//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can -//! query their downloads later if they are accessed. -//! -//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. -//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. -//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. -//! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], -//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. 
Tenant manager applies corresponding timeline updates in pageserver's in-memory state. -//! Such submissions happen in two cases: -//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future -//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory -//! -//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. -//! -//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). -//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed -//! by the storage upload, if enabled. -//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files. -//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": -//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state -//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten -//! when the newer image is downloaded -//! -//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. -//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files. -//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. -//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. -//! -//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. -//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, -//! when a new timeline is scheduled for the download. -//! -//! NOTES: -//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage -//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API. -//! -//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast. -//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. 
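The LocalFs/S3Bucket plumbing documented above is now reached through the remote_storage crate's GenericRemoteStorage, which http/routes.rs constructs via GenericRemoteStorage::new(conf.workdir.clone(), storage_config). A minimal sketch of such a facade, assuming only the variant and constructor names visible elsewhere in this diff (the real crate may differ):

    use std::path::PathBuf;

    // Sketch only: LocalFs, S3Bucket, RemoteStorageConfig and RemoteStorageKind are
    // the types referenced in this diff; their actual definitions live in the crate.
    pub enum GenericRemoteStorage {
        Local(LocalFs),
        S3(S3Bucket),
    }

    impl GenericRemoteStorage {
        pub fn new(workdir: PathBuf, config: &RemoteStorageConfig) -> anyhow::Result<Self> {
            Ok(match &config.storage {
                RemoteStorageKind::LocalFs(root) => {
                    // Local filesystem "bucket", rooted at `root`, relative to the workdir.
                    Self::Local(LocalFs::new(root.clone(), &workdir)?)
                }
                RemoteStorageKind::AwsS3(s3_config) => {
                    // Real S3 bucket described by the [remote_storage] config section.
                    Self::S3(S3Bucket::new(s3_config, &workdir)?)
                }
            })
        }
    }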
- -mod local_fs; -mod s3_bucket; -mod storage_sync; - -use std::{ - collections::{HashMap, HashSet}, - ffi, fs, - path::{Path, PathBuf}, -}; - -use anyhow::{bail, Context}; -use tokio::io; -use tracing::{debug, error, info}; - -use self::storage_sync::TEMP_DOWNLOAD_EXTENSION; -pub use self::{ - local_fs::LocalFs, - s3_bucket::S3Bucket, - storage_sync::{ - download_index_part, - index::{IndexPart, RemoteIndex, RemoteTimeline}, - schedule_timeline_checkpoint_upload, schedule_timeline_download, - }, -}; -use crate::{ - config::{PageServerConf, RemoteStorageKind}, - layered_repository::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - }, -}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; - -/// A timeline status to share with pageserver's sync counterpart, -/// after comparing local and remote timeline state. -#[derive(Clone, Copy, Debug)] -pub enum LocalTimelineInitStatus { - /// The timeline has every remote layer present locally. - /// There could be some layers requiring uploading, - /// but this does not block the timeline from any user interaction. - LocallyComplete, - /// A timeline has some files remotely, that are not present locally and need downloading. - /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, - /// so the data needs to be downloaded first before the timeline can be used. - NeedsSync, -} - -type LocalTimelineInitStatuses = HashMap>; - -/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. -/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, -/// to simplify the received code. -pub struct SyncStartupData { - pub remote_index: RemoteIndex, - pub local_timeline_init_statuses: LocalTimelineInitStatuses, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. 
-pub fn start_local_timeline_sync( - config: &'static PageServerConf, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match &config.remote_storage_config { - Some(storage_config) => match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{}' as a remote storage", root.display()); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - LocalFs::new(root.clone(), &config.workdir)?, - storage_config.max_concurrent_timelines_sync, - storage_config.max_sync_errors, - ) - }, - RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - S3Bucket::new(s3_config, &config.workdir)?, - storage_config.max_concurrent_timelines_sync, - storage_config.max_sync_errors, - ) - }, - } - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - for (ZTenantTimelineId { tenant_id, timeline_id }, _) in - local_timeline_files - { - local_timeline_init_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); - } - Ok(SyncStartupData { - local_timeline_init_statuses, - remote_index: RemoteIndex::empty(), - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? - { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines = HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? 
{ - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -// discover timeline files and extract timeline metadata -// NOTE: ephemeral files are excluded from the list -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { - debug!("skipping ephemeral file {}", entry_path.display()); - continue; - } else if entry_path.extension().and_then(ffi::OsStr::to_str) - == Some(TEMP_DOWNLOAD_EXTENSION) - { - info!("removing temp download file at {}", entry_path.display()); - fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp download file at {}", - entry_path.display() - ) - })?; - } else { - timeline_files.insert(entry_path); - } - } - } - - // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed - // then attach is lost. There would be no retries for that, - // initial collect will fail because there is no metadata. - // We either need to start download if we see empty dir after restart or attach caller should - // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didnt appear locally. - // Check what happens with remote index in that case. - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) -} - -/// Storage (potentially remote) API to manage its state. -/// This storage tries to be unaware of any layered repository context, -/// providing basic CRUD operations for storage files. -#[async_trait::async_trait] -pub trait RemoteStorage: Send + Sync { - /// A way to uniquely reference a file in the remote storage. - type StoragePath; - - /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn storage_path(&self, local_path: &Path) -> anyhow::Result; - - /// Gets the download path of the given storage file. 
- fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result; - - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; - - /// Streams the local file contents into remote into the remote storage entry. - async fn upload( - &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - // S3 PUT request requires the content length to be specified, - // otherwise it starts to fail with the concurrent connection count increasing. - from_size_bytes: usize, - to: &Self::StoragePath, - metadata: Option, - ) -> anyhow::Result<()>; - - /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. - /// Returns the metadata, if any was stored with the file previously. - async fn download( - &self, - from: &Self::StoragePath, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; - - /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. - /// Returns the metadata, if any was stored with the file previously. - async fn download_range( - &self, - from: &Self::StoragePath, - start_inclusive: u64, - end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; - - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>; -} - -/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. -/// Immutable, cannot be changed once the file is created. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct StorageMetadata(HashMap); - -fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { - if prefix == path { - anyhow::bail!( - "Prefix and the path are equal, cannot strip: '{}'", - prefix.display() - ) - } else { - path.strip_prefix(prefix).with_context(|| { - format!( - "Path '{}' is not prefixed with '{}'", - path.display(), - prefix.display(), - ) - }) - } -} diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs deleted file mode 100644 index 2d3416cd32..0000000000 --- a/pageserver/src/remote_storage/storage_sync.rs +++ /dev/null @@ -1,1663 +0,0 @@ -//! A synchronization logic for the [`RemoteStorage`] and pageserver in-memory state to ensure correct synchronizations -//! between local tenant files and their counterparts from the remote storage. -//! -//! The synchronization does not aim to be immediate, yet eventually consistent. -//! Synchronization is done with the queue being emptied via separate thread asynchronously, -//! attempting to fully store pageserver's local data on the remote storage in a custom format, beneficial for storing. -//! -//! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of checking the queue. -//! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. -//! -//! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks (size configurable). -//! A task from the batch corresponds to a single timeline, with its files to sync merged together. -//! Every batch task and layer file in the task is processed concurrently, which is possible due to incremental nature of the timelines: -//! it's not asserted, but assumed that timeline's checkpoints only add the files locally, not removing or amending the existing ones. -//! 
Only GC removes local timeline files; GC support is not added to the sync currently,
-//! yet downloading extra files is not critically bad at this stage, GC can remove those again.
-//!
-//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines,
-//! present locally.
-//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has exclusive
-//! write access to the remote portion of the timelines that are attached to that pageserver.
-//! The index state is used to issue initial sync tasks, if needed:
-//! * all timelines with local state behind the remote get download tasks scheduled.
-//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable
-//! before up-to-date layers and the metadata file are downloaded locally.
-//! * all newer local state gets scheduled for upload, such timelines are "local" and fully operational
-//! * remote timelines not present locally are unknown to the pageserver, but can be downloaded on a separate request
-//!
-//! Then, the index is shared across the pageserver under the [`RemoteIndex`] guard to ensure proper synchronization.
-//! The remote index gets updated after every remote storage change (after an upload), same as the index part files remotely.
-//!
-//! A remote timeline contains a set of layer files, created during checkpoint(s), and the serialized [`IndexPart`] file with the timeline metadata and all remote layer paths inside.
-//! Those paths are used instead of the `S3 list` command to avoid its slowness and expensiveness for a big number of files.
-//! If the index part does not contain some file path that is present remotely, such a file is invisible to the pageserver and ignored.
-//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand, refer to [`index`] for more details.
-//!
-//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user.
-//! New sync tasks are accepted via the [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] functions,
-//! regardless of whether the corresponding loop has been started.
-//! It's up to the caller to avoid scheduling synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored.
-//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather
-//! reschedule the same task, with possibly fewer files to sync:
-//! * download tasks currently never replace an existing local file, with the metadata file as an exception
-//! (but this is subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
-//! * download tasks carry the information about skipped archives, so resubmissions do not download successfully processed layers again
-//! * downloads do not contain any actual files to download, so that pageserver code external to the sync is able to schedule a timeline download
-//! without accessing any extra information about its files.
-//!
-//! Uploads and downloads sync layer files in arbitrary order, but only after all layer files are synced are the local metadata (for download) and the remote index part (for upload) updated,
-//! to avoid having a corrupt state without the relevant layer files.
-//! Refer to [`upload`] and [`download`] for more details.
-//!
-//! 
Synchronization never removes any local files from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes). -//! NOTE: No real contents or checksum check happens right now and is a subject to improve later. -//! -//! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed. -//! -//! When pageserver signals shutdown, current sync task gets finished and the loop exists. - -mod download; -pub mod index; -mod upload; - -use std::{ - borrow::Cow, - collections::{HashMap, HashSet, VecDeque}, - ffi::OsStr, - fmt::Debug, - num::{NonZeroU32, NonZeroUsize}, - ops::ControlFlow, - path::{Path, PathBuf}, - sync::Arc, -}; - -use anyhow::Context; -use futures::stream::{FuturesUnordered, StreamExt}; -use lazy_static::lazy_static; -use tokio::{ - fs, - runtime::Runtime, - sync::mpsc::{self, UnboundedReceiver}, - time::{Duration, Instant}, -}; -use tracing::*; - -use self::{ - download::{download_timeline_layers, DownloadedTimeline}, - index::{IndexPart, RemoteIndex, RemoteTimeline, RemoteTimelineIndex}, - upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, -}; -use super::{LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData}; -use crate::{ - config::PageServerConf, - layered_repository::{ - metadata::{metadata_path, TimelineMetadata}, - LayeredRepository, - }, - repository::TimelineSyncStatusUpdate, - tenant_mgr::apply_timeline_sync_status_updates, - thread_mgr, - thread_mgr::ThreadKind, -}; - -use metrics::{ - register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, - IntGauge, -}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; - -pub use self::download::download_index_part; -pub use self::download::TEMP_DOWNLOAD_EXTENSION; - -lazy_static! { - static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( - "pageserver_remote_storage_remaining_sync_items", - "Number of storage sync items left in the queue" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge"); - static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( - "pageserver_remote_storage_fatal_task_failures", - "Number of critically failed tasks" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge"); - static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( - "pageserver_remote_storage_image_sync_time", - "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by `operation_kind` (upload|download) and `status` (success|failure)", - &["operation_kind", "status"], - vec![ - 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0 - ] - ) - .expect("failed to register pageserver image sync time histogram vec"); -} - -/// Wraps mpsc channel bits around into a queue interface. -/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. 
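The doc comment above motivates the channel-backed queue defined next. Stripped of the pageserver types, the idea reduces to an unbounded mpsc channel plus an atomic counter, so the loop can block while the queue is empty yet still report its length. A condensed, self-contained sketch (assuming the `tokio` crate with the `rt`, `sync` and `macros` features; the `String` task type is a placeholder):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use tokio::sync::mpsc;

/// Tracks how many tasks are currently queued; the channel itself has no cheap `len()`.
static LENGTH: AtomicUsize = AtomicUsize::new(0);

fn push(sender: &mpsc::UnboundedSender<String>, task: String) -> bool {
    match sender.send(task) {
        Ok(()) => {
            LENGTH.fetch_add(1, Ordering::Relaxed);
            true
        }
        // The receiver half is gone: the task is dropped and the caller is told so.
        Err(_) => false,
    }
}

async fn next_task(receiver: &mut mpsc::UnboundedReceiver<String>) -> Option<String> {
    // `recv().await` parks the loop while the queue is empty instead of spinning.
    let task = receiver.recv().await;
    if task.is_some() {
        LENGTH.fetch_sub(1, Ordering::Relaxed);
    }
    task
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();
    push(&tx, "upload layer 000000..0001".to_string());
    drop(tx); // closing the sender lets the loop below terminate
    while let Some(task) = next_task(&mut rx).await {
        println!("processing: {task} (remaining: {})", LENGTH.load(Ordering::Relaxed));
    }
}
```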
-mod sync_queue { - use std::{ - collections::{hash_map, HashMap, HashSet}, - num::NonZeroUsize, - ops::ControlFlow, - sync::atomic::{AtomicUsize, Ordering}, - }; - - use anyhow::anyhow; - use once_cell::sync::OnceCell; - use tokio::sync::mpsc::{error::TryRecvError, UnboundedReceiver, UnboundedSender}; - use tracing::{debug, warn}; - - use super::SyncTask; - use utils::zid::ZTenantTimelineId; - - static SENDER: OnceCell> = OnceCell::new(); - static LENGTH: AtomicUsize = AtomicUsize::new(0); - - /// Initializes the queue with the given sender channel that is used to put the tasks into later. - /// Errors if called more than once. - pub fn init(sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>) -> anyhow::Result<()> { - SENDER - .set(sender) - .map_err(|_sender| anyhow!("sync queue was already initialized"))?; - Ok(()) - } - - /// Adds a new task to the queue, if the queue was initialized, returning `true` on success. - /// On any error, or if the queue was not initialized, the task gets dropped (not scheduled) and `false` is returned. - pub fn push(sync_id: ZTenantTimelineId, new_task: SyncTask) -> bool { - if let Some(sender) = SENDER.get() { - match sender.send((sync_id, new_task)) { - Err(e) => { - warn!("Failed to enqueue a sync task: the receiver is dropped: {e}"); - false - } - Ok(()) => { - LENGTH.fetch_add(1, Ordering::Relaxed); - true - } - } - } else { - warn!("Failed to enqueue a sync task: the sender is not initialized"); - false - } - } - - /// Polls a new task from the queue, using its receiver counterpart. - /// Does not block if the queue is empty, returning [`None`] instead. - /// Needed to correctly track the queue length. - async fn next_task( - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ) -> Option<(ZTenantTimelineId, SyncTask)> { - let task = receiver.recv().await; - if task.is_some() { - LENGTH.fetch_sub(1, Ordering::Relaxed); - } - task - } - - /// Fetches a task batch, not bigger than the given limit. - /// Not blocking, can return fewer tasks if the queue does not contain enough. - /// Batch tasks are split by timelines, with all related tasks merged into one (download/upload) - /// or two (download and upload, if both were found in the queue during batch construction). 
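Before the full implementation that follows, here is the batching idea in miniature: block for one task, then opportunistically drain more, folding tasks that target the same timeline into a single entry. A synchronous sketch using `std::sync::mpsc` and plain `u32` keys instead of `ZTenantTimelineId`, with `Vec<String>` standing in for the merged layer sets:

```rust
use std::collections::HashMap;
use std::sync::mpsc::{channel, Receiver, TryRecvError};

/// Collect up to `max_keys` distinct keys from the queue, merging entries that
/// share a key (the real code merges download/upload tasks per timeline).
fn next_batch(rx: &Receiver<(u32, Vec<String>)>, max_keys: usize) -> HashMap<u32, Vec<String>> {
    let mut batch: HashMap<u32, Vec<String>> = HashMap::new();

    // Block for the first task only, so an empty queue does not busy-spin.
    let Ok((key, files)) = rx.recv() else { return batch };
    batch.insert(key, files);

    while batch.len() < max_keys {
        match rx.try_recv() {
            // Same key seen again: merge instead of growing the batch.
            Ok((key, files)) => batch.entry(key).or_default().extend(files),
            Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => break,
        }
    }
    batch
}

fn main() {
    let (tx, rx) = channel();
    tx.send((1, vec!["layer_a".to_string()])).unwrap();
    tx.send((1, vec!["layer_b".to_string()])).unwrap();
    tx.send((2, vec!["layer_c".to_string()])).unwrap();
    let batch = next_batch(&rx, 10);
    // Timeline 1 ends up with both of its layers in a single merged task.
    println!("{batch:?}");
}
```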
- pub async fn next_task_batch( - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - max_timelines_to_sync: NonZeroUsize, - ) -> ControlFlow<(), HashMap> { - // request the first task in blocking fashion to do less meaningless work - let (first_sync_id, first_task) = if let Some(first_task) = next_task(receiver).await { - first_task - } else { - debug!("Queue sender part was dropped, aborting"); - return ControlFlow::Break(()); - }; - - let max_timelines_to_sync = max_timelines_to_sync.get(); - let mut batched_timelines = HashSet::with_capacity(max_timelines_to_sync); - batched_timelines.insert(first_sync_id.timeline_id); - - let mut tasks = HashMap::new(); - tasks.insert(first_sync_id, first_task); - - loop { - if batched_timelines.len() >= max_timelines_to_sync { - debug!("Filled a full task batch with {max_timelines_to_sync} timeline sync operations"); - break; - } - - match receiver.try_recv() { - Ok((sync_id, new_task)) => { - LENGTH.fetch_sub(1, Ordering::Relaxed); - match tasks.entry(sync_id) { - hash_map::Entry::Occupied(o) => { - let current = o.remove(); - tasks.insert(sync_id, current.merge(new_task)); - } - hash_map::Entry::Vacant(v) => { - v.insert(new_task); - } - } - batched_timelines.insert(sync_id.timeline_id); - } - Err(TryRecvError::Disconnected) => { - debug!("Sender disconnected, batch collection aborted"); - break; - } - Err(TryRecvError::Empty) => { - debug!( - "No more data in the sync queue, task batch is not full, length: {}, max allowed size: {max_timelines_to_sync}", - batched_timelines.len() - ); - break; - } - } - } - - ControlFlow::Continue(tasks) - } - - /// Length of the queue, assuming that all receiver counterparts were only called using the queue api. - pub fn len() -> usize { - LENGTH.load(Ordering::Relaxed) - } -} - -/// A task to run in the async download/upload loop. -/// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug)] -pub enum SyncTask { - /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. - /// Not necessary more fresh than the one already uploaded. - Download(SyncData), - /// A certain amount of image files to download. - Upload(SyncData), - /// Both upload and download layers need to be synced. - DownloadAndUpload(SyncData, SyncData), -} - -/// Stores the data to synd and its retries, to evict the tasks failing to frequently. 
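`SyncData` pairs the payload with a retry counter so that tasks failing too frequently can be evicted; further down in this hunk the loop also sleeps `2^(attempt - 1)` seconds, capped at 30, before re-running a previously failed task. A small sketch of that policy (the threshold value and the `Option` return are illustrative, not the production signature):

```rust
/// Payload plus the number of times the task has already failed,
/// mirroring the `SyncData<T>` wrapper defined right below.
#[derive(Debug)]
struct SyncData<T> {
    retries: u32,
    data: T,
}

/// Roughly the retry policy used later in this hunk: give up once the error
/// threshold is exceeded, otherwise wait 2^(attempt - 1) seconds, capped at 30.
fn backoff_seconds(retries: u32, max_sync_errors: u32) -> Option<f64> {
    if retries > max_sync_errors {
        return None; // the task gets evicted instead of retried
    }
    if retries == 0 {
        return Some(0.0); // first attempt runs immediately
    }
    Some(2.0_f64.powf(retries as f64 - 1.0).min(30.0))
}

fn main() {
    let task = SyncData { retries: 4, data: "upload layer X" };
    match backoff_seconds(task.retries, 10) {
        Some(secs) => println!("retrying {:?} after {secs} seconds", task.data),
        None => println!("evicting {:?}", task.data),
    }
}
```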
-#[derive(Debug, Clone, PartialEq, Eq)] -pub struct SyncData { - retries: u32, - data: T, -} - -impl SyncData { - fn new(retries: u32, data: T) -> Self { - Self { retries, data } - } -} - -impl SyncTask { - fn download(download_task: TimelineDownload) -> Self { - Self::Download(SyncData::new(0, download_task)) - } - - fn upload(upload_task: TimelineUpload) -> Self { - Self::Upload(SyncData::new(0, upload_task)) - } - - /// Merges two tasks into one with the following rules: - /// - /// * Download + Download = Download with the retry counter reset and the layers to skip combined - /// * DownloadAndUpload + Download = DownloadAndUpload with Upload unchanged and the Download counterparts united by the same rules - /// * Upload + Upload = Upload with the retry counter reset and the layers to upload and the uploaded layers combined - /// * DownloadAndUpload + Upload = DownloadAndUpload with Download unchanged and the Upload counterparts united by the same rules - /// * Upload + Download = DownloadAndUpload with both tasks unchanged - /// * DownloadAndUpload + DownloadAndUpload = DownloadAndUpload with both parts united by the same rules - fn merge(mut self, other: Self) -> Self { - match (&mut self, other) { - ( - SyncTask::DownloadAndUpload(download_data, _) | SyncTask::Download(download_data), - SyncTask::Download(new_download_data), - ) - | ( - SyncTask::Download(download_data), - SyncTask::DownloadAndUpload(new_download_data, _), - ) => { - download_data - .data - .layers_to_skip - .extend(new_download_data.data.layers_to_skip.into_iter()); - download_data.retries = 0; - } - (SyncTask::Upload(upload), SyncTask::Download(new_download_data)) => { - self = SyncTask::DownloadAndUpload(new_download_data, upload.clone()); - } - - ( - SyncTask::DownloadAndUpload(_, upload_data) | SyncTask::Upload(upload_data), - SyncTask::Upload(new_upload_data), - ) - | (SyncTask::Upload(upload_data), SyncTask::DownloadAndUpload(_, new_upload_data)) => { - upload_data - .data - .layers_to_upload - .extend(new_upload_data.data.layers_to_upload.into_iter()); - upload_data - .data - .uploaded_layers - .extend(new_upload_data.data.uploaded_layers.into_iter()); - upload_data.retries = 0; - - if new_upload_data.data.metadata.disk_consistent_lsn() - > upload_data.data.metadata.disk_consistent_lsn() - { - upload_data.data.metadata = new_upload_data.data.metadata; - } - } - (SyncTask::Download(download), SyncTask::Upload(new_upload_data)) => { - self = SyncTask::DownloadAndUpload(download.clone(), new_upload_data) - } - - ( - SyncTask::DownloadAndUpload(download_data, upload_data), - SyncTask::DownloadAndUpload(new_download_data, new_upload_data), - ) => { - download_data - .data - .layers_to_skip - .extend(new_download_data.data.layers_to_skip.into_iter()); - download_data.retries = 0; - - upload_data - .data - .layers_to_upload - .extend(new_upload_data.data.layers_to_upload.into_iter()); - upload_data - .data - .uploaded_layers - .extend(new_upload_data.data.uploaded_layers.into_iter()); - upload_data.retries = 0; - - if new_upload_data.data.metadata.disk_consistent_lsn() - > upload_data.data.metadata.disk_consistent_lsn() - { - upload_data.data.metadata = new_upload_data.data.metadata; - } - } - } - - self - } - - fn name(&self) -> &'static str { - match self { - SyncTask::Download(_) => "download", - SyncTask::Upload(_) => "upload", - SyncTask::DownloadAndUpload(_, _) => "download and upload", - } - } - - fn retries(&self) -> u32 { - match self { - SyncTask::Download(data) => data.retries, - SyncTask::Upload(data) => 
data.retries, - SyncTask::DownloadAndUpload(download_data, upload_data) => { - download_data.retries.max(upload_data.retries) - } - } - } -} - -/// Local timeline files for upload, appeared after the new checkpoint. -/// Current checkpoint design assumes new files are added only, no deletions or amendment happens. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineUpload { - /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. - layers_to_upload: HashSet, - /// Already uploaded layers. Used to store the data about the uploads between task retries - /// and to record the data into the remote index after the task got completed or evicted. - uploaded_layers: HashSet, - metadata: TimelineMetadata, -} - -/// A timeline download task. -/// Does not contain the file list to download, to allow other -/// parts of the pageserer code to schedule the task -/// without using the remote index or any other ways to list the remote timleine files. -/// Skips the files that are already downloaded. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineDownload { - layers_to_skip: HashSet, -} - -/// Adds the new checkpoint files as an upload sync task to the queue. -/// On task failure, it gets retried again from the start a number of times. -/// -/// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_checkpoint_upload( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - new_layer: PathBuf, - metadata: TimelineMetadata, -) { - if !sync_queue::push( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - SyncTask::upload(TimelineUpload { - layers_to_upload: HashSet::from([new_layer]), - uploaded_layers: HashSet::new(), - metadata, - }), - ) { - warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}",) - } else { - debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") - } -} - -/// Requests the download of the entire timeline for a given tenant. -/// No existing local files are currently overwritten, except the metadata file (if its disk_consistent_lsn is less than the downloaded one). -/// The metadata file is always updated last, to avoid inconsistencies. -/// -/// On any failure, the task gets retried, omitting already downloaded layers. -/// -/// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { - debug!("Scheduling timeline download for tenant {tenant_id}, timeline {timeline_id}"); - sync_queue::push( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - SyncTask::download(TimelineDownload { - layers_to_skip: HashSet::new(), - }), - ); -} - -/// Uses a remote storage given to start the storage sync loop. -/// See module docs for loop step description. 
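Stripped of the remote-storage generics and the pageserver plumbing, the spawning pattern used by the function below boils down to a dedicated OS thread that owns a current-thread tokio runtime and drains the queue's receiver on it. A minimal sketch, assuming only the `tokio` crate and a placeholder `String` task type:

```rust
use tokio::sync::mpsc;

fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<String>();

    // The sync loop lives on its own thread with a current-thread runtime,
    // mirroring how the function below builds its runtime and then hands the
    // receiver to the loop.
    let handle = std::thread::spawn(move || {
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .expect("failed to build the sync runtime");
        runtime.block_on(async {
            while let Some(task) = rx.recv().await {
                println!("sync loop picked up: {task}");
            }
        });
    });

    tx.send("upload layer 000".to_string()).expect("queue closed");
    drop(tx); // closing the last sender lets the loop exit
    handle.join().expect("sync thread panicked");
}
```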
-pub(super) fn spawn_storage_sync_thread( - conf: &'static PageServerConf, - local_timeline_files: HashMap)>, - storage: S, - max_concurrent_timelines_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) -> anyhow::Result -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let (sender, receiver) = mpsc::unbounded_channel(); - sync_queue::init(sender)?; - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - - let applicable_index_parts = runtime.block_on(try_fetch_index_parts( - conf, - &storage, - local_timeline_files.keys().copied().collect(), - )); - - let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; - - let local_timeline_init_statuses = schedule_first_sync_tasks( - &mut runtime.block_on(remote_index.write()), - local_timeline_files, - ); - - let loop_index = remote_index.clone(); - thread_mgr::spawn( - ThreadKind::StorageSync, - None, - None, - "Remote storage sync thread", - false, - move || { - storage_sync_loop( - runtime, - conf, - receiver, - Arc::new(storage), - loop_index, - max_concurrent_timelines_sync, - max_sync_errors, - ); - Ok(()) - }, - ) - .context("Failed to spawn remote storage sync thread")?; - Ok(SyncStartupData { - remote_index, - local_timeline_init_statuses, - }) -} - -#[allow(clippy::too_many_arguments)] -fn storage_sync_loop( - runtime: Runtime, - conf: &'static PageServerConf, - mut receiver: UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - storage: Arc, - index: RemoteIndex, - max_concurrent_timelines_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - info!("Starting remote storage sync loop"); - loop { - let loop_index = index.clone(); - let storage = Arc::clone(&storage); - let loop_step = runtime.block_on(async { - tokio::select! { - step = loop_step( - conf, - &mut receiver, - storage, - loop_index, - max_concurrent_timelines_sync, - max_sync_errors, - ) - .instrument(info_span!("storage_sync_loop_step")) => step, - _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), - } - }); - - match loop_step { - ControlFlow::Continue(new_timeline_states) => { - if new_timeline_states.is_empty() { - debug!("Sync loop step completed, no new timeline states"); - } else { - info!( - "Sync loop step completed, {} new timeline state update(s)", - new_timeline_states.len() - ); - // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
- apply_timeline_sync_status_updates(conf, &index, new_timeline_states); - } - } - ControlFlow::Break(()) => { - info!("Shutdown requested, stopping"); - break; - } - } - } -} - -async fn loop_step( - conf: &'static PageServerConf, - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - storage: Arc, - index: RemoteIndex, - max_concurrent_timelines_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) -> ControlFlow<(), HashMap>> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let batched_tasks = - match sync_queue::next_task_batch(receiver, max_concurrent_timelines_sync).await { - ControlFlow::Continue(batch) => batch, - ControlFlow::Break(()) => return ControlFlow::Break(()), - }; - - let remaining_queue_length = sync_queue::len(); - REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); - if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); - } else { - debug!("No tasks to process"); - return ControlFlow::Continue(HashMap::new()); - } - - let mut sync_results = batched_tasks - .into_iter() - .map(|(sync_id, task)| { - let storage = Arc::clone(&storage); - let index = index.clone(); - async move { - let state_update = - process_sync_task(conf, storage, index, max_sync_errors, sync_id, task) - .instrument(info_span!("process_sync_tasks", sync_id = %sync_id)) - .await; - (sync_id, state_update) - } - }) - .collect::>(); - - let mut new_timeline_states: HashMap< - ZTenantId, - HashMap, - > = HashMap::with_capacity(max_concurrent_timelines_sync.get()); - while let Some((sync_id, state_update)) = sync_results.next().await { - debug!("Finished storage sync task for sync id {sync_id}"); - if let Some(state_update) = state_update { - new_timeline_states - .entry(sync_id.tenant_id) - .or_default() - .insert(sync_id.timeline_id, state_update); - } - } - - ControlFlow::Continue(new_timeline_states) -} - -async fn process_sync_task( - conf: &'static PageServerConf, - storage: Arc, - index: RemoteIndex, - max_sync_errors: NonZeroU32, - sync_id: ZTenantTimelineId, - task: SyncTask, -) -> Option -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let sync_start = Instant::now(); - let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; - - let task = match validate_task_retries(sync_id, task, max_sync_errors) { - ControlFlow::Continue(task) => task, - ControlFlow::Break(aborted_task) => { - match aborted_task { - SyncTask::Download(_) => { - index - .write() - .await - .set_awaits_download(&sync_id, false) - .ok(); - } - SyncTask::Upload(failed_upload_data) => { - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - &failed_upload_data.data, - true, - ) - .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); - } - } - SyncTask::DownloadAndUpload(_, failed_upload_data) => { - index - .write() - .await - .set_awaits_download(&sync_id, false) - .ok(); - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - &failed_upload_data.data, - true, - ) - .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); - } - } - } - return None; - } - }; - - let task_name = task.name(); - let current_task_attempt = task.retries(); - info!("Sync task '{task_name}' processing started, attempt #{current_task_attempt}"); - - if current_task_attempt > 0 { - let 
seconds_to_wait = 2.0_f64.powf(current_task_attempt as f64 - 1.0).min(30.0); - info!("Waiting {seconds_to_wait} seconds before starting the '{task_name}' task"); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; - } - - let status_update = match task { - SyncTask::Download(new_download_data) => { - download_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_download_data, - sync_start, - task_name, - ) - .await - } - SyncTask::Upload(new_upload_data) => { - upload_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_upload_data, - sync_start, - task_name, - ) - .await; - None - } - SyncTask::DownloadAndUpload(new_download_data, new_upload_data) => { - let status_update = download_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_download_data, - sync_start, - task_name, - ) - .await; - - upload_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_upload_data, - sync_start, - task_name, - ) - .await; - - status_update - } - }; - - info!("Finished processing the task"); - - status_update -} - -async fn download_timeline( - conf: &'static PageServerConf, - (storage, index): (&S, &RemoteIndex), - current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, - new_download_data: SyncData, - sync_start: Instant, - task_name: &str, -) -> Option -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - match download_timeline_layers( - conf, - storage, - current_remote_timeline, - sync_id, - new_download_data, - ) - .await - { - DownloadedTimeline::Abort => { - register_sync_status(sync_start, task_name, None); - if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { - error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); - } - None - } - DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); - None - } - DownloadedTimeline::Successful(mut download_data) => { - match update_local_metadata(conf, sync_id, current_remote_timeline).await { - Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { - Ok(()) => { - register_sync_status(sync_start, task_name, Some(true)); - Some(TimelineSyncStatusUpdate::Downloaded) - } - Err(e) => { - error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); - None - } - }, - Err(e) => { - error!("Failed to update local timeline metadata: {e:?}"); - download_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_start, task_name, Some(false)); - None - } - } - } - } -} - -async fn update_local_metadata( - conf: &'static PageServerConf, - sync_id: ZTenantTimelineId, - remote_timeline: Option<&RemoteTimeline>, -) -> anyhow::Result<()> { - let remote_metadata = match remote_timeline { - Some(timeline) => &timeline.metadata, - None => { - info!("No remote timeline to update local metadata from, skipping the update"); - return Ok(()); - } - }; - let remote_lsn = remote_metadata.disk_consistent_lsn(); - - let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); - let local_lsn = if local_metadata_path.exists() { - let local_metadata = read_metadata_file(&local_metadata_path) - .await - .with_context(|| { - format!( - 
"Failed to load local metadata from path '{}'", - local_metadata_path.display() - ) - })?; - - Some(local_metadata.disk_consistent_lsn()) - } else { - None - }; - - if local_lsn < Some(remote_lsn) { - info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); - // clone because spawn_blocking requires static lifetime - let cloned_metadata = remote_metadata.to_owned(); - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - tokio::task::spawn_blocking(move || { - LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) - }) - .await - .with_context(|| { - format!( - "failed to join save_metadata task for {}", - local_metadata_path.display() - ) - })? - .with_context(|| { - format!( - "Failed to write remote metadata bytes locally to path '{}'", - local_metadata_path.display() - ) - })?; - } else { - info!("Local metadata at path '{}' has later disk consistent Lsn ({local_lsn:?}) than the remote one ({remote_lsn}), skipping the update", local_metadata_path.display()); - } - - Ok(()) -} - -async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { - TimelineMetadata::from_bytes( - &fs::read(metadata_path) - .await - .context("Failed to read local metadata bytes from fs")?, - ) - .context("Failed to parse metadata bytes") -} - -async fn upload_timeline( - conf: &'static PageServerConf, - (storage, index): (&S, &RemoteIndex), - current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, - new_upload_data: SyncData, - sync_start: Instant, - task_name: &str, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let mut uploaded_data = - match upload_timeline_layers(storage, current_remote_timeline, sync_id, new_upload_data) - .await - { - UploadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); - return; - } - UploadedTimeline::Successful(upload_data) => upload_data, - UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { - let local_metadata_path = - metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); - let local_metadata = match read_metadata_file(&local_metadata_path).await { - Ok(metadata) => metadata, - Err(e) => { - error!( - "Failed to load local metadata from path '{}': {e:?}", - local_metadata_path.display() - ); - outdated_upload_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); - register_sync_status(sync_start, task_name, Some(false)); - return; - } - }; - - outdated_upload_data.data.metadata = local_metadata; - outdated_upload_data - } - }; - - match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { - Ok(()) => register_sync_status(sync_start, task_name, Some(true)), - Err(e) => { - error!("Failed to update remote timeline {sync_id}: {e:?}"); - uploaded_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_start, task_name, Some(false)); - } - } -} - -async fn update_remote_data( - conf: &'static PageServerConf, - storage: &S, - index: &RemoteIndex, - sync_id: ZTenantTimelineId, - uploaded_data: &TimelineUpload, - upload_failed: bool, -) -> anyhow::Result<()> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - info!("Updating remote index for the timeline"); - let updated_remote_timeline = { - let mut index_accessor = index.write().await; - 
- match index_accessor.timeline_entry_mut(&sync_id) { - Some(existing_entry) => { - if existing_entry.metadata.disk_consistent_lsn() - < uploaded_data.metadata.disk_consistent_lsn() - { - existing_entry.metadata = uploaded_data.metadata.clone(); - } - if upload_failed { - existing_entry - .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); - } else { - existing_entry - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); - } - existing_entry.clone() - } - None => { - let mut new_remote_timeline = RemoteTimeline::new(uploaded_data.metadata.clone()); - if upload_failed { - new_remote_timeline - .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); - } else { - new_remote_timeline - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); - } - - index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); - new_remote_timeline - } - } - }; - - let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - let new_index_part = - IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) - .context("Failed to create an index part from the updated remote timeline")?; - - info!("Uploading remote data for the timeline"); - upload_index_part(conf, storage, sync_id, new_index_part) - .await - .context("Failed to upload new index part") -} - -fn validate_task_retries( - sync_id: ZTenantTimelineId, - task: SyncTask, - max_sync_errors: NonZeroU32, -) -> ControlFlow { - let max_sync_errors = max_sync_errors.get(); - let mut skip_upload = false; - let mut skip_download = false; - - match &task { - SyncTask::Download(download_data) | SyncTask::DownloadAndUpload(download_data, _) - if download_data.retries > max_sync_errors => - { - error!( - "Evicting download task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", - download_data.retries - ); - skip_download = true; - } - SyncTask::Upload(upload_data) | SyncTask::DownloadAndUpload(_, upload_data) - if upload_data.retries > max_sync_errors => - { - error!( - "Evicting upload task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", - upload_data.retries, - ); - skip_upload = true; - } - _ => {} - } - - match task { - aborted_task @ SyncTask::Download(_) if skip_download => ControlFlow::Break(aborted_task), - aborted_task @ SyncTask::Upload(_) if skip_upload => ControlFlow::Break(aborted_task), - aborted_task @ SyncTask::DownloadAndUpload(_, _) if skip_upload && skip_download => { - ControlFlow::Break(aborted_task) - } - SyncTask::DownloadAndUpload(download_task, _) if skip_upload => { - ControlFlow::Continue(SyncTask::Download(download_task)) - } - SyncTask::DownloadAndUpload(_, upload_task) if skip_download => { - ControlFlow::Continue(SyncTask::Upload(upload_task)) - } - not_skipped => ControlFlow::Continue(not_skipped), - } -} - -async fn try_fetch_index_parts( - conf: &'static PageServerConf, - storage: &S, - keys: HashSet, -) -> HashMap -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let mut index_parts = HashMap::with_capacity(keys.len()); - - let mut part_downloads = keys - .into_iter() - .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) - .collect::>(); - - while let Some((id, part_upload_result)) = part_downloads.next().await { - match part_upload_result { - Ok(index_part) => { - debug!("Successfully fetched index part for {id}"); - index_parts.insert(id, index_part); - } - Err(e) => 
warn!("Failed to fetch index part for {id}: {e}"), - } - } - - index_parts -} - -fn schedule_first_sync_tasks( - index: &mut RemoteTimelineIndex, - local_timeline_files: HashMap)>, -) -> LocalTimelineInitStatuses { - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - - let mut new_sync_tasks = - VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); - - for (sync_id, (local_metadata, local_files)) in local_timeline_files { - match index.timeline_entry_mut(&sync_id) { - Some(remote_timeline) => { - let (timeline_status, awaits_download) = compare_local_and_remote_timeline( - &mut new_sync_tasks, - sync_id, - local_metadata, - local_files, - remote_timeline, - ); - let was_there = local_timeline_init_statuses - .entry(sync_id.tenant_id) - .or_default() - .insert(sync_id.timeline_id, timeline_status); - - if was_there.is_some() { - // defensive check - warn!( - "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", - sync_id.timeline_id - ); - } - remote_timeline.awaits_download = awaits_download; - } - None => { - // TODO (rodionov) does this mean that we've crashed during tenant creation? - // is it safe to upload this checkpoint? could it be half broken? - new_sync_tasks.push_back(( - sync_id, - SyncTask::upload(TimelineUpload { - layers_to_upload: local_files, - uploaded_layers: HashSet::new(), - metadata: local_metadata, - }), - )); - local_timeline_init_statuses - .entry(sync_id.tenant_id) - .or_default() - .insert( - sync_id.timeline_id, - LocalTimelineInitStatus::LocallyComplete, - ); - } - } - } - - new_sync_tasks.into_iter().for_each(|(sync_id, task)| { - sync_queue::push(sync_id, task); - }); - local_timeline_init_statuses -} - -fn compare_local_and_remote_timeline( - new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, - sync_id: ZTenantTimelineId, - local_metadata: TimelineMetadata, - local_files: HashSet, - remote_entry: &RemoteTimeline, -) -> (LocalTimelineInitStatus, bool) { - let remote_files = remote_entry.stored_files(); - - // TODO probably here we need more sophisticated logic, - // if more data is available remotely can we just download whats there? - // without trying to upload something. It may be tricky, needs further investigation. - // For now looks strange that we can request upload - // and dowload for the same timeline simultaneously. - // (upload needs to be only for previously unsynced files, not whole timeline dir). - // If one of the tasks fails they will be reordered in the queue which can lead - // to timeline being stuck in evicted state - let number_of_layers_to_download = remote_files.difference(&local_files).count(); - let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { - new_sync_tasks.push_back(( - sync_id, - SyncTask::download(TimelineDownload { - layers_to_skip: local_files.clone(), - }), - )); - (LocalTimelineInitStatus::NeedsSync, true) - // we do not need to manupulate with remote consistent lsn here - // because it will be updated when sync will be completed - } else { - (LocalTimelineInitStatus::LocallyComplete, false) - }; - - let layers_to_upload = local_files - .difference(remote_files) - .cloned() - .collect::>(); - if !layers_to_upload.is_empty() { - new_sync_tasks.push_back(( - sync_id, - SyncTask::upload(TimelineUpload { - layers_to_upload, - uploaded_layers: HashSet::new(), - metadata: local_metadata, - }), - )); - // Note that status here doesn't change. 
- } - - (initial_timeline_status, awaits_download) -} - -fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { - let secs_elapsed = sync_start.elapsed().as_secs_f64(); - info!("Processed a sync task in {secs_elapsed:.2} seconds"); - match sync_status { - Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), - Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), - None => return, - } - .observe(secs_elapsed) -} - -pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - -#[cfg(test)] -mod test_utils { - use utils::lsn::Lsn; - - use crate::repository::repo_harness::RepoHarness; - - use super::*; - - pub async fn create_local_timeline( - harness: &RepoHarness<'_>, - timeline_id: ZTimelineId, - filenames: &[&str], - metadata: TimelineMetadata, - ) -> anyhow::Result { - let timeline_path = harness.timeline_path(&timeline_id); - fs::create_dir_all(&timeline_path).await?; - - let mut layers_to_upload = HashSet::with_capacity(filenames.len()); - for &file in filenames { - let file_path = timeline_path.join(file); - fs::write(&file_path, dummy_contents(file).into_bytes()).await?; - layers_to_upload.insert(file_path); - } - - fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), - metadata.to_bytes()?, - ) - .await?; - - Ok(TimelineUpload { - layers_to_upload, - uploaded_layers: HashSet::new(), - metadata, - }) - } - - pub fn dummy_contents(name: &str) -> String { - format!("contents for {name}") - } - - pub fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) - } -} - -#[cfg(test)] -mod tests { - use std::collections::BTreeSet; - - use super::{test_utils::dummy_metadata, *}; - use utils::lsn::Lsn; - - #[test] - fn download_sync_tasks_merge() { - let download_1 = SyncTask::Download(SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - )); - let download_2 = SyncTask::Download(SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - )); - - let merged_download = match download_1.merge(download_2) { - SyncTask::Download(merged_download) => merged_download, - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - } - - #[test] - fn upload_sync_tasks_merge() { - let metadata_1 = dummy_metadata(Lsn(1)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); - - let upload_1 = SyncTask::Upload(SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1, - }, - )); - let upload_2 = SyncTask::Upload(SyncData::new( - 6, 
- TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2.clone(), - }, - )); - - let merged_upload = match upload_1.merge(upload_2) { - SyncTask::Upload(merged_upload) => merged_upload, - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - - let upload = merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, metadata_2, - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } - - #[test] - fn upload_and_download_sync_tasks_merge() { - let download_data = SyncData::new( - 3, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("d_one")]), - }, - ); - - let upload_data = SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("u_one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one_2")]), - metadata: dummy_metadata(Lsn(1)), - }, - ); - - let (merged_download, merged_upload) = match SyncTask::Download(download_data.clone()) - .merge(SyncTask::Upload(upload_data.clone())) - { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download, download_data, - "When upload and dowload are merged, both should be unchanged" - ); - assert_eq!( - merged_upload, upload_data, - "When upload and dowload are merged, both should be unchanged" - ); - } - - #[test] - fn uploaddownload_and_upload_sync_tasks_merge() { - let download_data = SyncData::new( - 3, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("d_one")]), - }, - ); - - let metadata_1 = dummy_metadata(Lsn(5)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_1.disk_consistent_lsn() > metadata_2.disk_consistent_lsn()); - - let upload_download = SyncTask::DownloadAndUpload( - download_data.clone(), - SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1.clone(), - }, - ), - ); - - let new_upload = SyncTask::Upload(SyncData::new( - 6, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2, - }, - )); - - let (merged_download, merged_upload) = match upload_download.merge(new_upload) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download, download_data, - "When uploaddowload and upload tasks are merged, download should be unchanged" - ); - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - let upload = 
merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, metadata_1, - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } - - #[test] - fn uploaddownload_and_download_sync_tasks_merge() { - let upload_data = SyncData::new( - 22, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: dummy_metadata(Lsn(22)), - }, - ); - - let upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - ), - upload_data.clone(), - ); - - let new_download = SyncTask::Download(SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - )); - - let (merged_download, merged_upload) = match upload_download.merge(new_download) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_upload, upload_data, - "When uploaddowload and download tasks are merged, upload should be unchanged" - ); - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - } - - #[test] - fn uploaddownload_sync_tasks_merge() { - let metadata_1 = dummy_metadata(Lsn(1)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); - - let upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - ), - SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1, - }, - ), - ); - let new_upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - ), - SyncData::new( - 6, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2.clone(), - }, - ), - ); - - let (merged_download, merged_upload) = match upload_download.merge(new_upload_download) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - 
PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - let upload = merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, metadata_2, - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } - - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); - } -} diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs new file mode 100644 index 0000000000..00e7c85e35 --- /dev/null +++ b/pageserver/src/remote_storage/storage_sync/delete.rs @@ -0,0 +1,223 @@ +//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tracing::{debug, error, info}; +use utils::zid::ZTenantTimelineId; + +use crate::remote_storage::{ + storage_sync::{SyncQueue, SyncTask}, + RemoteStorage, +}; + +use super::{LayersDeletion, SyncData}; + +/// Attempts to remove the timleline layers from the remote storage. +/// If the task had not adjusted the metadata before, the deletion will fail. 
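The function that follows fans the per-layer deletions out concurrently and keeps whatever failed, so the task can be re-enqueued with only the leftover layers. A self-contained miniature of that pattern, assuming the `futures` and `tokio` crates; the `delete_remote` stub and the sample paths are invented purely for illustration:

```rust
use std::collections::HashSet;
use std::path::{Path, PathBuf};

use futures::stream::{FuturesUnordered, StreamExt};

/// Pretend remote deletion; the real code goes through `RemoteStorage::delete`.
async fn delete_remote(path: &Path) -> Result<(), String> {
    if path.ends_with("missing") {
        Err(format!("object not found: {}", path.display()))
    } else {
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    let layers_to_delete = HashSet::from([
        PathBuf::from("/tenant/timeline/layer_a"),
        PathBuf::from("/tenant/timeline/missing"),
    ]);

    // Issue all deletions concurrently and poll them as they complete.
    let mut tasks = layers_to_delete
        .iter()
        .map(|path| async move { (path.clone(), delete_remote(path).await) })
        .collect::<FuturesUnordered<_>>();

    let mut deleted = HashSet::new();
    let mut failed = HashSet::new();
    while let Some((path, result)) = tasks.next().await {
        match result {
            Ok(()) => {
                deleted.insert(path);
            }
            // Failed paths are kept so the whole task can be retried with only
            // the remaining layers, which is what the function below does.
            Err(e) => {
                eprintln!("{e}");
                failed.insert(path);
            }
        }
    }
    println!("deleted: {deleted:?}, to retry: {failed:?}");
}
```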
+pub(super) async fn delete_timeline_layers<'a, P, S>( + storage: &'a S, + sync_queue: &SyncQueue, + sync_id: ZTenantTimelineId, + mut delete_data: SyncData, +) -> bool +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + if !delete_data.data.deletion_registered { + error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + return false; + } + + if delete_data.data.layers_to_delete.is_empty() { + info!("No layers to delete, skipping"); + return true; + } + + let layers_to_delete = delete_data + .data + .layers_to_delete + .drain() + .collect::>(); + debug!("Layers to delete: {layers_to_delete:?}"); + info!("Deleting {} timeline layers", layers_to_delete.len()); + + let mut delete_tasks = layers_to_delete + .into_iter() + .map(|local_layer_path| async { + let storage_path = match storage.storage_path(&local_layer_path).with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + }) { + Ok(path) => path, + Err(e) => return Err((e, local_layer_path)), + }; + + match storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) { + Ok(()) => Ok(local_layer_path), + Err(e) => Err((e, local_layer_path)), + } + }) + .collect::>(); + + let mut errored = false; + while let Some(deletion_result) = delete_tasks.next().await { + match deletion_result { + Ok(local_layer_path) => { + debug!( + "Successfully deleted layer {} for timeline {sync_id}", + local_layer_path.display() + ); + delete_data.data.deleted_layers.insert(local_layer_path); + } + Err((e, local_layer_path)) => { + errored = true; + error!( + "Failed to delete layer {} for timeline {sync_id}: {e:?}", + local_layer_path.display() + ); + delete_data.data.layers_to_delete.insert(local_layer_path); + } + } + } + + if errored { + debug!("Reenqueuing failed delete task for timeline {sync_id}"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } + errored +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, num::NonZeroUsize}; + + use itertools::Itertools; + use tempfile::tempdir; + use tokio::fs; + use utils::lsn::Lsn; + + use crate::{ + remote_storage::{ + storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + LocalFs, + }, + repository::repo_harness::{RepoHarness, TIMELINE_ID}, + }; + + use super::*; + + #[tokio::test] + async fn delete_timeline_negative() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline_negative")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: 1, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::new(), + deletion_registered: false, + }, + }, + ) + .await; + + assert!( + !deleted, + "Should not start the deletion for task with delete metadata unregistered" + ); + + Ok(()) + } + + #[tokio::test] + async fn delete_timeline() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let 
sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "c", "d"]; + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + for local_path in timeline_upload.layers_to_upload { + let remote_path = storage.storage_path(&local_path)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + layer_files + .iter() + .map(|layer_str| layer_str.to_string()) + .sorted() + .collect::>(), + "Expect to have all layer files remotely before deletion" + ); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: current_retries, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::from([ + local_timeline_path.join("a"), + local_timeline_path.join("c"), + local_timeline_path.join("something_different"), + ]), + deletion_registered: true, + }, + }, + ) + .await; + assert!(deleted, "Should be able to delete timeline files"); + + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + vec!["b".to_string(), "d".to_string()], + "Expect to have only non-deleted files remotely" + ); + + Ok(()) + } +} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5044f2bfc5..d25dc8914d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,5 +1,5 @@ use crate::layered_repository::metadata::TimelineMetadata; -use crate::remote_storage::RemoteIndex; +use crate::storage_sync::index::RemoteIndex; use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; use anyhow::{bail, Result}; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs new file mode 100644 index 0000000000..b8c6f7fdab --- /dev/null +++ b/pageserver/src/storage_sync.rs @@ -0,0 +1,1864 @@ +//! There are a few components the storage machinery consists of: +//! +//! * [`RemoteStorage`] that is used to interact with an arbitrary external storage +//! +//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. +//! Synchronization internals are split into submodules +//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files +//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively +//! +//! * public API via to interact with the external world: +//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization +//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, +//! to be processed by the async loop +//! +//! 
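+//! As a rough, hypothetical sketch of how that public API is meant to be driven (simplified, with error handling
+//! and the real call sites omitted; the `schedule_*` names match the functions defined later in this module):
+//!
+//! ```rust,ignore
+//! fn init_storage_sync(conf: &'static PageServerConf) -> anyhow::Result<()> {
+//!     // Start the background sync loop (or get an empty index if no remote storage is configured).
+//!     let SyncStartupData {
+//!         remote_index: _remote_index,
+//!         local_timeline_init_statuses,
+//!     } = start_local_timeline_sync(conf)?;
+//!
+//!     for (tenant_id, timelines) in local_timeline_init_statuses {
+//!         for (timeline_id, status) in timelines {
+//!             // Timelines lagging behind their remote state are reported as `NeedsSync` and
+//!             // should not be used until their download completes.
+//!             if matches!(status, LocalTimelineInitStatus::NeedsSync) {
+//!                 tracing::info!("timeline {timeline_id} of tenant {tenant_id} awaits download");
+//!             }
+//!         }
+//!     }
+//!     Ok(())
+//! }
+//! ```
+//!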
Here's a schematic overview of all interactions backup and the rest of the pageserver perform: +//! +//! +------------------------+ +--------->-------+ +//! | | - - - (init async loop) - - - -> | | +//! | | | | +//! | | -------------------------------> | async | +//! | pageserver | (enqueue timeline sync task) | upload/download | +//! | | | loop | +//! | | <------------------------------- | | +//! | | (apply new timeline sync states) | | +//! +------------------------+ +---------<-------+ +//! | +//! | +//! CRUD layer file operations | +//! (upload/download/delete/list, etc.) | +//! V +//! +------------------------+ +//! | | +//! | [`RemoteStorage`] impl | +//! | | +//! | pageserver assumes it | +//! | owns exclusive write | +//! | access to this storage | +//! +------------------------+ +//! +//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! The loop inits the storage connection and checks the remote files stored. +//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). +//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can +//! query their downloads later if they are accessed. +//! +//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. +//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. +//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). +//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. +//! +//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], +//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. +//! Such submissions happen in two cases: +//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future +//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory +//! +//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. +//! +//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). +//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed +//! by the storage upload, if enabled. +//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files. +//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": +//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state +//! 
* no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded; the local metadata file will be overwritten
+//! when a newer image is downloaded
+//!
+//! Pageserver maintains a remote file structure similar to the local one: all layer files are uploaded with the same names under the same directory structure.
+//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
+//! This file gets read to populate the cache if the remote timeline data is missing from it, and gets updated after every successful download.
+//! This way, we optimize S3 storage access by not running the `S3 list` command, which could be expensive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
+//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files.
+//!
+//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
+//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and is loaded on demand only,
+//! when a new timeline is scheduled for download.
+//!
+//! NOTES:
+//! * pageserver assumes it has exclusive write access to the remote storage. Multiple pageservers can, if supported, be separated within the same storage
+//! (e.g. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
+//!
+//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
+//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
+//!
+//! The synchronization logic for the [`RemoteStorage`] and the pageserver in-memory state ensures correct synchronization
+//! between local tenant files and their counterparts in the remote storage.
+//!
+//! The synchronization does not aim to be immediate, but rather eventually consistent.
+//! Synchronization is done by emptying the queue on a separate thread, asynchronously,
+//! attempting to fully store pageserver's local data on the remote storage in a format beneficial for storing.
+//!
+//! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of polling the queue.
+//! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`]s for local file upload/download operations.
+//!
+//! The queue gets emptied by a single thread running the loop, which polls the tasks in batches of deduplicated tasks.
+//! A task from the batch corresponds to a single timeline, with its files to sync merged together: given that only one sync loop step is active at a time,
+//! timeline uploads and downloads can happen concurrently, in no particular order, due to the incremental nature of the timeline layers.
+//! Deletion happens only after a successful upload, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors.
+//! Upload and download update the remote data (in-memory index and S3 JSON index part file) only after every layer is successfully synchronized, while the deletion task
+//! works the other way around: it requires the remote data to be updated first, successfully; blob files become invisible to pageserver this way.
+//!
+//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines
+//! present locally.
+//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has exclusive
+//! write access to the remote portion of the timelines that are attached to the pageserver.
+//! The index state is used to issue initial sync tasks, if needed:
+//! * all timelines with local state behind the remote get download tasks scheduled.
+//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable
+//! until up-to-date layers and the metadata file are downloaded locally.
+//! * all newer local state gets scheduled for upload; such timelines are "local" and fully operational
+//! * remote timelines not present locally are unknown to pageserver, but can be downloaded on a separate request
+//!
+//! Then the index is shared across pageserver under the [`RemoteIndex`] guard to ensure proper synchronization.
+//! The remote index gets updated after every remote storage change (i.e. after an upload), same as the index part files remotely.
+//!
+//! A remote timeline contains a set of layer files, created during checkpoint(s), and the serialized [`IndexPart`] file with the timeline metadata and all remote layer paths inside.
+//! Those paths are used instead of the `S3 list` command to avoid its slowness and expense for a big amount of files.
+//! If the index part does not contain some file path but it's present remotely, such a file is invisible to pageserver and ignored.
+//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand; refer to [`index`] for more details.
+//!
+//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user.
+//! New sync tasks are accepted via the [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] functions,
+//! regardless of whether the corresponding loop has been started.
+//! It's up to the caller to avoid scheduling synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored.
+//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather
+//! reschedule the same task, with possibly fewer files to sync:
+//! * download tasks currently never replace existing local files, with the metadata file as an exception
+//! (but this is subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
+//! * download tasks carry the information about skipped archives, so resubmissions do not download successfully processed layers again
+//! * download tasks do not contain any actual files to download, so that "external", sync pageserver code is able to schedule a timeline download
+//! without accessing any extra information about its files.
+//!
+//! Uploads and downloads sync layer files in arbitrary order, but only after all layer files are synced are the local metadata (for download) and the remote index part (for upload) updated,
+//!
to avoid having a corrupt state without the relevant layer files. +//! Refer to [`upload`] and [`download`] for more details. +//! +//! Synchronization never removes any local files from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes). +//! NOTE: No real contents or checksum check happens right now and is a subject to improve later. +//! +//! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed. +//! +//! When pageserver signals shutdown, current sync task gets finished and the loop exists. + +mod delete; +mod download; +pub mod index; +mod upload; + +use std::{ + collections::{hash_map, HashMap, HashSet, VecDeque}, + ffi::OsStr, + fmt::Debug, + num::{NonZeroU32, NonZeroUsize}, + ops::ControlFlow, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use anyhow::{anyhow, bail, Context}; +use futures::stream::{FuturesUnordered, StreamExt}; +use lazy_static::lazy_static; +use once_cell::sync::OnceCell; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use tokio::{ + fs, + runtime::Runtime, + sync::mpsc::{self, error::TryRecvError, UnboundedReceiver, UnboundedSender}, + time::{Duration, Instant}, +}; +use tracing::*; + +use self::{ + delete::delete_timeline_layers, + download::{download_timeline_layers, DownloadedTimeline}, + index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, + upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, +}; +use crate::{ + config::PageServerConf, + layered_repository::{ + ephemeral_file::is_ephemeral_file, + metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + LayeredRepository, + }, + repository::TimelineSyncStatusUpdate, + storage_sync::{self, index::RemoteIndex}, + tenant_mgr::apply_timeline_sync_status_updates, + thread_mgr, + thread_mgr::ThreadKind, +}; + +use metrics::{ + register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, + IntGauge, +}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; + +pub use self::download::download_index_part; +pub use self::download::TEMP_DOWNLOAD_EXTENSION; + +lazy_static! { + static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( + "pageserver_remote_storage_remaining_sync_items", + "Number of storage sync items left in the queue" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge"); + static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( + "pageserver_remote_storage_fatal_task_failures", + "Number of critically failed tasks" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge"); + static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( + "pageserver_remote_storage_image_sync_time", + "Time took to synchronize (download or upload) a whole pageserver image. 
\ + Grouped by `operation_kind` (upload|download) and `status` (success|failure)", + &["operation_kind", "status"], + vec![ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0 + ] + ) + .expect("failed to register pageserver image sync time histogram vec"); +} + +static SYNC_QUEUE: OnceCell = OnceCell::new(); + +/// A timeline status to share with pageserver's sync counterpart, +/// after comparing local and remote timeline state. +#[derive(Clone, Copy, Debug)] +pub enum LocalTimelineInitStatus { + /// The timeline has every remote layer present locally. + /// There could be some layers requiring uploading, + /// but this does not block the timeline from any user interaction. + LocallyComplete, + /// A timeline has some files remotely, that are not present locally and need downloading. + /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, + /// so the data needs to be downloaded first before the timeline can be used. + NeedsSync, +} + +type LocalTimelineInitStatuses = HashMap>; + +/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. +/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, +/// to simplify the received code. +pub struct SyncStartupData { + pub remote_index: RemoteIndex, + pub local_timeline_init_statuses: LocalTimelineInitStatuses, +} + +/// Based on the config, initiates the remote storage connection and starts a separate thread +/// that ensures that pageserver and the remote storage are in sync with each other. +/// If no external configuration connection given, no thread or storage initialization is done. +/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. +pub fn start_local_timeline_sync( + config: &'static PageServerConf, +) -> anyhow::Result { + let local_timeline_files = local_tenant_timeline_files(config) + .context("Failed to collect local tenant timeline files")?; + + match config.remote_storage_config.as_ref() { + Some(storage_config) => { + match GenericRemoteStorage::new(config.workdir.clone(), storage_config) + .context("Failed to init the generic remote storage")? 
+ { + GenericRemoteStorage::Local(local_fs_storage) => { + storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + local_fs_storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + } + GenericRemoteStorage::S3(s3_bucket_storage) => { + storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + s3_bucket_storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + } + } + .context("Failed to spawn the storage sync thread") + } + None => { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); + for ( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + _, + ) in local_timeline_files + { + local_timeline_init_statuses + .entry(tenant_id) + .or_default() + .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); + } + Ok(SyncStartupData { + local_timeline_init_statuses, + remote_index: RemoteIndex::empty(), + }) + } + } +} + +fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result)>> { + let mut local_tenant_timeline_files = HashMap::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { + Ok(collected_files) => { + local_tenant_timeline_files.extend(collected_files.into_iter()) + } + Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + Ok(local_tenant_timeline_files) +} + +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result)>> { + let mut timelines = HashMap::new(); + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| { + format!( + "Failed to list timelines dir entry for tenant {}", + tenant_id + ) + })? 
{ + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_path = timelines_dir_entry.path(); + match collect_timeline_files(&timeline_path) { + Ok((timeline_id, metadata, timeline_files)) => { + timelines.insert( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + (metadata, timeline_files), + ); + } + Err(e) => error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_path.display(), + e + ), + } + } + Err(e) => error!( + "Failed to list timelines for entry tenant {}, reason: {:?}", + tenant_id, e + ), + } + } + + Ok(timelines) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); + let mut timeline_metadata_path = None; + + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse timeline id out of the timeline dir name")?; + let timeline_dir_entries = + std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + if entry_path.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if entry_path.extension().and_then(OsStr::to_str) + == Some(TEMP_DOWNLOAD_EXTENSION) + { + info!("removing temp download file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + timeline_files.insert(entry_path); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timelinne didnt appear locally. + // Check what happens with remote index in that case. + let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + Ok((timeline_id, metadata, timeline_files)) +} + +/// Wraps mpsc channel bits around into a queue interface. +/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. 
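+///
+/// A minimal, self-contained sketch of that channel-as-queue pattern (not the actual implementation below):
+/// block on `recv` for the first task, then drain whatever else is already queued with `try_recv`.
+///
+/// ```rust,ignore
+/// use tokio::sync::mpsc::{self, error::TryRecvError};
+///
+/// #[tokio::main]
+/// async fn main() {
+///     let (tx, mut rx) = mpsc::unbounded_channel::<u32>();
+///     tx.send(1).unwrap();
+///     tx.send(2).unwrap();
+///
+///     // Blocks (asynchronously) until at least one task is present, so the loop does not spin...
+///     let mut batch = vec![rx.recv().await.unwrap()];
+///
+///     // ...then collects everything else that is already queued, without waiting.
+///     loop {
+///         match rx.try_recv() {
+///             Ok(task) => batch.push(task),
+///             Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => break,
+///         }
+///     }
+///     assert_eq!(batch, vec![1, 2]);
+/// }
+/// ```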
+struct SyncQueue { + len: AtomicUsize, + max_timelines_per_batch: NonZeroUsize, + sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>, +} + +impl SyncQueue { + fn new( + max_timelines_per_batch: NonZeroUsize, + ) -> (Self, UnboundedReceiver<(ZTenantTimelineId, SyncTask)>) { + let (sender, receiver) = mpsc::unbounded_channel(); + ( + Self { + len: AtomicUsize::new(0), + max_timelines_per_batch, + sender, + }, + receiver, + ) + } + + fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { + match self.sender.send((sync_id, new_task)) { + Ok(()) => { + self.len.fetch_add(1, Ordering::Relaxed); + } + Err(e) => { + error!("failed to push sync task to queue: {e}"); + } + } + } + + /// Fetches a task batch, getting every existing entry from the queue, grouping by timelines and merging the tasks for every timeline. + /// A timeline has to care to not to delete cetain layers from the remote storage before the corresponding uploads happen. + /// Otherwise, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. + /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). + async fn next_task_batch( + &self, + // The queue is based on two ends of a channel and has to be accessible statically without blocking for submissions from the sync code. + // Its receiver needs &mut, so we cannot place it in the same container with the other end and get both static and non-blocking access. + // Hence toss this around to use it from the sync loop directly as &mut. + sync_queue_receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ) -> HashMap { + // request the first task in blocking fashion to do less meaningless work + let (first_sync_id, first_task) = if let Some(first_task) = sync_queue_receiver.recv().await + { + self.len.fetch_sub(1, Ordering::Relaxed); + first_task + } else { + info!("Queue sender part was dropped, aborting"); + return HashMap::new(); + }; + let mut timelines_left_to_batch = self.max_timelines_per_batch.get() - 1; + let mut tasks_to_process = self.len(); + + let mut batches = HashMap::with_capacity(tasks_to_process); + batches.insert(first_sync_id, SyncTaskBatch::new(first_task)); + + let mut tasks_to_reenqueue = Vec::with_capacity(tasks_to_process); + + // Pull the queue channel until we get all tasks that were there at the beginning of the batch construction. + // Yet do not put all timelines in the batch, but only the first ones that fit the timeline limit. + // Still merge the rest of the pulled tasks and reenqueue those for later. 
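+        // For example, if more distinct timelines show up than the per-batch limit allows,
+        // the surplus timelines' tasks are re-enqueued untouched for a later batch, while
+        // additional tasks for timelines already in the batch are still merged in.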
+ while tasks_to_process > 0 { + match sync_queue_receiver.try_recv() { + Ok((sync_id, new_task)) => { + self.len.fetch_sub(1, Ordering::Relaxed); + tasks_to_process -= 1; + + match batches.entry(sync_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), + hash_map::Entry::Vacant(v) => { + timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); + if timelines_left_to_batch == 0 { + tasks_to_reenqueue.push((sync_id, new_task)); + } else { + v.insert(SyncTaskBatch::new(new_task)); + } + } + } + } + Err(TryRecvError::Disconnected) => { + debug!("Sender disconnected, batch collection aborted"); + break; + } + Err(TryRecvError::Empty) => { + debug!("No more data in the sync queue, task batch is not full"); + break; + } + } + } + + debug!( + "Batched {} timelines, reenqueuing {}", + batches.len(), + tasks_to_reenqueue.len() + ); + for (id, task) in tasks_to_reenqueue { + self.push(id, task); + } + + batches + } + + fn len(&self) -> usize { + self.len.load(Ordering::Relaxed) + } +} + +/// A task to run in the async download/upload loop. +/// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. +#[derive(Debug, Clone)] +enum SyncTask { + /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. + /// Not necessary more fresh than the one already uploaded. + Download(SyncData), + /// A certain amount of image files to download. + Upload(SyncData), + /// Delete remote files. + Delete(SyncData), +} + +/// Stores the data to synd and its retries, to evict the tasks failing to frequently. +#[derive(Debug, Clone, PartialEq, Eq)] +struct SyncData { + retries: u32, + data: T, +} + +impl SyncData { + fn new(retries: u32, data: T) -> Self { + Self { retries, data } + } +} + +impl SyncTask { + fn download(download_task: LayersDownload) -> Self { + Self::Download(SyncData::new(0, download_task)) + } + + fn upload(upload_task: LayersUpload) -> Self { + Self::Upload(SyncData::new(0, upload_task)) + } + + fn delete(delete_task: LayersDeletion) -> Self { + Self::Delete(SyncData::new(0, delete_task)) + } +} + +#[derive(Debug, Default, PartialEq, Eq)] +struct SyncTaskBatch { + upload: Option>, + download: Option>, + delete: Option>, +} + +impl SyncTaskBatch { + fn new(task: SyncTask) -> Self { + let mut new_self = Self::default(); + new_self.add(task); + new_self + } + + fn add(&mut self, task: SyncTask) { + match task { + SyncTask::Download(new_download) => match &mut self.download { + Some(batch_download) => { + batch_download.retries = batch_download.retries.min(new_download.retries); + batch_download + .data + .layers_to_skip + .extend(new_download.data.layers_to_skip.into_iter()); + } + None => self.download = Some(new_download), + }, + SyncTask::Upload(new_upload) => match &mut self.upload { + Some(batch_upload) => { + batch_upload.retries = batch_upload.retries.min(new_upload.retries); + + let batch_data = &mut batch_upload.data; + let new_data = new_upload.data; + batch_data + .layers_to_upload + .extend(new_data.layers_to_upload.into_iter()); + batch_data + .uploaded_layers + .extend(new_data.uploaded_layers.into_iter()); + if batch_data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + <= new_data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + { + batch_data.metadata = new_data.metadata; + } + } + None => self.upload = Some(new_upload), + }, + SyncTask::Delete(new_delete) => match &mut self.delete { + Some(batch_delete) => { + 
batch_delete.retries = batch_delete.retries.min(new_delete.retries); + // Need to reregister deletions, but it's ok to register already deleted files once again, they will be skipped. + batch_delete.data.deletion_registered = batch_delete + .data + .deletion_registered + .min(new_delete.data.deletion_registered); + + // Do not download and upload the layers getting removed in the same batch + if let Some(batch_download) = &mut self.download { + batch_download + .data + .layers_to_skip + .extend(new_delete.data.layers_to_delete.iter().cloned()); + batch_download + .data + .layers_to_skip + .extend(new_delete.data.deleted_layers.iter().cloned()); + } + if let Some(batch_upload) = &mut self.upload { + let not_deleted = |layer: &PathBuf| { + !new_delete.data.layers_to_delete.contains(layer) + && !new_delete.data.deleted_layers.contains(layer) + }; + batch_upload.data.layers_to_upload.retain(not_deleted); + batch_upload.data.uploaded_layers.retain(not_deleted); + } + + batch_delete + .data + .layers_to_delete + .extend(new_delete.data.layers_to_delete.into_iter()); + batch_delete + .data + .deleted_layers + .extend(new_delete.data.deleted_layers.into_iter()); + } + None => self.delete = Some(new_delete), + }, + } + } +} + +/// Local timeline files for upload, appeared after the new checkpoint. +/// Current checkpoint design assumes new files are added only, no deletions or amendment happens. +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersUpload { + /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. + layers_to_upload: HashSet, + /// Already uploaded layers. Used to store the data about the uploads between task retries + /// and to record the data into the remote index after the task got completed or evicted. + uploaded_layers: HashSet, + metadata: Option, +} + +/// A timeline download task. +/// Does not contain the file list to download, to allow other +/// parts of the pageserer code to schedule the task +/// without using the remote index or any other ways to list the remote timleine files. +/// Skips the files that are already downloaded. +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersDownload { + layers_to_skip: HashSet, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersDeletion { + layers_to_delete: HashSet, + deleted_layers: HashSet, + /// Pageserver uses [`IndexPart`] as a source of truth for listing the files per timeline. + /// This object gets serialized and placed into the remote storage. + /// So if we manage to update pageserver's [`RemoteIndex`] and update the index part on the remote storage, + /// the corresponding files on S3 won't exist for pageserver albeit being physically present on that remote storage still. + /// Then all that's left is to remove the files from the remote storage, without concerns about consistency. + deletion_registered: bool, +} + +/// Adds the new checkpoint files as an upload sync task to the queue. +/// On task failure, it gets retried again from the start a number of times. +/// +/// Ensure that the loop is started otherwise the task is never processed. 
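+///
+/// A hypothetical call site (e.g. a compaction hook; all arguments are assumed to be produced by the caller)
+/// could combine this with [`schedule_layer_delete`]; both tasks land in the same per-timeline batch and the
+/// deletion is only executed after the upload part of that batch has succeeded:
+///
+/// ```rust,ignore
+/// fn after_compaction(
+///     tenant_id: ZTenantId,
+///     timeline_id: ZTimelineId,
+///     new_layers: HashSet<PathBuf>,
+///     obsolete_layers: HashSet<PathBuf>,
+///     new_metadata: TimelineMetadata,
+/// ) {
+///     // Upload the freshly written layers and the updated metadata first...
+///     schedule_layer_upload(tenant_id, timeline_id, new_layers, Some(new_metadata));
+///     // ...and only then queue the obsolete layers for removal from the remote storage.
+///     schedule_layer_delete(tenant_id, timeline_id, obsolete_layers);
+/// }
+/// ```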
+pub fn schedule_layer_upload( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + layers_to_upload: HashSet, + metadata: Option, +) { + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::upload(LayersUpload { + layers_to_upload, + uploaded_layers: HashSet::new(), + metadata, + }), + ); + debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Adds the new files to delete as a deletion task to the queue. +/// On task failure, it gets retried again from the start a number of times. +/// +/// Ensure that the loop is started otherwise the task is never processed. +pub fn schedule_layer_delete( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + layers_to_delete: HashSet, +) { + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::delete(LayersDeletion { + layers_to_delete, + deleted_layers: HashSet::new(), + deletion_registered: false, + }), + ); + debug!("Deletion task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Requests the download of the entire timeline for a given tenant. +/// No existing local files are currently overwritten, except the metadata file (if its disk_consistent_lsn is less than the downloaded one). +/// The metadata file is always updated last, to avoid inconsistencies. +/// +/// On any failure, the task gets retried, omitting already downloaded layers. +/// +/// Ensure that the loop is started otherwise the task is never processed. +pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { + debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send download task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::download(LayersDownload { + layers_to_skip: HashSet::new(), + }), + ); + debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Uses a remote storage given to start the storage sync loop. +/// See module docs for loop step description. 
+pub(super) fn spawn_storage_sync_thread( + conf: &'static PageServerConf, + local_timeline_files: HashMap)>, + storage: S, + max_concurrent_timelines_sync: NonZeroUsize, + max_sync_errors: NonZeroU32, +) -> anyhow::Result +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let (sync_queue, sync_queue_receiver) = SyncQueue::new(max_concurrent_timelines_sync); + SYNC_QUEUE + .set(sync_queue) + .map_err(|_queue| anyhow!("Could not initialize sync queue"))?; + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => bail!("Could not get sync queue during the sync loop step, aborting"), + }; + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .context("Failed to create storage sync runtime")?; + + let applicable_index_parts = runtime.block_on(try_fetch_index_parts( + conf, + &storage, + local_timeline_files.keys().copied().collect(), + )); + + let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; + + let local_timeline_init_statuses = schedule_first_sync_tasks( + &mut runtime.block_on(remote_index.write()), + sync_queue, + local_timeline_files, + ); + + let loop_index = remote_index.clone(); + thread_mgr::spawn( + ThreadKind::StorageSync, + None, + None, + "Remote storage sync thread", + false, + move || { + storage_sync_loop( + runtime, + conf, + ( + Arc::new(storage), + loop_index, + sync_queue, + sync_queue_receiver, + ), + max_sync_errors, + ); + Ok(()) + }, + ) + .context("Failed to spawn remote storage sync thread")?; + Ok(SyncStartupData { + remote_index, + local_timeline_init_statuses, + }) +} + +fn storage_sync_loop( + runtime: Runtime, + conf: &'static PageServerConf, + (storage, index, sync_queue, mut sync_queue_receiver): ( + Arc, + RemoteIndex, + &SyncQueue, + UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ), + max_sync_errors: NonZeroU32, +) where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + info!("Starting remote storage sync loop"); + loop { + let loop_index = index.clone(); + let loop_storage = Arc::clone(&storage); + let loop_step = runtime.block_on(async { + tokio::select! { + step = loop_step( + conf, + (loop_storage, loop_index, sync_queue, &mut sync_queue_receiver), + max_sync_errors, + ) + .instrument(info_span!("storage_sync_loop_step")) => step, + _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), + } + }); + + match loop_step { + ControlFlow::Continue(new_timeline_states) => { + if new_timeline_states.is_empty() { + debug!("Sync loop step completed, no new timeline states"); + } else { + info!( + "Sync loop step completed, {} new timeline state update(s)", + new_timeline_states.len() + ); + // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
+ apply_timeline_sync_status_updates(conf, &index, new_timeline_states); + } + } + ControlFlow::Break(()) => { + info!("Shutdown requested, stopping"); + break; + } + } + } +} + +async fn loop_step( + conf: &'static PageServerConf, + (storage, index, sync_queue, sync_queue_receiver): ( + Arc, + RemoteIndex, + &SyncQueue, + &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ), + max_sync_errors: NonZeroU32, +) -> ControlFlow<(), HashMap>> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let batched_tasks = sync_queue.next_task_batch(sync_queue_receiver).await; + + let remaining_queue_length = sync_queue.len(); + REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); + if remaining_queue_length > 0 || !batched_tasks.is_empty() { + info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + } else { + debug!("No tasks to process"); + return ControlFlow::Continue(HashMap::new()); + } + + let mut sync_results = batched_tasks + .into_iter() + .map(|(sync_id, batch)| { + let storage = Arc::clone(&storage); + let index = index.clone(); + async move { + let state_update = process_sync_task_batch( + conf, + (storage, index, sync_queue), + max_sync_errors, + sync_id, + batch, + ) + .instrument(info_span!("process_sync_task_batch", sync_id = %sync_id)) + .await; + (sync_id, state_update) + } + }) + .collect::>(); + + let mut new_timeline_states: HashMap< + ZTenantId, + HashMap, + > = HashMap::new(); + while let Some((sync_id, state_update)) = sync_results.next().await { + debug!("Finished storage sync task for sync id {sync_id}"); + if let Some(state_update) = state_update { + new_timeline_states + .entry(sync_id.tenant_id) + .or_default() + .insert(sync_id.timeline_id, state_update); + } + } + + ControlFlow::Continue(new_timeline_states) +} + +async fn process_sync_task_batch( + conf: &'static PageServerConf, + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + max_sync_errors: NonZeroU32, + sync_id: ZTenantTimelineId, + batch: SyncTaskBatch, +) -> Option +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let sync_start = Instant::now(); + let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; + + let upload_data = batch.upload.clone(); + let download_data = batch.download.clone(); + // Run both upload and download tasks concurrently (not in parallel): + // download and upload tasks do not conflict and spoil the pageserver state even if they are executed in parallel. + // Under "spoiling" here means potentially inconsistent layer set that misses some of the layers, declared present + // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. + // When operating in a system without tasks failing over the error threshold, + // current batching and task processing systems aim to update the layer set and metadata files (remote and local), + // without "loosing" such layer files. 
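+    // Note: `tokio::join!` polls both futures on the current task, so "concurrently (not in parallel)"
+    // above means their work is interleaved on one thread rather than run on separate worker threads.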
+ let (upload_result, status_update) = tokio::join!( + async { + if let Some(upload_data) = upload_data { + match validate_task_retries(upload_data, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(new_upload_data) => { + upload_timeline_data( + conf, + (storage.as_ref(), &index, sync_queue), + current_remote_timeline.as_ref(), + sync_id, + new_upload_data, + sync_start, + "upload", + ) + .await; + return Some(()); + } + ControlFlow::Break(failed_upload_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: failed_upload_data.data, + upload_failed: true, + }, + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } + } + } + } + None + } + .instrument(info_span!("upload_timeline_data")), + async { + if let Some(download_data) = download_data { + match validate_task_retries(download_data, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(new_download_data) => { + return download_timeline_data( + conf, + (storage.as_ref(), &index, sync_queue), + current_remote_timeline.as_ref(), + sync_id, + new_download_data, + sync_start, + "download", + ) + .await; + } + ControlFlow::Break(_) => { + index + .write() + .await + .set_awaits_download(&sync_id, false) + .ok(); + } + } + } + None + } + .instrument(info_span!("download_timeline_data")), + ); + + if let Some(delete_data) = batch.delete { + if upload_result.is_some() { + match validate_task_retries(delete_data, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(new_delete_data) => { + delete_timeline_data( + conf, + (storage.as_ref(), &index, sync_queue), + sync_id, + new_delete_data, + sync_start, + "delete", + ) + .instrument(info_span!("delete_timeline_data")) + .await; + } + ControlFlow::Break(failed_delete_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } + } + } + } else { + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + warn!("Skipping delete task due to failed upload tasks, reenqueuing"); + } + } + + status_update +} + +async fn download_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: ZTenantTimelineId, + new_download_data: SyncData, + sync_start: Instant, + task_name: &str, +) -> Option +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + match download_timeline_layers( + conf, + storage, + sync_queue, + current_remote_timeline, + sync_id, + new_download_data, + ) + .await + { + DownloadedTimeline::Abort => { + register_sync_status(sync_start, task_name, None); + if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { + error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); + } + } + DownloadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_start, task_name, Some(false)); + } + DownloadedTimeline::Successful(mut download_data) => { + match update_local_metadata(conf, sync_id, current_remote_timeline).await { + Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { + Ok(()) => { + 
register_sync_status(sync_start, task_name, Some(true)); + return Some(TimelineSyncStatusUpdate::Downloaded); + } + Err(e) => { + error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); + } + }, + Err(e) => { + error!("Failed to update local timeline metadata: {e:?}"); + download_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Download(download_data)); + register_sync_status(sync_start, task_name, Some(false)); + } + } + } + } + + None +} + +async fn update_local_metadata( + conf: &'static PageServerConf, + sync_id: ZTenantTimelineId, + remote_timeline: Option<&RemoteTimeline>, +) -> anyhow::Result<()> { + let remote_metadata = match remote_timeline { + Some(timeline) => &timeline.metadata, + None => { + info!("No remote timeline to update local metadata from, skipping the update"); + return Ok(()); + } + }; + let remote_lsn = remote_metadata.disk_consistent_lsn(); + + let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_lsn = if local_metadata_path.exists() { + let local_metadata = read_metadata_file(&local_metadata_path) + .await + .with_context(|| { + format!( + "Failed to load local metadata from path '{}'", + local_metadata_path.display() + ) + })?; + + Some(local_metadata.disk_consistent_lsn()) + } else { + None + }; + + if local_lsn < Some(remote_lsn) { + info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); + // clone because spawn_blocking requires static lifetime + let cloned_metadata = remote_metadata.to_owned(); + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = sync_id; + tokio::task::spawn_blocking(move || { + LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + }) + .await + .with_context(|| { + format!( + "failed to join save_metadata task for {}", + local_metadata_path.display() + ) + })? 
+ .with_context(|| { + format!( + "Failed to write remote metadata bytes locally to path '{}'", + local_metadata_path.display() + ) + })?; + } else { + info!("Local metadata at path '{}' has later disk consistent Lsn ({local_lsn:?}) than the remote one ({remote_lsn}), skipping the update", local_metadata_path.display()); + } + + Ok(()) +} + +async fn delete_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + sync_id: ZTenantTimelineId, + mut new_delete_data: SyncData, + sync_start: Instant, + task_name: &str, +) where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let timeline_delete = &mut new_delete_data.data; + + if !timeline_delete.deletion_registered { + if let Err(e) = update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Delete(&timeline_delete.layers_to_delete), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + new_delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); + register_sync_status(sync_start, task_name, Some(false)); + return; + } + } + timeline_delete.deletion_registered = true; + + let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; + register_sync_status(sync_start, task_name, Some(sync_status)); +} + +async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { + TimelineMetadata::from_bytes( + &fs::read(metadata_path) + .await + .context("Failed to read local metadata bytes from fs")?, + ) + .context("Failed to parse metadata bytes") +} + +async fn upload_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: ZTenantTimelineId, + new_upload_data: SyncData, + sync_start: Instant, + task_name: &str, +) where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut uploaded_data = match upload_timeline_layers( + storage, + sync_queue, + current_remote_timeline, + sync_id, + new_upload_data, + ) + .await + { + UploadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_start, task_name, Some(false)); + return; + } + UploadedTimeline::Successful(upload_data) => upload_data, + }; + + match update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: uploaded_data.data.clone(), + upload_failed: false, + }, + ) + .await + { + Ok(()) => { + register_sync_status(sync_start, task_name, Some(true)); + } + Err(e) => { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + uploaded_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); + register_sync_status(sync_start, task_name, Some(false)); + } + } +} + +enum RemoteDataUpdate<'a> { + Upload { + uploaded_data: LayersUpload, + upload_failed: bool, + }, + Delete(&'a HashSet), +} + +async fn update_remote_data( + conf: &'static PageServerConf, + storage: &S, + index: &RemoteIndex, + sync_id: ZTenantTimelineId, + update: RemoteDataUpdate<'_>, +) -> anyhow::Result<()> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + info!("Updating remote index for the timeline"); + let updated_remote_timeline = { + let mut index_accessor = index.write().await; + + match index_accessor.timeline_entry_mut(&sync_id) { + Some(existing_entry) => { + match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } 
=> { + if let Some(new_metadata) = uploaded_data.metadata.as_ref() { + if existing_entry.metadata.disk_consistent_lsn() + < new_metadata.disk_consistent_lsn() + { + existing_entry.metadata = new_metadata.clone(); + } + } + if upload_failed { + existing_entry.add_upload_failures( + uploaded_data.layers_to_upload.iter().cloned(), + ); + } else { + existing_entry + .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } + } + RemoteDataUpdate::Delete(layers_to_remove) => { + existing_entry.remove_layers(layers_to_remove) + } + } + existing_entry.clone() + } + None => match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } => { + let new_metadata = match uploaded_data.metadata.as_ref() { + Some(new_metadata) => new_metadata, + None => bail!("For timeline {sync_id} upload, there's no upload metadata and no remote index entry, cannot create a new one"), + }; + let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); + if upload_failed { + new_remote_timeline + .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); + } else { + new_remote_timeline + .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } + + index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); + new_remote_timeline + } + RemoteDataUpdate::Delete(_) => { + warn!("No remote index entry for timeline {sync_id}, skipping deletion"); + return Ok(()); + } + }, + } + }; + + let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); + let new_index_part = + IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) + .context("Failed to create an index part from the updated remote timeline")?; + + info!("Uploading remote data for the timeline"); + upload_index_part(conf, storage, sync_id, new_index_part) + .await + .context("Failed to upload new index part") +} + +async fn validate_task_retries( + sync_data: SyncData, + max_sync_errors: NonZeroU32, +) -> ControlFlow, SyncData> { + let current_attempt = sync_data.retries; + let max_sync_errors = max_sync_errors.get(); + if current_attempt >= max_sync_errors { + error!( + "Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}", + ); + return ControlFlow::Break(sync_data); + } + + if current_attempt > 0 { + let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0); + info!("Waiting {seconds_to_wait} seconds before starting the task"); + tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; + } + ControlFlow::Continue(sync_data) +} + +async fn try_fetch_index_parts( + conf: &'static PageServerConf, + storage: &S, + keys: HashSet, +) -> HashMap +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut index_parts = HashMap::with_capacity(keys.len()); + + let mut part_downloads = keys + .into_iter() + .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) + .collect::>(); + + while let Some((id, part_upload_result)) = part_downloads.next().await { + match part_upload_result { + Ok(index_part) => { + debug!("Successfully fetched index part for {id}"); + index_parts.insert(id, index_part); + } + Err(e) => warn!("Failed to fetch index part for {id}: {e}"), + } + } + + index_parts +} + +fn schedule_first_sync_tasks( + index: &mut RemoteTimelineIndex, + sync_queue: &SyncQueue, + local_timeline_files: HashMap)>, +) -> LocalTimelineInitStatuses { + let mut local_timeline_init_statuses = 
LocalTimelineInitStatuses::new(); + + let mut new_sync_tasks = + VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); + + for (sync_id, (local_metadata, local_files)) in local_timeline_files { + match index.timeline_entry_mut(&sync_id) { + Some(remote_timeline) => { + let (timeline_status, awaits_download) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata, + local_files, + remote_timeline, + ); + let was_there = local_timeline_init_statuses + .entry(sync_id.tenant_id) + .or_default() + .insert(sync_id.timeline_id, timeline_status); + + if was_there.is_some() { + // defensive check + warn!( + "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", + sync_id.timeline_id + ); + } + remote_timeline.awaits_download = awaits_download; + } + None => { + // TODO (rodionov) does this mean that we've crashed during tenant creation? + // is it safe to upload this checkpoint? could it be half broken? + new_sync_tasks.push_back(( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload: local_files, + uploaded_layers: HashSet::new(), + metadata: Some(local_metadata), + }), + )); + local_timeline_init_statuses + .entry(sync_id.tenant_id) + .or_default() + .insert( + sync_id.timeline_id, + LocalTimelineInitStatus::LocallyComplete, + ); + } + } + } + + new_sync_tasks.into_iter().for_each(|(sync_id, task)| { + sync_queue.push(sync_id, task); + }); + local_timeline_init_statuses +} + +fn compare_local_and_remote_timeline( + new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, + sync_id: ZTenantTimelineId, + local_metadata: TimelineMetadata, + local_files: HashSet, + remote_entry: &RemoteTimeline, +) -> (LocalTimelineInitStatus, bool) { + let remote_files = remote_entry.stored_files(); + + // TODO probably here we need more sophisticated logic, + // if more data is available remotely can we just download whats there? + // without trying to upload something. It may be tricky, needs further investigation. + // For now looks strange that we can request upload + // and dowload for the same timeline simultaneously. + // (upload needs to be only for previously unsynced files, not whole timeline dir). + // If one of the tasks fails they will be reordered in the queue which can lead + // to timeline being stuck in evicted state + let number_of_layers_to_download = remote_files.difference(&local_files).count(); + let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { + new_sync_tasks.push_back(( + sync_id, + SyncTask::download(LayersDownload { + layers_to_skip: local_files.clone(), + }), + )); + (LocalTimelineInitStatus::NeedsSync, true) + // we do not need to manupulate with remote consistent lsn here + // because it will be updated when sync will be completed + } else { + (LocalTimelineInitStatus::LocallyComplete, false) + }; + + let layers_to_upload = local_files + .difference(remote_files) + .cloned() + .collect::>(); + if !layers_to_upload.is_empty() { + new_sync_tasks.push_back(( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload, + uploaded_layers: HashSet::new(), + metadata: Some(local_metadata), + }), + )); + // Note that status here doesn't change. 
+ } + + (initial_timeline_status, awaits_download) +} + +fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { + let secs_elapsed = sync_start.elapsed().as_secs_f64(); + info!("Processed a sync task in {secs_elapsed:.2} seconds"); + match sync_status { + Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), + Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), + None => return, + } + .observe(secs_elapsed) +} + +#[cfg(test)] +mod test_utils { + use utils::lsn::Lsn; + + use crate::repository::repo_harness::RepoHarness; + + use super::*; + + pub(super) async fn create_local_timeline( + harness: &RepoHarness<'_>, + timeline_id: ZTimelineId, + filenames: &[&str], + metadata: TimelineMetadata, + ) -> anyhow::Result { + let timeline_path = harness.timeline_path(&timeline_id); + fs::create_dir_all(&timeline_path).await?; + + let mut layers_to_upload = HashSet::with_capacity(filenames.len()); + for &file in filenames { + let file_path = timeline_path.join(file); + fs::write(&file_path, dummy_contents(file).into_bytes()).await?; + layers_to_upload.insert(file_path); + } + + fs::write( + metadata_path(harness.conf, timeline_id, harness.tenant_id), + metadata.to_bytes()?, + ) + .await?; + + Ok(LayersUpload { + layers_to_upload, + uploaded_layers: HashSet::new(), + metadata: Some(metadata), + }) + } + + pub(super) fn dummy_contents(name: &str) -> String { + format!("contents for {name}") + } + + pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { + TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) + } +} + +#[cfg(test)] +mod tests { + use super::test_utils::dummy_metadata; + use crate::repository::repo_harness::TIMELINE_ID; + use hex_literal::hex; + use utils::lsn::Lsn; + + use super::*; + + const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + + #[tokio::test] + async fn separate_task_ids_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let sync_id_2 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + let sync_id_3 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + assert!(sync_id_2 != sync_id_3); + assert!(sync_id_3 != TEST_SYNC_ID); + + let download_task = SyncTask::download(LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk")]), + }); + let upload_task = SyncTask::upload(LayersUpload { + layers_to_upload: HashSet::from([PathBuf::from("up")]), + uploaded_layers: HashSet::from([PathBuf::from("upl")]), + metadata: Some(dummy_metadata(Lsn(2))), + }); + let delete_task = SyncTask::delete(LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }); + + sync_queue.push(TEST_SYNC_ID, download_task.clone()); + sync_queue.push(sync_id_2, upload_task.clone()); + sync_queue.push(sync_id_3, delete_task.clone()); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + batch.len(), + submitted_tasks_count, + "Batch should 
consist of all tasks submitted" + ); + + assert_eq!( + Some(SyncTaskBatch::new(download_task)), + batch.remove(&TEST_SYNC_ID) + ); + assert_eq!( + Some(SyncTaskBatch::new(upload_task)), + batch.remove(&sync_id_2) + ); + assert_eq!( + Some(SyncTaskBatch::new(delete_task)), + batch.remove(&sync_id_3) + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_separate_tasks_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let download = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk")]), + }; + let upload = LayersUpload { + layers_to_upload: HashSet::from([PathBuf::from("up")]), + uploaded_layers: HashSet::from([PathBuf::from("upl")]), + metadata: Some(dummy_metadata(Lsn(2))), + }; + let delete = LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }; + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::upload(upload.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::delete(delete.clone())); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + batch.len(), + 1, + "Queue should have one batch merged from 3 sync tasks of the same user" + ); + + assert_eq!( + Some(SyncTaskBatch { + upload: Some(SyncData { + retries: 0, + data: upload + }), + download: Some(SyncData { + retries: 0, + data: download + }), + delete: Some(SyncData { + retries: 0, + data: delete + }), + }), + batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks unchanged" + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_same_tasks_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(1).unwrap()); + let download_1 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk1")]), + }; + let download_2 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk2")]), + }; + let download_3 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk3")]), + }; + let download_4 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk4")]), + }; + + let sync_id_2 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_1.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_2.clone())); + sync_queue.push(sync_id_2, SyncTask::download(download_3.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_4.clone())); + assert_eq!(sync_queue.len(), 4); + + let mut smallest_batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + smallest_batch.len(), + 1, + "Queue should have one batch merged from the all sync tasks, but not the other user's task" + ); + assert_eq!( + Some(SyncTaskBatch { + download: Some(SyncData { + retries: 0, + data: LayersDownload { + layers_to_skip: { + let mut set = HashSet::new(); + set.extend(download_1.layers_to_skip.into_iter()); + set.extend(download_2.layers_to_skip.into_iter()); + 
set.extend(download_4.layers_to_skip.into_iter()); + set + }, + } + }), + upload: None, + delete: None, + }), + smallest_batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks merged for the tenant first appeared in the batch" + ); + + assert!(smallest_batch.is_empty(), "Should check all batch tasks"); + assert_eq!( + sync_queue.len(), + 1, + "Should have one task left out of the batch" + ); + } +} diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs new file mode 100644 index 0000000000..047ad6c2be --- /dev/null +++ b/pageserver/src/storage_sync/delete.rs @@ -0,0 +1,228 @@ +//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tracing::{debug, error, info}; + +use crate::storage_sync::{SyncQueue, SyncTask}; +use remote_storage::RemoteStorage; +use utils::zid::ZTenantTimelineId; + +use super::{LayersDeletion, SyncData}; + +/// Attempts to remove the timleline layers from the remote storage. +/// If the task had not adjusted the metadata before, the deletion will fail. +pub(super) async fn delete_timeline_layers<'a, P, S>( + storage: &'a S, + sync_queue: &SyncQueue, + sync_id: ZTenantTimelineId, + mut delete_data: SyncData, +) -> bool +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + if !delete_data.data.deletion_registered { + error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + return false; + } + + if delete_data.data.layers_to_delete.is_empty() { + info!("No layers to delete, skipping"); + return true; + } + + let layers_to_delete = delete_data + .data + .layers_to_delete + .drain() + .collect::>(); + debug!("Layers to delete: {layers_to_delete:?}"); + info!("Deleting {} timeline layers", layers_to_delete.len()); + + let mut delete_tasks = layers_to_delete + .into_iter() + .map(|local_layer_path| async { + let storage_path = + match storage + .remote_object_id(&local_layer_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + }) { + Ok(path) => path, + Err(e) => return Err((e, local_layer_path)), + }; + + match storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) { + Ok(()) => Ok(local_layer_path), + Err(e) => Err((e, local_layer_path)), + } + }) + .collect::>(); + + let mut errored = false; + while let Some(deletion_result) = delete_tasks.next().await { + match deletion_result { + Ok(local_layer_path) => { + debug!( + "Successfully deleted layer {} for timeline {sync_id}", + local_layer_path.display() + ); + delete_data.data.deleted_layers.insert(local_layer_path); + } + Err((e, local_layer_path)) => { + errored = true; + error!( + "Failed to delete layer {} for timeline {sync_id}: {e:?}", + local_layer_path.display() + ); + delete_data.data.layers_to_delete.insert(local_layer_path); + } + } + } + + if errored { + debug!("Reenqueuing failed delete task for timeline {sync_id}"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } + errored +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, num::NonZeroUsize}; + + use itertools::Itertools; + use tempfile::tempdir; + use tokio::fs; + use 
utils::lsn::Lsn; + + use crate::{ + repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + }; + use remote_storage::LocalFs; + + use super::*; + + #[tokio::test] + async fn delete_timeline_negative() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline_negative")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: 1, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::new(), + deletion_registered: false, + }, + }, + ) + .await; + + assert!( + !deleted, + "Should not start the deletion for task with delete metadata unregistered" + ); + + Ok(()) + } + + #[tokio::test] + async fn delete_timeline() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "c", "d"]; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + for local_path in timeline_upload.layers_to_upload { + let remote_path = storage.remote_object_id(&local_path)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + layer_files + .iter() + .map(|layer_str| layer_str.to_string()) + .sorted() + .collect::>(), + "Expect to have all layer files remotely before deletion" + ); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: current_retries, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::from([ + local_timeline_path.join("a"), + local_timeline_path.join("c"), + local_timeline_path.join("something_different"), + ]), + deletion_registered: true, + }, + }, + ) + .await; + assert!(deleted, "Should be able to delete timeline files"); + + assert_eq!( + storage + .list() + .await? 
+ .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + vec!["b".to_string(), "d".to_string()], + "Expect to have only non-deleted files remotely" + ); + + Ok(()) + } +} diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs similarity index 88% rename from pageserver/src/remote_storage/storage_sync/download.rs rename to pageserver/src/storage_sync/download.rs index 7e2496b796..98a0a0e2fc 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -4,6 +4,7 @@ use std::{collections::HashSet, fmt::Debug, path::Path}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::{path_with_suffix_extension, RemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -11,18 +12,13 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{ - config::PageServerConf, - layered_repository::metadata::metadata_path, - remote_storage::{ - storage_sync::{path_with_suffix_extension, sync_queue, SyncTask}, - RemoteStorage, - }, + config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; use utils::zid::ZTenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, - SyncData, TimelineDownload, + LayersDownload, SyncData, SyncQueue, }; pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; @@ -35,17 +31,19 @@ pub async fn download_index_part( ) -> anyhow::Result where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let part_storage_path = storage.storage_path(&index_part_path).with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; + let part_storage_path = storage + .remote_object_id(&index_part_path) + .with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; let mut index_part_bytes = Vec::new(); storage .download(&part_storage_path, &mut index_part_bytes) @@ -76,7 +74,7 @@ pub(super) enum DownloadedTimeline { FailedAndRescheduled, /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. /// Initial download successful. - Successful(SyncData), + Successful(SyncData), } /// Attempts to download all given timeline's layers. 
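Both the deletion test above and the download code below now go through the renamed `remote_object_id`/`local_path` helpers instead of the old `storage_path`. A minimal sketch of that mapping against the `LocalFs` backend, assuming only the constructor and method names shown in this diff (the directories are made up for illustration):

```
// Minimal sketch, not part of this patch: round-trip a local layer path through the
// renamed RemoteStorage helpers, the same way the deletion test above checks its results.
// The directories are hypothetical; only the constructor and method names come from this diff.
use std::path::PathBuf;

use remote_storage::{LocalFs, RemoteStorage};

fn roundtrip_layer_path() -> anyhow::Result<()> {
    let workdir = PathBuf::from("/tmp/pageserver_workdir"); // hypothetical pageserver workdir
    let remote_root = PathBuf::from("/tmp/remote_storage_root"); // hypothetical remote root
    let storage = LocalFs::new(remote_root, workdir.clone())?;

    let local_layer = workdir.join("tenants/t1/timelines/tl1/layer_a");
    // `remote_object_id` replaces the old `storage_path` helper.
    let remote_id = storage.remote_object_id(&local_layer)?;
    // `local_path` maps a listed remote object back to its local location.
    assert_eq!(storage.local_path(&remote_id).unwrap(), local_layer);
    Ok(())
}
```

The deletion and download tests in this patch use the same pair of calls to seed and then verify the `LocalFs` directory.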
@@ -87,13 +85,14 @@ pub(super) enum DownloadedTimeline { pub(super) async fn download_timeline_layers<'a, P, S>( conf: &'static PageServerConf, storage: &'a S, + sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - mut download_data: SyncData, + mut download_data: SyncData, ) -> DownloadedTimeline where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let remote_timeline = match remote_timeline { Some(remote_timeline) => { @@ -120,6 +119,11 @@ where debug!("Layers to download: {layers_to_download:?}"); info!("Downloading {} timeline layers", layers_to_download.len()); + if layers_to_download.is_empty() { + info!("No layers to download after filtering, skipping"); + return DownloadedTimeline::Successful(download_data); + } + let mut download_tasks = layers_to_download .into_iter() .map(|layer_desination_path| async move { @@ -130,7 +134,7 @@ where ); } else { let layer_storage_path = storage - .storage_path(&layer_desination_path) + .remote_object_id(&layer_desination_path) .with_context(|| { format!( "Failed to get the layer storage path for local path '{}'", @@ -246,7 +250,7 @@ where if errors_happened { debug!("Reenqueuing failed download task for timeline {sync_id}"); download_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Download(download_data)); + sync_queue.push(sync_id, SyncTask::Download(download_data)); DownloadedTimeline::FailedAndRescheduled } else { info!("Successfully downloaded all layers"); @@ -260,20 +264,21 @@ async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { #[cfg(test)] mod tests { - use std::collections::{BTreeSet, HashSet}; + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + }; + use remote_storage::{LocalFs, RemoteStorage}; use tempfile::tempdir; use utils::lsn::Lsn; use crate::{ - remote_storage::{ - storage_sync::{ - index::RelativePath, - test_utils::{create_local_timeline, dummy_metadata}, - }, - LocalFs, - }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, }; use super::*; @@ -281,9 +286,14 @@ mod tests { #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -291,7 +301,7 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.storage_path(&local_path)?; + let remote_path = storage.remote_object_id(&local_path)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -318,11 +328,12 @@ mod tests { let download_data = match download_timeline_layers( harness.conf, &storage, + &sync_queue, Some(&remote_timeline), sync_id, SyncData::new( current_retries, - TimelineDownload { + LayersDownload 
{ layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]), }, ), @@ -374,17 +385,19 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline_negatives")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let empty_remote_timeline_download = download_timeline_layers( harness.conf, &storage, + &sync_queue, None, sync_id, SyncData::new( 0, - TimelineDownload { + LayersDownload { layers_to_skip: HashSet::new(), }, ), @@ -403,11 +416,12 @@ mod tests { let already_downloading_remote_timeline_download = download_timeline_layers( harness.conf, &storage, + &sync_queue, Some(¬_expecting_download_remote_timeline), sync_id, SyncData::new( 0, - TimelineDownload { + LayersDownload { layers_to_skip: HashSet::new(), }, ), @@ -429,7 +443,10 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -450,7 +467,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let storage_path = storage.storage_path(&local_index_part_path)?; + let storage_path = storage.remote_object_id(&local_index_part_path)?; fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs similarity index 97% rename from pageserver/src/remote_storage/storage_sync/index.rs rename to pageserver/src/storage_sync/index.rs index d847e03a24..7764a810bc 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -8,7 +8,7 @@ use std::{ sync::Arc, }; -use anyhow::{Context, Ok}; +use anyhow::{anyhow, Context, Ok}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; @@ -113,7 +113,7 @@ impl RemoteTimelineIndex { awaits_download: bool, ) -> anyhow::Result<()> { self.timeline_entry_mut(id) - .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))? + .ok_or_else(|| anyhow!("unknown timeline sync {id}"))? .awaits_download = awaits_download; Ok(()) } @@ -147,6 +147,13 @@ impl RemoteTimeline { self.missing_layers.extend(upload_failures.into_iter()); } + pub fn remove_layers(&mut self, layers_to_remove: &HashSet) { + self.timeline_layers + .retain(|layer| !layers_to_remove.contains(layer)); + self.missing_layers + .retain(|layer| !layers_to_remove.contains(layer)); + } + /// Lists all layer files in the given remote timeline. Omits the metadata file. 
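The new `remove_layers` helper above drops the given paths from both the stored layers and the failed-upload (`missing_layers`) set. A small illustrative sketch of that behavior; the layer names are made up and `metadata` is assumed to be provided by the caller:

```
// Illustrative sketch only: expected behavior of the new remove_layers helper.
// Layer names are hypothetical; `metadata` is assumed to come from the caller.
use std::collections::HashSet;
use std::path::PathBuf;

use crate::layered_repository::metadata::TimelineMetadata;
use crate::storage_sync::index::RemoteTimeline;

fn remove_layers_example(metadata: TimelineMetadata) {
    let mut remote_timeline = RemoteTimeline::new(metadata);
    remote_timeline.add_timeline_layers(vec![PathBuf::from("layer_a"), PathBuf::from("layer_b")].into_iter());
    remote_timeline.add_upload_failures(vec![PathBuf::from("layer_c")].into_iter());

    remote_timeline.remove_layers(&HashSet::from([PathBuf::from("layer_a"), PathBuf::from("layer_c")]));

    // Only layer_b is still tracked as stored; layer_c is no longer listed as missing.
    assert_eq!(
        remote_timeline.stored_files(),
        &HashSet::from([PathBuf::from("layer_b")])
    );
}
```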
pub fn stored_files(&self) -> &HashSet { &self.timeline_layers diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs similarity index 80% rename from pageserver/src/remote_storage/storage_sync/upload.rs rename to pageserver/src/storage_sync/upload.rs index d2ff77e92e..f9d606f2b8 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,20 +4,19 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, - layered_repository::metadata::metadata_path, - remote_storage::{ - storage_sync::{index::RemoteTimeline, sync_queue, SyncTask}, - RemoteStorage, - }, -}; use utils::zid::ZTenantTimelineId; -use super::{index::IndexPart, SyncData, TimelineUpload}; +use super::{ + index::{IndexPart, RemoteTimeline}, + LayersUpload, SyncData, SyncQueue, +}; +use crate::{ + config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, +}; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -28,7 +27,7 @@ pub(super) async fn upload_index_part( ) -> anyhow::Result<()> where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; @@ -38,12 +37,15 @@ where let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; + let index_part_storage_path = + storage + .remote_object_id(&index_part_path) + .with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; storage .upload( @@ -64,11 +66,7 @@ pub(super) enum UploadedTimeline { /// Upload failed due to some error, the upload task is rescheduled for another retry. FailedAndRescheduled, /// No issues happened during the upload, all task files were put into the remote storage. - Successful(SyncData), - /// No failures happened during the upload, but some files were removed locally before the upload task completed - /// (could happen due to retries, for instance, if GC happens in the interim). - /// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file. - SuccessfulAfterLocalFsUpdate(SyncData), + Successful(SyncData), } /// Attempts to upload given layer files. @@ -77,16 +75,20 @@ pub(super) enum UploadedTimeline { /// On an error, bumps the retries count and reschedules the entire task. 
pub(super) async fn upload_timeline_layers<'a, P, S>( storage: &'a S, + sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - mut upload_data: SyncData, + mut upload_data: SyncData, ) -> UploadedTimeline where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let upload = &mut upload_data.data; - let new_upload_lsn = upload.metadata.disk_consistent_lsn(); + let new_upload_lsn = upload + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()); let already_uploaded_layers = remote_timeline .map(|timeline| timeline.stored_files()) @@ -99,9 +101,14 @@ where .cloned() .collect::>(); + if layers_to_upload.is_empty() { + info!("No layers to upload after filtering, aborting"); + return UploadedTimeline::Successful(upload_data); + } + debug!("Layers to upload: {layers_to_upload:?}"); info!( - "Uploading {} timeline layers, new lsn: {new_upload_lsn}", + "Uploading {} timeline layers, new lsn: {new_upload_lsn:?}", layers_to_upload.len(), ); @@ -109,7 +116,7 @@ where .into_iter() .map(|source_path| async move { let storage_path = storage - .storage_path(&source_path) + .remote_object_id(&source_path) .with_context(|| { format!( "Failed to get the layer storage path for local path '{}'", @@ -156,7 +163,6 @@ where .collect::>(); let mut errors_happened = false; - let mut local_fs_updated = false; while let Some(upload_result) = upload_tasks.next().await { match upload_result { Ok(uploaded_path) => { @@ -173,7 +179,16 @@ where errors_happened = true; error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); } else { - local_fs_updated = true; + // We have run the upload sync task, but the file we wanted to upload is gone. + // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to + // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and + // run compaction/gc threads, removing redundant files from disk. + // It's not good to pause GC/compaction because of those and we would rather skip such uploads. + // + // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). + // We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet. + // This will create "missing" layers and make data inconsistent. + // Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result. 
upload.layers_to_upload.remove(&source_path); warn!( "Missing locally a layer file {} scheduled for upload, skipping", @@ -188,11 +203,8 @@ where if errors_happened { debug!("Reenqueuing failed upload task for timeline {sync_id}"); upload_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(upload_data)); + sync_queue.push(sync_id, SyncTask::Upload(upload_data)); UploadedTimeline::FailedAndRescheduled - } else if local_fs_updated { - info!("Successfully uploaded all layers, some local layers were removed during the upload"); - UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) } else { info!("Successfully uploaded all layers"); UploadedTimeline::Successful(upload_data) @@ -206,20 +218,21 @@ enum UploadError { #[cfg(test)] mod tests { - use std::collections::{BTreeSet, HashSet}; + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + }; + use remote_storage::LocalFs; use tempfile::tempdir; use utils::lsn::Lsn; use crate::{ - remote_storage::{ - storage_sync::{ - index::RelativePath, - test_utils::{create_local_timeline, dummy_metadata}, - }, - LocalFs, - }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, }; use super::{upload_index_part, *}; @@ -227,15 +240,21 @@ mod tests { #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { let harness = RepoHarness::create("regular_layer_upload")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); - let timeline_upload = + let mut timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + timeline_upload.metadata = None; + assert!( storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" @@ -243,6 +262,7 @@ mod tests { let upload_result = upload_timeline_layers( &storage, + &sync_queue, None, sync_id, SyncData::new(current_retries, timeline_upload.clone()), @@ -278,8 +298,8 @@ mod tests { "Successful upload should have all layers uploaded" ); assert_eq!( - upload.metadata, metadata, - "Successful upload should not chage its metadata" + upload.metadata, None, + "Successful upload without metadata should not have it returned either" ); let storage_files = storage.list().await?; @@ -307,10 +327,11 @@ mod tests { #[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let current_retries = 5; let metadata = dummy_metadata(Lsn(0x40)); @@ -332,6 +353,7 @@ mod tests { let upload_result = upload_timeline_layers( &storage, + &sync_queue, None, sync_id, SyncData::new(current_retries, timeline_upload.clone()), @@ -339,7 +361,7 @@ mod tests { .await; 
let upload_data = match upload_result { - UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data, + UploadedTimeline::Successful(upload_data) => upload_data, wrong_result => panic!( "Expected a successful after local fs upload for timeline, but got: {wrong_result:?}" ), @@ -367,7 +389,8 @@ mod tests { "Successful upload should have all layers uploaded" ); assert_eq!( - upload.metadata, metadata, + upload.metadata, + Some(metadata), "Successful upload should not chage its metadata" ); @@ -397,7 +420,7 @@ mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let metadata = dummy_metadata(Lsn(0x40)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 3e0a907d00..20a723b5b5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,8 +4,9 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::DatadirTimeline; -use crate::remote_storage::{self, LocalTimelineInitStatus, RemoteIndex, SyncStartupData}; use crate::repository::{Repository, TimelineSyncStatusUpdate}; +use crate::storage_sync::index::RemoteIndex; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -96,7 +97,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result anyhow::Result<()> { Some(tenant_id), None, "Compactor thread", - true, + false, move || crate::tenant_threads::compact_loop(tenant_id), )?; @@ -253,7 +254,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { Some(tenant_id), None, "GC thread", - true, + false, move || crate::tenant_threads::gc_loop(tenant_id), ) .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 2866c6be44..b908f220ee 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -130,12 +130,14 @@ struct PageServerThread { } /// Launch a new thread +/// Note: if shutdown_process_on_error is set to true failure +/// of the thread will lead to shutdown of entire process pub fn spawn( kind: ThreadKind, tenant_id: Option, timeline_id: Option, name: &str, - fail_on_error: bool, + shutdown_process_on_error: bool, f: F, ) -> std::io::Result<()> where @@ -175,7 +177,7 @@ where thread_id, thread_rc2, shutdown_rx, - fail_on_error, + shutdown_process_on_error, f, ) }) { @@ -201,7 +203,7 @@ fn thread_wrapper( thread_id: u64, thread: Arc, shutdown_rx: watch::Receiver<()>, - fail_on_error: bool, + shutdown_process_on_error: bool, f: F, ) where F: FnOnce() -> anyhow::Result<()> + Send + 'static, @@ -221,27 +223,41 @@ fn thread_wrapper( let result = panic::catch_unwind(AssertUnwindSafe(f)); // Remove our entry from the global hashmap. 
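For reference on the renamed flag above, a hedged sketch of a call site that opts out of process shutdown; the thread kind, thread name, and work closure are invented, only the `spawn` signature comes from this diff:

```
// Hypothetical call site: with shutdown_process_on_error = false, a failing thread is
// logged together with its tenant/timeline ids, but the pageserver process keeps running.
// ThreadKind variant, thread name and the closure body are invented for illustration.
use utils::zid::ZTenantId;

use crate::thread_mgr::{self, ThreadKind};

fn spawn_background_thread(tenant_id: ZTenantId) -> std::io::Result<()> {
    thread_mgr::spawn(
        ThreadKind::StorageSync, // assumed variant; substitute a real ThreadKind
        Some(tenant_id),
        None,
        "Background sync thread",
        false, // shutdown_process_on_error
        move || {
            // ... the long-running per-tenant loop would go here ...
            Ok(())
        },
    )
}
```

This is the switch made for the GC and compaction threads above: a per-tenant failure is now logged instead of taking down the whole pageserver.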
- THREADS.lock().unwrap().remove(&thread_id); + let thread = THREADS + .lock() + .unwrap() + .remove(&thread_id) + .expect("no thread in registry"); match result { Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), Ok(Err(err)) => { - if fail_on_error { + if shutdown_process_on_error { error!( - "Shutting down: thread '{}' exited with error: {:?}", - thread_name, err + "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err ); - shutdown_pageserver(); + shutdown_pageserver(1); } else { - error!("Thread '{}' exited with error: {:?}", thread_name, err); + error!( + "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); } } Err(err) => { - error!( - "Shutting down: thread '{}' panicked: {:?}", - thread_name, err - ); - shutdown_pageserver(); + if shutdown_process_on_error { + error!( + "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); + shutdown_pageserver(1); + } else { + error!( + "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); + } } } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 85ad294da9..7cfd33c40b 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -23,8 +23,8 @@ use utils::{ use crate::{ config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::RemoteIndex, repository::{LocalTimelineState, Repository}, + storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, DatadirTimeline, RepositoryImpl, }; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 583cdecb1d..fbdb328d2c 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,6 +21,7 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. 
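The ingest changes below make WAL decoding fallible. A minimal sketch of the new calling convention, mirroring the call and field accesses in this diff (the wrapper function itself is illustrative):

```
// Sketch of the now-fallible decoding contract: callers attach context and propagate
// the error instead of assuming every WAL record decodes successfully.
// The wrapper function is illustrative; the calls mirror the ingest code in this diff.
use anyhow::Context;
use bytes::{Buf, Bytes};

use crate::walrecord::decode_wal_record;

fn decode_with_context(recdata: Bytes) -> anyhow::Result<()> {
    let decoded = decode_wal_record(recdata).context("failed decoding wal record")?;
    let mut buf = decoded.record.clone();
    buf.advance(decoded.main_data_offset);
    // ... dispatch on decoded.xl_rmid and decoded.blocks here ...
    Ok(())
}
```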
+use anyhow::Context; use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; @@ -82,7 +83,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { ) -> Result<()> { let mut modification = timeline.begin_modification(lsn); - let mut decoded = decode_wal_record(recdata); + let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -251,7 +252,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode(); + let new_checkpoint_bytes = self.checkpoint.encode()?; modification.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; @@ -635,7 +636,10 @@ impl<'a, R: Repository> WalIngest<'a, R> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + ZenithWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { ZenithWalRecord::ClogSetAborted { xids: page_xids } }, @@ -652,7 +656,10 @@ impl<'a, R: Repository> WalIngest<'a, R> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + ZenithWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { ZenithWalRecord::ClogSetAborted { xids: page_xids } }, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 5947a0c147..5a384360e2 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! +use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; @@ -9,6 +10,7 @@ use postgres_ffi::{BlockNumber, OffsetNumber}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::bin_ser::DeserializeError; /// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper /// around a PostgreSQL WAL record, or a custom zenith-specific "record". @@ -24,7 +26,10 @@ pub enum ZenithWalRecord { flags: u8, }, /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { xids: Vec }, + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, /// Mark transaction IDs as aborted on a CLOG page ClogSetAborted { xids: Vec }, /// Extend multixact offsets SLRU @@ -500,7 +505,7 @@ impl XlMultiXactTruncate { // block data // ... // main data -pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { +pub fn decode_wal_record(record: Bytes) -> Result { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -511,7 +516,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { // 1. 
Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; trace!( "decode_wal_record xl_rmid = {} xl_info = {}", @@ -739,34 +744,32 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { assert_eq!(buf.remaining(), main_data_len as usize); } - DecodedWALRecord { + Ok(DecodedWALRecord { xl_xid: xlogrec.xl_xid, xl_info: xlogrec.xl_info, xl_rmid: xlogrec.xl_rmid, record, blocks, main_data_offset, - } + }) } /// /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> String { +pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => { - format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec) - ) - } - _ => format!("{:?}", rec), + ZenithWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), } } -fn describe_postgres_wal_record(record: &Bytes) -> String { +fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? // Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, @@ -779,7 +782,7 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { // 1. Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; let unknown_str: String; @@ -827,5 +830,5 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { } }; - String::from(result) + Ok(String::from(result)) } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 6338b839ae..777718b311 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -283,6 +283,11 @@ impl PostgresRedoManager { // If something went wrong, don't try to reuse the process. Kill it, and // next request will launch a new one. if result.is_err() { + error!( + "error applying {} WAL records to reconstruct page image at LSN {}", + records.len(), + lsn + ); let process = process_guard.take().unwrap(); process.kill(); } @@ -387,7 +392,7 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. 
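The `ClogSetCommitted` redo arm just below appends the transaction commit timestamp after the fixed-size CLOG page image. A standalone sketch of that layout, with the block size hard-coded and `i64` standing in for `TimestampTz` (both simplifications for illustration):

```
// Standalone sketch of the page layout used by the ClogSetCommitted redo below:
// an 8192-byte CLOG page image optionally followed by an 8-byte big-endian commit timestamp.
// BLCKSZ is hard-coded here; the real code uses pg_constants::BLCKSZ and TimestampTz.
const BLCKSZ: usize = 8192;

fn set_commit_timestamp(page: &mut Vec<u8>, timestamp: i64) {
    // Drop any previously appended timestamp, then append the new one.
    if page.len() == BLCKSZ + 8 {
        page.truncate(BLCKSZ);
    }
    if page.len() == BLCKSZ {
        page.extend_from_slice(&timestamp.to_be_bytes());
    }
}

fn read_commit_timestamp(page: &[u8]) -> Option<i64> {
    let bytes: [u8; 8] = page.get(BLCKSZ..BLCKSZ + 8)?.try_into().ok()?;
    Some(i64::from_be_bytes(bytes))
}
```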
- ZenithWalRecord::ClogSetCommitted { xids } => { + ZenithWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -421,6 +426,21 @@ impl PostgresRedoManager { page, ); } + + // Append the timestamp + if page.len() == pg_constants::BLCKSZ as usize + 8 { + page.truncate(pg_constants::BLCKSZ as usize); + } + if page.len() == pg_constants::BLCKSZ as usize { + page.extend_from_slice(×tamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } } ZenithWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = diff --git a/poetry.lock b/poetry.lock index fe18ad226c..a7cbe0aa3c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -822,7 +822,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.0.4" +version = "3.1.7" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -844,6 +844,7 @@ importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" +pyparsing = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} pytz = "*" @@ -855,7 +856,7 @@ werkzeug = "*" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools"] +all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools"] apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] @@ -864,14 +865,16 @@ batch = ["docker (>=2.5.1)"] cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"] cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] ds = ["sshpubkeys (>=3.1.0)"] +dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +glue = ["pyparsing (>=3.0.0)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools", "flask", "flask-cors"] +server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools", "flask", "flask-cors"] ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -1068,6 +1071,17 @@ python-versions = ">=3.6" 
py = "*" pytest = ">=3.10" +[[package]] +name = "pytest-lazy-fixture" +version = "0.6.3" +description = "It helps to use fixtures in pytest.mark.parametrize" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pytest = ">=3.2.5" + [[package]] name = "pytest-xdist" version = "2.5.0" @@ -1361,7 +1375,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "58762accad4122026c650fa43421a900546e89f9908e2268410e7b11cc8c6c4e" +content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e" [metadata.files] aiopg = [ @@ -1679,8 +1693,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.0.4-py2.py3-none-any.whl", hash = "sha256:79646213d8438385182f4eea79e28725f94b3d0d3dc9a3eda81db47e0ebef6cc"}, - {file = "moto-3.0.4.tar.gz", hash = "sha256:168b8a3cb4dd8a6df8e51d582761cefa9657b9f45ac7e1eb24dae394ebc9e000"}, + {file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"}, + {file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"}, ] mypy = [ {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, @@ -1855,6 +1869,10 @@ pytest-forked = [ {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, ] +pytest-lazy-fixture = [ + {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, + {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, +] pytest-xdist = [ {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f7e872ceb9..43880d645a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] anyhow = "1.0" +async-trait = "0.1" base64 = "0.13.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" @@ -31,13 +32,13 @@ thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" +url = "2.2.2" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] -async-trait = "0.1" rcgen = "0.8.14" rstest = "0.12" tokio-postgres-rustls = "0.9.0" diff --git a/proxy/README.md b/proxy/README.md new file mode 100644 index 0000000000..458a7d9bbf --- /dev/null +++ b/proxy/README.md @@ -0,0 +1,33 @@ +# Proxy + +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. 
The following backends are currently implemented: + +* legacy + the old method: when the username ends with `@zenith`, it uses md5 auth with the dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back +* console + the new SCRAM-based console API; uses SNI info to select the destination cluster +* postgres + uses postgres to select auth secrets of existing roles; useful for local testing +* link + sends a login link for all usernames + +## Using SNI-based routing on localhost + +The proxy determines the cluster name from the subdomain: a request to `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately, `/etc/hosts` does not support domain wildcards, so it is convenient to use `*.localtest.me`, which resolves to `127.0.0.1`. With that in place, we can create a self-signed certificate and play with the proxy: + +``` +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" + +``` + +Now you can start the proxy: + +``` +./target/debug/proxy -c server.crt -k server.key +``` + +Then connect to it: + +``` +PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' +``` diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index c6d32040dc..2463f31645 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,22 +1,17 @@ mod credentials; - -#[cfg(test)] mod flow; -use crate::compute::DatabaseInfo; -use crate::config::ProxyConfig; -use crate::cplane_api::{self, CPlaneApi}; +use crate::auth_backend::{console, legacy_console, link, postgres}; +use crate::config::{AuthBackendType, ProxyConfig}; use crate::error::UserFacingError; use crate::stream::PqStream; -use crate::waiters; +use crate::{auth_backend, compute, waiters}; +use console::ConsoleAuthError::SniMissing; use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub use credentials::ClientCredentials; - -#[cfg(test)] pub use flow::*; /// Common authentication error. @@ -24,9 +19,11 @@ pub use flow::*; pub enum AuthErrorImpl { /// Authentication error reported by the console.
#[error(transparent)] - Console(#[from] cplane_api::AuthError), + Console(#[from] auth_backend::AuthError), + + #[error(transparent)] + GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError), - #[cfg(test)] #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -41,19 +38,19 @@ pub enum AuthErrorImpl { impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg)) + AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + AuthErrorImpl::Console(auth_backend::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + AuthErrorImpl::Console(auth_backend::AuthError::from(e)) } } @@ -76,112 +73,33 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Console(e) => e.to_string_client(), MalformedPassword => self.to_string(), + GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(), _ => "Internal error".to_string(), } } } -async fn handle_static( - host: String, - port: u16, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - client - .write_message(&Be::AuthenticationCleartextPassword) - .await?; - - // Read client's password bytes - let msg = client.read_password_message().await?; - let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - - let db_info = DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: Some(cleartext_password.into()), - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(db_info) -} - -async fn handle_existing_user( +async fn handle_user( config: &ProxyConfig, - client: &mut PqStream, + client: &mut PqStream, creds: ClientCredentials, -) -> Result { - let psql_session_id = new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(md5_salt)) - .await?; - - // Read client's password hash - let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - - let cplane = CPlaneApi::new(config.auth_endpoint.clone()); - let db_info = cplane - .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) - .await?; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(db_info) -} - -async fn handle_new_user( - config: &ProxyConfig, - client: &mut PqStream, -) -> Result { - let psql_session_id = new_psql_session_id(); - let greeting = hello_message(&config.redirect_uri, &psql_session_id); - - let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async { - // Give user a URL to spawn a new database - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? 
- .write_message(&Be::NoticeResponse(&greeting)) - .await?; - - // Wait for web console response (see `mgmt`) - waiter.await?.map_err(AuthErrorImpl::auth_failed) - }) - .await?; - - client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; - - Ok(db_info) -} - -fn new_psql_session_id() -> String { - hex::encode(rand::random::<[u8; 8]>()) -} - -fn parse_password(bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(bytes).ok()?.strip_suffix('\0') -} - -fn hello_message(redirect_uri: &str, session_id: &str) -> String { - format!( - concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", - " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." - ], - redirect_uri = redirect_uri, - session_id = session_id, - ) +) -> Result { + match config.auth_backend { + AuthBackendType::LegacyConsole => { + legacy_console::handle_user( + &config.auth_endpoint, + &config.auth_link_uri, + client, + &creds, + ) + .await + } + AuthBackendType::Console => { + console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await + } + AuthBackendType::Postgres => { + postgres::handle_user(&config.auth_endpoint, client, &creds).await + } + AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await, + } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c3bb6da4f8..9d2272b5ad 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,7 +1,7 @@ //! User credentials used in authentication. use super::AuthError; -use crate::compute::DatabaseInfo; +use crate::compute; use crate::config::ProxyConfig; use crate::error::UserFacingError; use crate::stream::PqStream; @@ -18,10 +18,22 @@ pub enum ClientCredsParseError { impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. -#[derive(Debug, PartialEq, Eq)] +/// Note that we don't store any kind of client key or password here. +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ClientCredentials { pub user: String, pub dbname: String, + + // New console API requires SNI info to determine the cluster name. + // Other Auth backends don't need it. + pub sni_data: Option, +} + +impl ClientCredentials { + pub fn is_existing_user(&self) -> bool { + // This logic will likely change in the future. 
+ self.user.ends_with("@zenith") + } } impl TryFrom> for ClientCredentials { @@ -37,7 +49,11 @@ impl TryFrom> for ClientCredentials { let user = get_param("user")?; let db = get_param("database")?; - Ok(Self { user, dbname: db }) + Ok(Self { + user, + dbname: db, + sni_data: None, + }) } } @@ -46,21 +62,9 @@ impl ClientCredentials { pub async fn authenticate( self, config: &ProxyConfig, - client: &mut PqStream, - ) -> Result { - use crate::config::ClientAuthMethod::*; - use crate::config::RouterConfig::*; - match &config.router_config { - Static { host, port } => super::handle_static(host.clone(), *port, client, self).await, - Dynamic(Mixed) => { - if self.user.ends_with("@zenith") { - super::handle_existing_user(config, client, self).await - } else { - super::handle_new_user(config, client).await - } - } - Dynamic(Password) => super::handle_existing_user(config, client, self).await, - Dynamic(Link) => super::handle_new_user(config, client).await, - } + client: &mut PqStream, + ) -> Result { + // This method is just a convenient facade for `handle_user` + super::handle_user(config, client, self).await } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index bcfd94a9ed..3eed0f0a23 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,19 +27,6 @@ impl AuthMethod for Scram<'_> { } } -/// Use password-based auth in [`AuthFlow`]. -pub struct Md5( - /// Salt for client. - pub [u8; 4], -); - -impl AuthMethod for Md5 { - #[inline(always)] - fn first_message(&self) -> BeMessage<'_> { - Be::AuthenticationMD5Password(self.0) - } -} - /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -70,19 +57,10 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } } -/// Stream wrapper for handling simple MD5 password auth. -impl AuthFlow<'_, S, Md5> { - /// Perform user authentication. Raise an error in case authentication failed. - #[allow(unused)] - pub async fn authenticate(self) -> Result<(), AuthError> { - unimplemented!("MD5 auth flow is yet to be implemented"); - } -} - /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> Result<(), AuthError> { + pub async fn authenticate(self) -> Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; @@ -93,10 +71,10 @@ impl AuthFlow<'_, S, Scram<'_>> { } let secret = self.state.0; - sasl::SaslStream::new(self.stream, sasl.message) + let key = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new(secret, rand::random, None)) .await?; - Ok(()) + Ok(key) } } diff --git a/proxy/src/auth_backend.rs b/proxy/src/auth_backend.rs new file mode 100644 index 0000000000..54362bf719 --- /dev/null +++ b/proxy/src/auth_backend.rs @@ -0,0 +1,31 @@ +pub mod console; +pub mod legacy_console; +pub mod link; +pub mod postgres; + +pub use legacy_console::{AuthError, AuthErrorImpl}; + +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; +use lazy_static::lazy_static; + +lazy_static! { + static ref CPLANE_WAITERS: Waiters = Default::default(); +} + +/// Give caller an opportunity to wait for the cloud's reply. 
+pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs new file mode 100644 index 0000000000..55a0889af4 --- /dev/null +++ b/proxy/src/auth_backend/console.rs @@ -0,0 +1,243 @@ +//! Declaration of Cloud API V2. + +use crate::{ + auth::{self, AuthFlow}, + compute, scram, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::auth::ClientCredentials; +use crate::stream::PqStream; + +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +#[derive(Debug, Error)] +pub enum ConsoleAuthError { + // We shouldn't include the actual secret here. + #[error("Bad authentication secret")] + BadSecret, + + #[error("Bad client credentials: {0:?}")] + BadCredentials(crate::auth::ClientCredentials), + + #[error("SNI info is missing, please upgrade the postgres client library")] + SniMissing, + + #[error("Unexpected SNI content")] + SniWrong, + + #[error(transparent)] + BadUrl(#[from] url::ParseError), + + #[error(transparent)] + Io(#[from] std::io::Error), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error("Console responded with a malformed JSON: '{0}'")] + MalformedResponse(#[from] serde_json::Error), + + #[error("Console responded with a malformed compute address: '{0}'")] + MalformedComputeAddress(String), +} + +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + + /// [Cloud API V1](super::legacy) returns cleartext password, + /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) + /// authentication, so we can leverage this method and cope without password. + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. 
+impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +async fn get_auth_info( + auth_endpoint: &str, + user: &str, + cluster: &str, +) -> Result { + let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; + + url.query_pairs_mut() + .append_pair("cluster", cluster) + .append_pair("role", user); + + // TODO: use a proper logger + println!("cplane request: {}", url); + + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?; + + scram::ServerSecret::parse(response.role_secret.as_str()) + .map(AuthInfo::Scram) + .ok_or(ConsoleAuthError::BadSecret) +} + +/// Wake up the compute node and return the corresponding connection info. +async fn wake_compute( + auth_endpoint: &str, + cluster: &str, +) -> Result<(String, u16), ConsoleAuthError> { + let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; + url.query_pairs_mut().append_pair("cluster", cluster); + + // TODO: use a proper logger + println!("cplane request: {}", url); + + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?; + let (host, port) = response + .address + .split_once(':') + .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; + let port: u16 = port + .parse() + .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; + + Ok((host.to_string(), port)) +} + +pub async fn handle_user( + auth_endpoint: &str, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + // Determine cluster name from SNI. + let cluster = creds + .sni_data + .as_ref() + .ok_or(ConsoleAuthError::SniMissing)? + .split_once('.') + .ok_or(ConsoleAuthError::SniWrong)? + .0; + + let user = creds.user.as_str(); + + // Step 1: get the auth secret + let auth_info = get_auth_info(auth_endpoint, user, cluster).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? 
+ .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + // Step 2: wake compute + let (host, port) = wake_compute(auth_endpoint, cluster).await?; + + Ok(compute::NodeInfo { + db_info: DatabaseInfo { + host, + port, + dbname: creds.dbname.clone(), + user: creds.user.clone(), + password: None, + }, + scram_keys, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/auth_backend/legacy_console.rs b/proxy/src/auth_backend/legacy_console.rs new file mode 100644 index 0000000000..29997d2389 --- /dev/null +++ b/proxy/src/auth_backend/legacy_console.rs @@ -0,0 +1,206 @@ +//! Cloud API V1. + +use super::console::DatabaseInfo; + +use crate::auth::ClientCredentials; +use crate::stream::PqStream; + +use crate::{compute, waiters}; +use serde::{Deserialize, Serialize}; + +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +use thiserror::Error; + +use crate::error::UserFacingError; + +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +// NOTE: the order of constructors is important. 
+// https://serde.rs/enum-representations.html#untagged +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +enum ProxyAuthResponse { + Ready { conn_info: DatabaseInfo }, + Error { error: String }, + NotReady { ready: bool }, // TODO: get rid of `ready` +} + +async fn authenticate_proxy_client( + auth_endpoint: &reqwest::Url, + creds: &ClientCredentials, + md5_response: &str, + salt: &[u8; 4], + psql_session_id: &str, +) -> Result { + let mut url = auth_endpoint.clone(); + url.query_pairs_mut() + .append_pair("login", &creds.user) + .append_pair("database", &creds.dbname) + .append_pair("md5response", md5_response) + .append_pair("salt", &hex::encode(salt)) + .append_pair("psql_session_id", psql_session_id); + + super::with_waiter(psql_session_id, |waiter| async { + println!("cloud request: {}", url); + // TODO: leverage `reqwest::Client` to reuse connections + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); + } + + let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; + println!("got auth info: #{:?}", auth_info); + + use ProxyAuthResponse::*; + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) + }) + .await +} + +async fn handle_existing_user( + auth_endpoint: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + let psql_session_id = super::link::new_psql_session_id(); + let md5_salt = rand::random(); + + client + .write_message(&Be::AuthenticationMD5Password(md5_salt)) + .await?; + + // Read client's password hash + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?; + + let db_info = authenticate_proxy_client( + auth_endpoint, + creds, + md5_response, + &md5_salt, + &psql_session_id, + ) + .await?; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) +} + +pub async fn handle_user( + auth_endpoint: &reqwest::Url, + auth_link_uri: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + if creds.is_existing_user() { + handle_existing_user(auth_endpoint, client, creds).await + } else { + super::link::handle_user(auth_link_uri.as_ref(), client).await + } +} + +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_proxy_auth_response() { + // Ready + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": true, + "conn_info": DatabaseInfo::default(), + })) + .unwrap(); + assert!(matches!( + auth, + ProxyAuthResponse::Ready { + conn_info: DatabaseInfo { .. } + } + )); + + // Error + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": false, + "error": "too bad, so sad", + })) + .unwrap(); + assert!(matches!(auth, ProxyAuthResponse::Error { .. })); + + // NotReady + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": false, + })) + .unwrap(); + assert!(matches!(auth, ProxyAuthResponse::NotReady { .. 
})); + } +} diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth_backend/link.rs new file mode 100644 index 0000000000..9bdb9e21c4 --- /dev/null +++ b/proxy/src/auth_backend/link.rs @@ -0,0 +1,52 @@ +use crate::{compute, stream::PqStream}; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +fn hello_message(redirect_uri: &str, session_id: &str) -> String { + format!( + concat![ + "☀️ Welcome to Neon!\n", + "To proceed with database creation, open the following link:\n\n", + " {redirect_uri}{session_id}\n\n", + "It needs to be done once and we will send you '.pgpass' file,\n", + "which will allow you to access or create ", + "databases without opening your web browser." + ], + redirect_uri = redirect_uri, + session_id = session_id, + ) +} + +pub fn new_psql_session_id() -> String { + hex::encode(rand::random::<[u8; 8]>()) +} + +pub async fn handle_user( + redirect_uri: &str, + client: &mut PqStream, +) -> Result { + let psql_session_id = new_psql_session_id(); + let greeting = hello_message(redirect_uri, &psql_session_id); + + let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async { + // Give user a URL to spawn a new database + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message(&Be::NoticeResponse(&greeting)) + .await?; + + // Wait for web console response (see `mgmt`) + waiter + .await? + .map_err(crate::auth::AuthErrorImpl::auth_failed) + }) + .await?; + + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; + + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) +} diff --git a/proxy/src/auth_backend/postgres.rs b/proxy/src/auth_backend/postgres.rs new file mode 100644 index 0000000000..148c2a2518 --- /dev/null +++ b/proxy/src/auth_backend/postgres.rs @@ -0,0 +1,93 @@ +//! Local mock of Cloud API V2. + +use super::console::{self, AuthInfo, DatabaseInfo}; +use crate::scram; +use crate::{auth::ClientCredentials, compute}; + +use crate::stream::PqStream; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +async fn get_auth_info( + auth_endpoint: &str, + creds: &ClientCredentials, +) -> Result { + // We wrap `tokio_postgres::Error` because we don't want to infect the + // method's error type with a detail that's specific to debug mode only. + let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); + + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())), + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. 
+ let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(console::ConsoleAuthError::BadSecret) + } + } +} + +pub async fn handle_user( + auth_endpoint: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?; + + let flow = crate::auth::AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = crate::auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: DatabaseInfo { + // TODO: handle that near CLI params parsing + host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(), + port: auth_endpoint.port().unwrap_or(5432), + dbname: creds.dbname.to_owned(), + user: creds.user.to_owned(), + password: None, + }, + scram_keys, + }) +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3c0eee29bc..c3c5ba47fb 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,6 @@ +use crate::auth_backend::console::DatabaseInfo; use crate::cancellation::CancelClosure; use crate::error::UserFacingError; -use serde::{Deserialize, Serialize}; use std::io; use std::net::SocketAddr; use thiserror::Error; @@ -23,32 +23,21 @@ pub enum ConnectionError { impl UserFacingError for ConnectionError {} -/// Compute node connection params. -#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - /// PostgreSQL version as [`String`]. pub type Version = String; -impl DatabaseInfo { +/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. +pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; + +/// Compute node connection params. 
+pub struct NodeInfo { + pub db_info: DatabaseInfo, + pub scram_keys: Option, +} + +impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = format!("{}:{}", self.host, self.port); + let host_port = format!("{}:{}", self.db_info.host, self.db_info.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; socket2::SockRef::from(&socket).set_keepalive(true)?; @@ -63,11 +52,13 @@ impl DatabaseInfo { .await .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - // TODO: establish a secure connection to the DB - let (client, conn) = tokio_postgres::Config::from(self) - .connect_raw(&mut socket, NoTls) - .await?; + let mut config = tokio_postgres::Config::from(self.db_info); + if let Some(scram_keys) = self.scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys)); + } + // TODO: establish a secure connection to the DB + let (client, conn) = config.connect_raw(&mut socket, NoTls).await?; let version = conn .parameter("server_version") .ok_or(ConnectionError::FailedToFetchPgVersion)? @@ -78,21 +69,3 @@ impl DatabaseInfo { Ok((socket, version, cancel_closure)) } } - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index aef079d089..077a07beb9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,65 +1,47 @@ -use anyhow::{bail, ensure, Context}; -use std::net::SocketAddr; -use std::str::FromStr; -use std::sync::Arc; - -pub type TlsConfig = Arc; +use anyhow::{ensure, Context}; +use std::{str::FromStr, sync::Arc}; #[non_exhaustive] -pub enum ClientAuthMethod { - Password, +pub enum AuthBackendType { + LegacyConsole, + Console, + Postgres, Link, - - /// Use password auth only if username ends with "@zenith" - Mixed, } -pub enum RouterConfig { - Static { host: String, port: u16 }, - Dynamic(ClientAuthMethod), -} - -impl FromStr for ClientAuthMethod { +impl FromStr for AuthBackendType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - use ClientAuthMethod::*; + println!("ClientAuthMethod::from_str: '{}'", s); + use AuthBackendType::*; match s { - "password" => Ok(Password), + "legacy" => Ok(LegacyConsole), + "console" => Ok(Console), + "postgres" => Ok(Postgres), "link" => Ok(Link), - "mixed" => Ok(Mixed), - _ => bail!("Invalid option for router: `{}`", s), + _ => Err(anyhow::anyhow!("Invalid option for auth method")), } } } pub struct ProxyConfig { - /// main entrypoint for users to connect to - pub proxy_address: SocketAddr, + /// TLS configuration for the proxy. + pub tls_config: Option, - /// method of assigning compute nodes - pub router_config: RouterConfig, + pub auth_backend: AuthBackendType, - /// internally used for status and prometheus metrics - pub http_address: SocketAddr, - - /// management endpoint. Upon user account creation control plane - /// will notify us here, so that we can 'unfreeze' user session. - /// TODO It uses postgres protocol over TCP but should be migrated to http. - pub mgmt_address: SocketAddr, - - /// send unauthenticated users to this URI - pub redirect_uri: String, - - /// control plane address where we would check auth.
pub auth_endpoint: reqwest::Url, - pub tls_config: Option, + pub auth_link_uri: reqwest::Url, } -pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result { +pub type TlsConfig = Arc; + +/// Configure TLS for the main endpoint. +pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { - let key_bytes = std::fs::read(key_path).context("SSL key file")?; + let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) .context("couldn't read TLS keys")?; @@ -68,7 +50,7 @@ pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result = Default::default(); -} - -/// Give caller an opportunity to wait for cplane's reply. -pub async fn with_waiter( - psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await -} - -pub fn notify( - psql_session_id: &str, - msg: Result, -) -> Result<(), waiters::NotifyError> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} - -/// Zenith console API wrapper. -pub struct CPlaneApi { - auth_endpoint: reqwest::Url, -} - -impl CPlaneApi { - pub fn new(auth_endpoint: reqwest::Url) -> Self { - Self { auth_endpoint } - } -} - -#[derive(Debug, Error)] -pub enum AuthErrorImpl { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error("Console responded with a malformed JSON: {0}")] - MalformedResponse(#[from] serde_json::Error), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error(transparent)] - WaiterRegister(#[from] waiters::RegisterError), - - #[error(transparent)] - WaiterWait(#[from] waiters::WaitError), -} - -#[derive(Debug, Error)] -#[error(transparent)] -pub struct AuthError(Box); - -impl AuthError { - /// Smart constructor for authentication error reported by `mgmt`. 
- pub fn auth_failed(msg: impl Into) -> Self { - AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) - } -} - -impl From for AuthError -where - AuthErrorImpl: From, -{ - fn from(e: T) -> Self { - AuthError(Box::new(e.into())) - } -} - -impl UserFacingError for AuthError { - fn to_string_client(&self) -> String { - use AuthErrorImpl::*; - match self.0.as_ref() { - AuthFailed(_) | HttpStatus(_) => self.to_string(), - _ => "Internal error".to_string(), - } - } -} - -impl CPlaneApi { - pub async fn authenticate_proxy_client( - &self, - creds: ClientCredentials, - md5_response: &str, - salt: &[u8; 4], - psql_session_id: &str, - ) -> Result { - let mut url = self.auth_endpoint.clone(); - url.query_pairs_mut() - .append_pair("login", &creds.user) - .append_pair("database", &creds.dbname) - .append_pair("md5response", md5_response) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - with_waiter(psql_session_id, |waiter| async { - println!("cplane request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); - } - - let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); - - use ProxyAuthResponse::*; - let db_info = match auth_info { - Ready { conn_info } => conn_info, - Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), - NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, - }; - - Ok(db_info) - }) - .await - } -} - -// NOTE: the order of constructors is important. -// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. } - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); - } -} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 8df46619ec..fc2a368b85 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -5,35 +5,28 @@ //! in somewhat transparent manner (again via communication with control plane API). 
mod auth; +mod auth_backend; mod cancellation; mod compute; mod config; -mod cplane_api; mod error; mod http; mod mgmt; +mod parse; mod proxy; +mod sasl; +mod scram; mod stream; mod waiters; -// Currently SCRAM is only used in tests -#[cfg(test)] -mod parse; -#[cfg(test)] -mod sasl; -#[cfg(test)] -mod scram; - use anyhow::{bail, Context}; use clap::{App, Arg}; use config::ProxyConfig; use futures::FutureExt; -use std::future::Future; +use std::{future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::GIT_VERSION; -use crate::config::{ClientAuthMethod, RouterConfig}; - /// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, @@ -44,7 +37,7 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { metrics::set_common_metrics_prefix("zenith_proxy"); - let arg_matches = App::new("Zenith proxy/router") + let arg_matches = App::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("proxy") @@ -55,18 +48,11 @@ async fn main() -> anyhow::Result<()> { .default_value("127.0.0.1:4432"), ) .arg( - Arg::new("auth-method") - .long("auth-method") + Arg::new("auth-backend") + .long("auth-backend") .takes_value(true) - .help("Possible values: password | link | mixed") - .default_value("mixed"), - ) - .arg( - Arg::new("static-router") - .short('s') - .long("static-router") - .takes_value(true) - .help("Route all clients to host:port"), + .help("Possible values: legacy | console | postgres | link") + .default_value("legacy"), ) .arg( Arg::new("mgmt") @@ -89,7 +75,7 @@ async fn main() -> anyhow::Result<()> { .short('u') .long("uri") .takes_value(true) - .help("redirect unauthenticated users to given uri") + .help("redirect unauthenticated users to the given uri in case of link auth") .default_value("http://localhost:3000/psql_session/"), ) .arg( @@ -97,77 +83,68 @@ async fn main() -> anyhow::Result<()> { .short('a') .long("auth-endpoint") .takes_value(true) - .help("API endpoint for authenticating users") + .help("cloud API endpoint for authenticating users") .default_value("http://localhost:3000/authenticate_proxy_request/"), ) .arg( - Arg::new("ssl-key") + Arg::new("tls-key") .short('k') - .long("ssl-key") + .long("tls-key") + .alias("ssl-key") // backwards compatibility .takes_value(true) - .help("path to SSL key for client postgres connections"), + .help("path to TLS key for client postgres connections"), ) .arg( - Arg::new("ssl-cert") + Arg::new("tls-cert") .short('c') - .long("ssl-cert") + .long("tls-cert") + .alias("ssl-cert") // backwards compatibility .takes_value(true) - .help("path to SSL cert for client postgres connections"), + .help("path to TLS cert for client postgres connections"), ) .get_matches(); let tls_config = match ( - arg_matches.value_of("ssl-key"), - arg_matches.value_of("ssl-cert"), + arg_matches.value_of("tls-key"), + arg_matches.value_of("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_ssl(key_path, cert_path)?), + (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), (None, None) => None, - _ => bail!("either both or neither ssl-key and ssl-cert must be specified"), + _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - let auth_method = arg_matches.value_of("auth-method").unwrap().parse()?; - let router_config = match arg_matches.value_of("static-router") { - None => RouterConfig::Dynamic(auth_method), - Some(addr) => { - if let ClientAuthMethod::Password = auth_method { - let (host, port) 
= addr.split_once(':').unwrap(); - RouterConfig::Static { - host: host.to_string(), - port: port.parse().unwrap(), - } - } else { - bail!("static-router requires --auth-method password") - } - } - }; + let proxy_address: SocketAddr = arg_matches.value_of("proxy").unwrap().parse()?; + let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; + let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { - router_config, - proxy_address: arg_matches.value_of("proxy").unwrap().parse()?, - mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?, - http_address: arg_matches.value_of("http").unwrap().parse()?, - redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, tls_config, + auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, })); println!("Version: {}", GIT_VERSION); // Check that we can bind to address before further initialization - println!("Starting http on {}", config.http_address); - let http_listener = TcpListener::bind(config.http_address).await?.into_std()?; + println!("Starting http on {}", http_address); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; - println!("Starting mgmt on {}", config.mgmt_address); - let mgmt_listener = TcpListener::bind(config.mgmt_address).await?.into_std()?; + println!("Starting mgmt on {}", mgmt_address); + let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; - println!("Starting proxy on {}", config.proxy_address); - let proxy_listener = TcpListener::bind(config.proxy_address).await?; + println!("Starting proxy on {}", proxy_address); + let proxy_listener = TcpListener::bind(proxy_address).await?; - let http = tokio::spawn(http::thread_main(http_listener)); - let proxy = tokio::spawn(proxy::thread_main(config, proxy_listener)); - let mgmt = tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)); + let tasks = [ + tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(proxy::thread_main(config, proxy_listener)), + tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), + ] + .map(flatten_err); - let tasks = [flatten_err(http), flatten_err(proxy), flatten_err(mgmt)]; + // This will block until all tasks have completed. + // Furthermore, the first one to fail will cancel the rest. let _: Vec<()> = futures::future::try_join_all(tasks).await?; Ok(()) diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 23ad8a2013..93618fff68 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::{compute::DatabaseInfo, cplane_api}; +use crate::auth_backend; use anyhow::Context; use serde::Deserialize; use std::{ @@ -10,6 +10,8 @@ use utils::{ pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, }; +/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend + /// /// Main proxy listener loop. /// @@ -75,12 +77,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(DatabaseInfo), + Success(auth_backend::console::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. 
-pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -111,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f7de1618df..821ce377f5 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -73,7 +73,7 @@ pub async fn thread_main( async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, - stream: impl AsyncRead + AsyncWrite + Unpin, + stream: impl AsyncRead + AsyncWrite + Unpin + Send, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); @@ -144,10 +144,15 @@ async fn handshake( } // Here and forth: `or_else` demands that we use a future here - let creds = async { params.try_into() } + let mut creds: auth::ClientCredentials = async { params.try_into() } .or_else(|e| stream.throw_error(e)) .await?; + // Set SNI info when available + if let Stream::Tls { tls } = stream.get_ref() { + creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned()); + } + break Ok(Some((stream, creds))); } CancelRequest(cancel_key_data) => { @@ -174,7 +179,7 @@ impl Client { } } -impl Client { +impl Client { /// Let the client authenticate and connect to the designated compute node. async fn connect_to_db( self, @@ -185,10 +190,10 @@ impl Client { // Authenticate and connect to a compute node. let auth = creds.authenticate(config, &mut stream).await; - let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; + let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let (db, version, cancel_closure) = - db_info.connect().or_else(|e| stream.throw_error(e)).await?; + node.connect().or_else(|e| stream.throw_error(e)).await?; let cancel_key_data = session.enable_cancellation(cancel_closure); stream diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 70a4d9946a..cd9032bfb9 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -39,9 +39,20 @@ pub enum Error { /// A convenient result type for SASL exchange. pub type Result = std::result::Result; +/// A result of one SASL exchange. +pub enum Step { + /// We should continue exchanging messages. + Continue(T), + /// The client has been authenticated successfully. + Authenticated(R), +} + /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. pub trait Mechanism: Sized { + /// What's produced as a result of successful authentication. + type Output; + /// Produce a server challenge to be sent to the client. /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). 
- fn exchange(self, input: &str) -> Result<(Option, String)>; + fn exchange(self, input: &str) -> Result<(Step, String)>; } diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 58be6268fe..f48aee4f26 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -49,6 +49,7 @@ impl<'a> ServerMessage<&'a str> { }) } } + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 03649b8d11..0e782c5f29 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -51,18 +51,23 @@ impl SaslStream<'_, S> { impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. - pub async fn authenticate(mut self, mut mechanism: impl Mechanism) -> super::Result<()> { + pub async fn authenticate( + mut self, + mut mechanism: M, + ) -> super::Result { loop { let input = self.recv().await?; let (moved, reply) = mechanism.exchange(input)?; + + use super::Step::*; match moved { - Some(moved) => { + Continue(moved) => { self.send(&ServerMessage::Continue(&reply)).await?; mechanism = moved; } - None => { + Authenticated(result) => { self.send(&ServerMessage::Final(&reply)).await?; - return Ok(()); + return Ok(result); } } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 44671084ee..7cc4191435 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -9,14 +9,16 @@ mod exchange; mod key; mod messages; -mod password; mod secret; mod signature; -pub use secret::*; +#[cfg(test)] +mod password; pub use exchange::Exchange; +pub use key::ScramKey; pub use secret::ServerSecret; +pub use secret::*; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 5a986b965a..cad77e15f5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -8,7 +8,6 @@ use super::signature::SignatureBuilder; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. -#[derive(Debug)] struct TlsServerEndPoint; impl std::fmt::Display for TlsServerEndPoint { @@ -28,7 +27,6 @@ impl std::str::FromStr for TlsServerEndPoint { } } -#[derive(Debug)] enum ExchangeState { /// Waiting for [`ClientFirstMessage`]. Initial, @@ -41,7 +39,6 @@ enum ExchangeState { } /// Server's side of SCRAM auth algorithm. 
-#[derive(Debug)] pub struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, @@ -65,8 +62,10 @@ impl<'a> Exchange<'a> { } impl sasl::Mechanism for Exchange<'_> { - fn exchange(mut self, input: &str) -> sasl::Result<(Option, String)> { - use ExchangeState::*; + type Output = super::ScramKey; + + fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step, String)> { + use {sasl::Step::*, ExchangeState::*}; match &self.state { Initial => { let client_first_message = @@ -85,7 +84,7 @@ impl sasl::Mechanism for Exchange<'_> { server_first_message, }; - Ok((Some(self), msg)) + Ok((Continue(self), msg)) } SaltSent { cbind_flag, @@ -127,7 +126,7 @@ impl sasl::Mechanism for Exchange<'_> { let msg = client_final_message .build_server_final_message(signature_builder, &self.secret.server_key); - Ok((None, msg)) + Ok((Authenticated(client_key), msg)) } } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 1c13471bc3..e9c65fcef3 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. -#[derive(Default, Debug, PartialEq, Eq)] +#[derive(Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], @@ -16,6 +16,10 @@ impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() } + + pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + self.bytes + } } impl From<[u8; SCRAM_KEY_LEN]> for ScramKey { diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index e8d180bcdd..765aef4443 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,7 +5,6 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. -#[derive(Debug)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, @@ -39,6 +38,7 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. + #[allow(dead_code)] pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { // Refer to `auth-scram.c : scram_mock_salt`. 
let mocked_salt = super::sha256([user.as_bytes(), nonce]); diff --git a/pyproject.toml b/pyproject.toml index 7dbdcc0304..335c6d61d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ boto3 = "^1.20.40" boto3-stubs = "^1.20.40" moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" +pytest-lazy-fixture = "^0.6.3" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 8a31311b8f..5e1ceee02e 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -24,18 +24,17 @@ walkdir = "2" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } -serde_with = {version = "1.12.0"} +serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -etcd-client = "0.8.3" tokio-util = { version = "0.7", features = ["io"] } -rusoto_core = "0.47" -rusoto_s3 = "0.47" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } +etcd_broker = { path = "../libs/etcd_broker" } +remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3fea3581a8..7e979840c2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -109,6 +109,12 @@ fn main() -> Result<()> { .takes_value(true) .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), ) + .arg( + Arg::new("broker-etcd-prefix") + .long("broker-etcd-prefix") + .takes_value(true) + .help("a prefix to always use when polling/pushing data in etcd from this safekeeper"), + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -118,7 +124,7 @@ fn main() -> Result<()> { return Ok(()); } - let mut conf: SafeKeeperConf = Default::default(); + let mut conf = SafeKeeperConf::default(); if let Some(dir) = arg_matches.value_of("datadir") { // change into the data directory. @@ -162,6 +168,9 @@ fn main() -> Result<()> { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); conf.broker_endpoints = Some(collected_ep?); } + if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") { + conf.broker_etcd_prefix = prefix.to_string(); + } start_safekeeper(conf, given_id, arg_matches.is_present("init")) } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 8ce7bdf0e5..c9ae1a8d98 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,61 +1,22 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination.
-use anyhow::bail; use anyhow::Context; use anyhow::Error; use anyhow::Result; -use etcd_client::Client; -use etcd_client::EventType; -use etcd_client::PutOptions; -use etcd_client::WatchOptions; -use lazy_static::lazy_static; -use regex::Regex; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; -use std::str::FromStr; +use etcd_broker::Client; +use etcd_broker::PutOptions; +use etcd_broker::SkTimelineSubscriptionKind; use std::time::Duration; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; -use crate::{safekeeper::Term, timeline::GlobalTimelines, SafeKeeperConf}; -use utils::{ - lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, -}; +use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use utils::zid::{ZNodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; const LEASE_TTL_SEC: i64 = 5; -// TODO: add global zenith installation ID. -const ZENITH_PREFIX: &str = "zenith"; - -/// Published data about safekeeper. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Deserialize, Serialize)] -pub struct SafekeeperInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. - #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. - #[serde_as(as = "Option")] - #[serde(default)] - pub s3_wal_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. - #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, -} pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -71,22 +32,21 @@ pub fn thread_main(conf: SafeKeeperConf) { }); } -/// Prefix to timeline related data. -fn timeline_path(zttid: &ZTenantTimelineId) -> String { +/// Key to per timeline per safekeeper data. +fn timeline_safekeeper_path( + broker_prefix: String, + zttid: ZTenantTimelineId, + sk_id: ZNodeId, +) -> String { format!( - "{}/{}/{}", - ZENITH_PREFIX, zttid.tenant_id, zttid.timeline_id + "{}/{sk_id}", + SkTimelineSubscriptionKind::timeline(broker_prefix, zttid).watch_key() ) } -/// Key to per timeline per safekeeper data. -fn timeline_safekeeper_path(zttid: &ZTenantTimelineId, sk_id: ZNodeId) -> String { - format!("{}/safekeeper/{}", timeline_path(zttid), sk_id) -} - /// Push once in a while data about all active timelines to the broker. -async fn push_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; +async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; // Get and maintain lease to automatically delete obsolete data let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; @@ -98,14 +58,17 @@ async fn push_loop(conf: SafeKeeperConf) -> Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
- let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - let sk_info = tli.get_public_info(); + for zttid in GlobalTimelines::get_active_timelines() { + if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + let sk_info = tli.get_public_info()?; let put_opts = PutOptions::new().with_lease(lease.id()); client .put( - timeline_safekeeper_path(zttid, conf.my_id), + timeline_safekeeper_path( + conf.broker_etcd_prefix.clone(), + zttid, + conf.my_id, + ), serde_json::to_string(&sk_info)?, Some(put_opts), ) @@ -128,45 +91,31 @@ async fn push_loop(conf: SafeKeeperConf) -> Result<()> { /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - lazy_static! { - static ref TIMELINE_SAFEKEEPER_RE: Regex = - Regex::new(r"^zenith/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$") - .unwrap(); - } - let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; - loop { - let wo = WatchOptions::new().with_prefix(); - // TODO: subscribe only to my timelines - let (_, mut stream) = client.watch(ZENITH_PREFIX, Some(wo)).await?; - while let Some(resp) = stream.message().await? { - if resp.canceled() { - bail!("watch canceled"); - } + let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; - for event in resp.events() { - if EventType::Put == event.event_type() { - if let Some(kv) = event.kv() { - if let Some(caps) = TIMELINE_SAFEKEEPER_RE.captures(kv.key_str()?) { - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let zttid = ZTenantTimelineId::new(tenant_id, timeline_id); - let safekeeper_id = ZNodeId(caps.get(3).unwrap().as_str().parse()?); - let value_str = kv.value_str()?; - match serde_json::from_str::(value_str) { - Ok(safekeeper_info) => { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - tli.record_safekeeper_info(&safekeeper_info, safekeeper_id)? - } - } - Err(err) => warn!( - "failed to deserialize safekeeper info {}: {}", - value_str, err - ), - } + let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + &mut client, + SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()), + ) + .await + .context("failed to subscribe for safekeeper info")?; + + loop { + match subscription.fetch_data().await { + Some(new_info) => { + for (zttid, sk_info) in new_info { + // note: there are blocking operations below, but it's considered fine for now + if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + for (safekeeper_id, info) in sk_info { + tli.record_safekeeper_info(&info, safekeeper_id)? 
} } } } + None => { + debug!("timeline updates sender closed, aborting the pull loop"); + return Ok(()); + } } } } diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 0cb14298cb..22716de1a0 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -103,6 +103,43 @@ pub struct SafeKeeperStateV3 { pub wal_start_lsn: Lsn, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV4 { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealed with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// First LSN not yet offloaded to s3. Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). 
+ pub peers: Peers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -125,6 +162,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result wal_seg_size: oldstate.server.wal_seg_size, }, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, @@ -146,6 +185,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, @@ -167,12 +208,37 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); + // migrate to having timeline_start_lsn + } else if version == 4 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV4::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.tenant_id, + timeline_id: oldstate.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.peer_horizon_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), + }); } bail!("unsupported safekeeper control file version {}", version) } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index fab8724430..e731db5617 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use etcd_broker::SkTimelineInfo; use hyper::{Body, Request, Response, StatusCode}; use serde::Serialize; @@ -5,7 +6,6 @@ use serde::Serializer; use std::fmt::Display; use std::sync::Arc; -use crate::broker::SafekeeperInfo; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -69,6 +69,10 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + timeline_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + local_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] s3_wal_lsn: Lsn, @@ -102,6 +106,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result Result<()> { let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 1, // current protocol + protocol_version: 2, // current protocol pg_version: 0, // unknown proposer_id: [0u8; 16], system_id: 0, @@ -124,6 +124,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L term, start_streaming_at: lsn, term_history: history, + timeline_start_lsn: Lsn(0), }); spg.timeline.get().process_msg(&proposer_elected_request)?; @@ -238,13 +239,13 @@ fn 
encode_logical_message(prefix: &str, message: &str) -> Vec { xl_crc: 0, // crc will be calculated later }; - let header_bytes = header.encode(); + let header_bytes = header.encode().expect("failed to encode header"); let crc = crc32c_append(0, &data); let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); header.xl_crc = crc; let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode()); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); wal.extend_from_slice(&data); // WAL start position must be aligned at 8 bytes, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 6509e8166a..f74e5be992 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -27,6 +27,7 @@ pub mod defaults { pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + pub const DEFAULT_NEON_BROKER_PREFIX: &str = "neon"; pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); @@ -51,6 +52,7 @@ pub struct SafeKeeperConf { pub recall_period: Duration, pub my_id: ZNodeId, pub broker_endpoints: Option>, + pub broker_etcd_prefix: String, } impl SafeKeeperConf { @@ -76,6 +78,7 @@ impl Default for SafeKeeperConf { recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: ZNodeId(0), broker_endpoints: None, + broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), } } } diff --git a/safekeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs index c796f53615..2851c0b8a0 100644 --- a/safekeeper/src/s3_offload.rs +++ b/safekeeper/src/s3_offload.rs @@ -1,20 +1,23 @@ // // Offload old WAL segments to S3 and remove them locally +// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set +// if no IAM bucket access is used. 
// -use anyhow::Context; +use anyhow::{bail, Context}; use postgres_ffi::xlog_utils::*; -use rusoto_core::credential::StaticProvider; -use rusoto_core::{HttpClient, Region}; -use rusoto_s3::{ListObjectsV2Request, PutObjectRequest, S3Client, StreamingBody, S3}; +use remote_storage::{ + GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey, +}; use std::collections::HashSet; use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::Path; use std::time::SystemTime; use tokio::fs::{self, File}; +use tokio::io::BufReader; use tokio::runtime; use tokio::time::sleep; -use tokio_util::io::ReaderStream; use tracing::*; use walkdir::WalkDir; @@ -39,9 +42,8 @@ pub fn thread_main(conf: SafeKeeperConf) { } async fn offload_files( - client: &S3Client, - bucket_name: &str, - listing: &HashSet, + remote_storage: &S3Bucket, + listing: &HashSet, dir_path: &Path, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -55,17 +57,12 @@ async fn offload_files( && IsXLogFileName(entry.file_name().to_str().unwrap()) && entry.metadata().unwrap().created().unwrap() <= horizon { - let relpath = path.strip_prefix(&conf.workdir).unwrap(); - let s3path = String::from("walarchive/") + relpath.to_str().unwrap(); - if !listing.contains(&s3path) { + let remote_path = remote_storage.remote_object_id(path)?; + if !listing.contains(&remote_path) { let file = File::open(&path).await?; - client - .put_object(PutObjectRequest { - body: Some(StreamingBody::new(ReaderStream::new(file))), - bucket: bucket_name.to_string(), - key: s3path, - ..PutObjectRequest::default() - }) + let file_length = file.metadata().await?.len() as usize; + remote_storage + .upload(BufReader::new(file), file_length, &remote_path, None) .await?; fs::remove_file(&path).await?; @@ -77,58 +74,34 @@ async fn offload_files( } async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let region = Region::Custom { - name: env::var("S3_REGION").context("S3_REGION env var is not set")?, - endpoint: env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?, + let remote_storage = match GenericRemoteStorage::new( + conf.workdir.clone(), + &RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), + max_sync_errors: NonZeroU32::new(1).unwrap(), + storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { + bucket_name: "zenith-testbucket".to_string(), + bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, + prefix_in_bucket: Some("walarchive/".to_string()), + endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), + concurrency_limit: NonZeroUsize::new(20).unwrap(), + }), + }, + )? { + GenericRemoteStorage::Local(_) => { + bail!("Unexpected: got local storage for the remote config") + } + GenericRemoteStorage::S3(remote_storage) => remote_storage, }; - let client = S3Client::new_with( - HttpClient::new().context("Failed to create S3 http client")?, - StaticProvider::new_minimal( - env::var("S3_ACCESSKEY").context("S3_ACCESSKEY env var is not set")?, - env::var("S3_SECRET").context("S3_SECRET env var is not set")?, - ), - region, - ); - - let bucket_name = "zenith-testbucket"; - loop { - let listing = gather_wal_entries(&client, bucket_name).await?; - let n = offload_files(&client, bucket_name, &listing, &conf.workdir, conf).await?; - info!("Offload {} files to S3", n); + let listing = remote_storage + .list() + .await? 
+ .into_iter() + .collect::>(); + let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; + info!("Offload {n} files to S3"); sleep(conf.ttl.unwrap()).await; } } - -async fn gather_wal_entries( - client: &S3Client, - bucket_name: &str, -) -> anyhow::Result> { - let mut document_keys = HashSet::new(); - - let mut continuation_token = None::; - loop { - let response = client - .list_objects_v2(ListObjectsV2Request { - bucket: bucket_name.to_string(), - prefix: Some("walarchive/".to_string()), - continuation_token, - ..ListObjectsV2Request::default() - }) - .await?; - document_keys.extend( - response - .contents - .unwrap_or_default() - .into_iter() - .filter_map(|o| o.key), - ); - - continuation_token = response.continuation_token; - if continuation_token.is_none() { - break; - } - } - Ok(document_keys) -} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 048753152b..b9264565dc 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,6 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use etcd_broker::SkTimelineInfo; use postgres_ffi::xlog_utils::TimeLineID; use postgres_ffi::xlog_utils::XLogSegNo; @@ -16,7 +17,6 @@ use tracing::*; use lazy_static::lazy_static; -use crate::broker::SafekeeperInfo; use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; @@ -30,8 +30,8 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 4; -const SK_PROTOCOL_VERSION: u32 = 1; +pub const SK_FORMAT_VERSION: u32 = 5; +const SK_PROTOCOL_VERSION: u32 = 2; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. @@ -52,7 +52,7 @@ impl TermHistory { } // Parse TermHistory as n_entries followed by TermSwitchEntry pairs - pub fn from_bytes(mut bytes: Bytes) -> Result { + pub fn from_bytes(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -183,6 +183,13 @@ pub struct SafeKeeperState { /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, /// Part of WAL acknowledged by quorum and available locally. Always points /// to record boundary. pub commit_lsn: Lsn, @@ -231,6 +238,8 @@ impl SafeKeeperState { wal_seg_size: 0, }, proposer_uuid: [0; 16], + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: Lsn(0), s3_wal_lsn: Lsn(0), peer_horizon_lsn: Lsn(0), @@ -268,6 +277,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, + node_id: ZNodeId, } /// Vote request sent from proposer to safekeepers @@ -286,6 +296,7 @@ pub struct VoteResponse { flush_lsn: Lsn, truncate_lsn: Lsn, term_history: TermHistory, + timeline_start_lsn: Lsn, } /* @@ -297,6 +308,7 @@ pub struct ProposerElected { pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, + pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. 
Along the way it @@ -387,10 +399,15 @@ impl ProposerAcceptorMessage { } let term = msg_bytes.get_u64_le(); let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(msg_bytes)?; + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let timeline_start_lsn = msg_bytes.get_u64_le().into(); let msg = ProposerElected { term, start_streaming_at, + timeline_start_lsn, term_history, }; Ok(ProposerAcceptorMessage::Elected(msg)) @@ -437,6 +454,7 @@ impl AcceptorProposerMessage { AcceptorProposerMessage::Greeting(msg) => { buf.put_u64_le('g' as u64); buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); } AcceptorProposerMessage::VoteResponse(msg) => { buf.put_u64_le('v' as u64); @@ -449,6 +467,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(e.term); buf.put_u64_le(e.lsn.into()); } + buf.put_u64_le(msg.timeline_start_lsn.into()); } AcceptorProposerMessage::AppendResponse(msg) => { buf.put_u64_le('a' as u64); @@ -511,6 +530,8 @@ pub struct SafeKeeper { pub state: CTRL, // persistent state storage pub wal_store: WAL, + + node_id: ZNodeId, // safekeeper's node id } impl SafeKeeper @@ -523,6 +544,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, + node_id: ZNodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -544,6 +566,7 @@ where }, state, wal_store, + node_id, }) } @@ -635,6 +658,7 @@ where ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { term: self.state.acceptor_state.term, + node_id: self.node_id, }))) } @@ -650,6 +674,7 @@ where flush_lsn: self.wal_store.flush_lsn(), truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), + timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.clone(); @@ -705,6 +730,23 @@ where // and now adopt term history from proposer { let mut state = self.state.clone(); + + // Remember point where WAL begins globally, if not yet. + if state.timeline_start_lsn == Lsn(0) { + state.timeline_start_lsn = msg.timeline_start_lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } + + // Remember point where WAL begins locally, if not yet. (I doubt the + // second condition is ever possible) + if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { + state.local_start_lsn = msg.start_streaming_at; + info!("setting local_start_lsn to {:?}", state.local_start_lsn); + } + + state.acceptor_state.term_history = msg.term_history.clone(); self.state.persist(&state)?; } @@ -844,7 +886,7 @@ where } /// Update timeline state with peer safekeeper data.
- pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperInfo) -> Result<()> { + pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { let mut sync_control_file = false; if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) { @@ -896,9 +938,7 @@ where ), self.state.s3_wal_lsn, ); - let res = horizon_lsn.segment_number(self.state.server.wal_seg_size as usize); - info!("horizon is {}, res {}", horizon_lsn, res); - res + horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } @@ -968,7 +1008,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -983,7 +1023,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store).unwrap(); + sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1000,7 +1040,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, @@ -1023,6 +1063,7 @@ mod tests { term: 1, lsn: Lsn(3), }]), + timeline_start_lsn: Lsn(0), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .unwrap(); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4a507015d3..140d6660ac 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,6 +3,7 @@ use anyhow::{bail, Context, Result}; +use etcd_broker::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; @@ -21,7 +22,6 @@ use utils::{ zid::{ZNodeId, ZTenantTimelineId}, }; -use crate::broker::SafekeeperInfo; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; @@ -89,6 +89,7 @@ struct SharedState { active: bool, num_computes: u32, pageserver_connstr: Option, + listen_pg_addr: String, last_removed_segno: XLogSegNo, } @@ -102,7 +103,7 @@ impl SharedState { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?; + let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; Ok(Self { notified_commit_lsn: Lsn(0), @@ -111,6 +112,7 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, + listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -125,11 +127,12 @@ impl SharedState { Ok(Self { notified_commit_lsn: Lsn(0), - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?, + sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), active: false, num_computes: 0, pageserver_connstr: None, + listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -418,9 +421,9 @@ impl Timeline { } /// Prepare public safekeeper info for reporting. 
- pub fn get_public_info(&self) -> SafekeeperInfo { + pub fn get_public_info(&self) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); - SafekeeperInfo { + Ok(SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost @@ -432,11 +435,23 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - } + wal_stream_connection_string: shared_state + .pageserver_connstr + .as_deref() + .map(|pageserver_connstr| { + wal_stream_connection_string( + self.zttid, + &shared_state.listen_pg_addr, + pageserver_connstr, + ) + }) + .transpose() + .context("Failed to get the pageserver callmemaybe connstr")?, + }) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SafekeeperInfo, _sk_id: ZNodeId) -> Result<()> { + pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.sk.record_safekeeper_info(sk_info)?; self.notify_wal_senders(&mut shared_state); @@ -469,6 +484,10 @@ impl Timeline { let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet, no WAL exists. + if shared_state.sk.state.server.wal_seg_size == 0 { + return Ok(()); + } horizon_segno = shared_state.sk.get_horizon_segno(); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { @@ -485,6 +504,29 @@ impl Timeline { } } +// pageserver connstr is needed to be able to distinguish between different pageservers +// it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved +// TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 +fn wal_stream_connection_string( + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + listen_pg_addr_str: &str, + pageserver_connstr: &str, +) -> anyhow::Result { + let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); + let me_conf = me_connstr + .parse::() + .with_context(|| { + format!("Failed to parse pageserver connection string '{me_connstr}' as a postgres one") + })?; + let (host, port) = utils::connstring::connection_host_port(&me_conf); + Ok(format!( + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", + )) +} + // Utilities needed by various Connection-like objects pub trait TimelineTools { fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index aeb45348ad..d6b073492d 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,11 +1,9 @@ -import subprocess -import asyncio from contextlib import closing import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverApiException # @@ -33,6 +31,10 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): 'compaction_target_size': 
'4194304', }) + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: + pscur.execute("failpoints flush-frozen=sleep(10000)") + env.zenith_cli.create_timeline(f'main', tenant_id=tenant) pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() @@ -116,3 +118,17 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) + + +def test_ancestor_branch_detach(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + parent_timeline_id = env.zenith_cli.create_branch("test_ancestor_branch_detach_parent", "empty") + + env.zenith_cli.create_branch("test_ancestor_branch_detach_branch1", + "test_ancestor_branch_detach_parent") + + ps_http = env.pageserver.http_client() + with pytest.raises(ZenithPageserverApiException, + match="Failed to detach inmem tenant timeline"): + ps_http.timeline_detach(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 88937fa0dc..24898be70a 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -32,7 +32,16 @@ def test_createdb(zenith_simple_env: ZenithEnv): # Test that you can connect to the new database on both branches for db in (pg, pg2): - db.connect(dbname='foodb').close() + with closing(db.connect(dbname='foodb')) as conn: + with conn.cursor() as cur: + # Check database size in both branches + cur.execute( + 'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;', + ('foodb', )) + res = cur.fetchone() + # check that dbsize equals sum of all relation sizes, excluding shared ones + # This is how we define dbsize in zenith for now + assert res[0] == res[1] # diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py new file mode 100644 index 0000000000..37113b46f2 --- /dev/null +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -0,0 +1,84 @@ +from contextlib import closing +from datetime import timedelta, timezone, tzinfo +import math +from uuid import UUID +import psycopg2.extras +import psycopg2.errors +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.log_helper import log +import time + + +# +# Test pageserver get_lsn_by_timestamp API +# +def test_lsn_mapping(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 1 + env = zenith_env_builder.init_start() + + new_timeline_id = env.zenith_cli.create_branch('test_lsn_mapping') + pgmain = env.postgres.create_start("test_lsn_mapping") + log.info("postgres is running on 'test_lsn_mapping' branch") + + ps_conn = env.pageserver.connect() + ps_cur = ps_conn.cursor() + conn = pgmain.connect() + cur = conn.cursor() + + # Create table, and insert rows, each in a separate transaction + # Disable synchronous_commit to make this initialization go faster. + # + # Each row contains current insert LSN and the current timestamp, when + # the row was inserted. 
+ cur.execute("SET synchronous_commit=off") + cur.execute("CREATE TABLE foo (x integer)") + tbl = [] + for i in range(1000): + cur.execute(f"INSERT INTO foo VALUES({i})") + cur.execute(f'SELECT clock_timestamp()') + # Get the timestamp at UTC + after_timestamp = cur.fetchone()[0].replace(tzinfo=None) + tbl.append([i, after_timestamp]) + + # Execute one more transaction with synchronous_commit enabled, to flush + # all the previous transactions + cur.execute("SET synchronous_commit=on") + cur.execute("INSERT INTO foo VALUES (-1)") + + # Check edge cases: timestamp in the future + probe_timestamp = tbl[-1][1] + timedelta(hours=1) + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + result = ps_cur.fetchone()[0] + assert result == 'future' + + # timestamp too the far history + probe_timestamp = tbl[0][1] - timedelta(hours=10) + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + result = ps_cur.fetchone()[0] + assert result == 'past' + + # Probe a bunch of timestamps in the valid range + for i in range(1, len(tbl), 100): + probe_timestamp = tbl[i][1] + + # Call get_lsn_by_timestamp to get the LSN + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + lsn = ps_cur.fetchone()[0] + + # Launch a new read-only node at that LSN, and check that only the rows + # that were supposed to be committed at that point in time are visible. + pg_here = env.postgres.create_start(branch_name='test_lsn_mapping', + node_name='test_lsn_mapping_read', + lsn=lsn) + with closing(pg_here.connect()) as conn_here: + with conn_here.cursor() as cur_here: + cur_here.execute("SELECT max(x) FROM foo") + assert cur_here.fetchone()[0] == i + + pg_here.stop_and_destroy() diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py new file mode 100644 index 0000000000..dbfa943a7a --- /dev/null +++ b/test_runner/batch_others/test_recovery.py @@ -0,0 +1,64 @@ +import os +import time +import psycopg2.extras +import json +from ast import Assert +from contextlib import closing +from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log + + +# +# Test pageserver recovery after crash +# +def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 1 + # Override default checkpointer settings to run it more often + zenith_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" + + env = zenith_env_builder.init() + + # Check if failpoints enables. 
Otherwise the test doesn't make sense + f = env.zenith_cli.pageserver_enabled_features() + + assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" + zenith_env_builder.start() + + # Create a branch for us + env.zenith_cli.create_branch("test_pageserver_recovery", "main") + + pg = env.postgres.create_start('test_pageserver_recovery') + log.info("postgres is running on 'test_pageserver_recovery' branch") + + connstr = pg.connstr() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + + # Sleep for some time to let checkpoint create image layers + time.sleep(2) + + # Configure failpoints + pscur.execute( + "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic") + + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") + + log.info("Wait before server restart") + env.pageserver.stop() + env.pageserver.start() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + assert cur.fetchone() == (100000, ) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 59a9cfa378..e205f79957 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -117,7 +117,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) == current_lsn + assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should not be less than the one stored on remote storage' assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 41907adf1a..7e71c0a157 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -109,10 +109,11 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - env.zenith_cli.create_root_branch('main', tenant_id=tenant) - env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start(branch_name='main', + # attach does not download ancestor branches (should it?), just use root branch for now + env.zenith_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) + + tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation', node_name='test_tenant_relocation', tenant_id=tenant) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 94059e2a4c..702c27a79b 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -573,7 +573,9 @@ def
test_timeline_status(zenith_env_builder: ZenithEnvBuilder): timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] # fetch something sensible from status - epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch = tli_status.acceptor_epoch + timeline_start_lsn = tli_status.timeline_start_lsn pg.safe_psql("create table t(i int)") @@ -581,9 +583,13 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): pg.stop().start() pg.safe_psql("insert into t values(10)") - epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch_after_reboot = tli_status.acceptor_epoch assert epoch_after_reboot > epoch + # and timeline_start_lsn stays the same + assert tli_status.timeline_start_lsn == timeline_start_lsn + class SafekeeperEnv: def __init__(self, diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 93912d2da7..d70f57aa52 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -130,7 +130,10 @@ class VanillaCompare(PgCompare): def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark - vanilla_pg.configure(['shared_buffers=1MB']) + vanilla_pg.configure([ + 'shared_buffers=1MB', + 'synchronous_commit=off', + ]) vanilla_pg.start() # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index e16d1acf2f..3bb7c606d3 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -472,20 +472,16 @@ class ZenithEnvBuilder: mock_endpoint = self.s3_mock_server.endpoint() mock_region = self.s3_mock_server.region() - mock_access_key = self.s3_mock_server.access_key() - mock_secret_key = self.s3_mock_server.secret_key() boto3.client( 's3', endpoint_url=mock_endpoint, region_name=mock_region, - aws_access_key_id=mock_access_key, - aws_secret_access_key=mock_secret_key, + aws_access_key_id=self.s3_mock_server.access_key(), + aws_secret_access_key=self.s3_mock_server.secret_key(), ).create_bucket(Bucket=bucket_name) self.pageserver_remote_storage = S3Storage(bucket=bucket_name, endpoint=mock_endpoint, - region=mock_region, - access_key=mock_access_key, - secret_key=mock_secret_key) + region=mock_region) def __enter__(self): return self @@ -811,8 +807,6 @@ class LocalFsStorage: class S3Storage: bucket: str region: str - access_key: Optional[str] - secret_key: Optional[str] endpoint: Optional[str] @@ -980,12 +974,32 @@ class ZenithCli: res.check_returncode() return res + def pageserver_enabled_features(self) -> Any: + bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + args = [bin_pageserver, '--enabled-features'] + log.info('Running command "{}"'.format(' '.join(args))) + + res = subprocess.run(args, + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + log.info(f"pageserver_enabled_features success: {res.stdout}") + return json.loads(res.stdout) + def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] append_pageserver_param_overrides(start_args, self.env.pageserver.remote_storage, self.env.pageserver.config_override) - return self.raw_cli(start_args) + + s3_env_vars = None + if self.env.s3_mock_server: 
+ s3_env_vars = { + 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), + } + return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': cmd = ['pageserver', 'stop'] @@ -1080,6 +1094,7 @@ class ZenithCli: def raw_cli(self, arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True) -> 'subprocess.CompletedProcess[str]': """ Run "zenith" with the specified arguments. @@ -1095,7 +1110,7 @@ class ZenithCli: assert type(arguments) == list - bin_zenith = os.path.join(str(zenith_binpath), 'zenith') + bin_zenith = os.path.join(str(zenith_binpath), 'neon_local') args = [bin_zenith] + arguments log.info('Running command "{}"'.format(' '.join(args))) @@ -1104,9 +1119,10 @@ class ZenithCli: env_vars = os.environ.copy() env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir) env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) - if self.env.rust_log_override is not None: env_vars['RUST_LOG'] = self.env.rust_log_override + for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value # Pass coverage settings var = 'LLVM_PROFILE_FILE' @@ -1204,10 +1220,6 @@ def append_pageserver_param_overrides( pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\ bucket_region='{pageserver_remote_storage.region}'" - if pageserver_remote_storage.access_key is not None: - pageserver_storage_override += f",access_key_id='{pageserver_remote_storage.access_key}'" - if pageserver_remote_storage.secret_key is not None: - pageserver_storage_override += f",secret_access_key='{pageserver_remote_storage.secret_key}'" if pageserver_remote_storage.endpoint is not None: pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'" @@ -1302,7 +1314,7 @@ class VanillaPostgres(PgProtocol): """Append lines into postgresql.conf file.""" assert not self.running with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: - conf_file.writelines(options) + conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): assert not self.running @@ -1382,8 +1394,8 @@ def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: class ZenithProxy(PgProtocol): def __init__(self, port: int): super().__init__(host="127.0.0.1", - user="pytest", - password="pytest", + user="proxy_user", + password="pytest2", port=port, dbname='postgres') self.http_port = 7001 @@ -1399,8 +1411,8 @@ class ZenithProxy(PgProtocol): args = [bin_proxy] args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-method", "password"]) - args.extend(["--static-router", addr]) + args.extend(["--auth-backend", "postgres"]) + args.extend(["--auth-endpoint", "postgres://proxy_auth:pytest1@localhost:5432/postgres"]) self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1422,7 +1434,8 @@ class ZenithProxy(PgProtocol): def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: """Zenith proxy that routes directly to vanilla postgres.""" vanilla_pg.start() - vanilla_pg.safe_psql("create user pytest with password 'pytest';") + vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") + vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") with ZenithProxy(4432) as proxy: proxy.start_static() @@ -1571,6 +1584,7 @@ class 
Postgres(PgProtocol): assert self.node_name is not None self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None + self.running = False return self @@ -1747,6 +1761,7 @@ class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str remote_consistent_lsn: str + timeline_start_lsn: str @dataclass @@ -1771,7 +1786,8 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn']) + remote_consistent_lsn=resj['remote_consistent_lsn'], + timeline_start_lsn=resj['timeline_start_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md new file mode 100644 index 0000000000..776565b679 --- /dev/null +++ b/test_runner/performance/README.md @@ -0,0 +1,23 @@ +# What performance tests do we have and how we run them + +Performance tests are built using the same infrastructure as our usual Python integration tests. There are some extra fixtures that help to collect performance metrics and to run tests against both vanilla PostgreSQL and Neon for comparison. + +## Tests that are run against local installation + +Most of the performance tests run against a local installation. This is not very representative of a production environment. Firstly, Postgres, safekeeper(s) and the pageserver have to share CPU and I/O resources, which can add noise to the results. Secondly, network overhead is eliminated. + +In the CI, the performance tests are run in the same environment as the other integration tests. We don't have control over the host that the CI runs on, so the environment may vary widely from one run to another, which makes the results across different runs noisy to compare. + +## Remote tests + +There are a few tests that are marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment; instead, they require a libpq connection string to connect to, so they can be run against any Postgres-compatible database. Currently, the CI runs these tests against our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to activity of other clusters. (A minimal sketch of such a test is included at the end of this patch.) + +## Noise + +Each test currently runs only once. To obtain more consistent performance numbers, a test should be repeated multiple times and the results aggregated, for example by taking the min, max, average, or median. + +## Results collection + +Local test results for the main branch, and results of the daily performance tests, are stored in a Neon project deployed in the production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is that it cannot point at a particular commit, even though the data for that is available in the database; it needs some tweaking from someone who knows Grafana well. + +There is also an inconsistency in test naming. The test name should be the same across platforms, with results differentiated by the platform field. But currently, the platform is sometimes included in the test name because of the way parametrization works in pytest: the dashboard has a platform switch with zenith-local-ci and zenith-staging variants, and yet
some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py new file mode 100644 index 0000000000..2042b0d548 --- /dev/null +++ b/test_runner/performance/test_hot_page.py @@ -0,0 +1,36 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_hot_page(env: PgCompare): + # Update the same page many times, then measure read performance + num_writes = 1000000 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + + # Write many updates to the same row + with env.record_duration('write'): + cur.execute('create table t (i integer);') + cur.execute('insert into t values (0);') + for i in range(num_writes): + cur.execute(f'update t set i = {i};') + + # Write 3-4 MB to evict t from compute cache + cur.execute('create table f (i integer);') + cur.execute(f'insert into f values (generate_series(1,100000));') + + # Read + with env.record_duration('read'): + cur.execute('select * from t;') + cur.fetchall() diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py new file mode 100644 index 0000000000..11e047b8c3 --- /dev/null +++ b/test_runner/performance/test_hot_table.py @@ -0,0 +1,35 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_hot_table(env: PgCompare): + # Update a small table many times, then measure read performance + num_rows = 100000 # Slightly larger than shared buffers size TODO validate + num_writes = 1000000 + num_reads = 10 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + + # Write many updates to a small table + with env.record_duration('write'): + cur.execute('create table t (i integer primary key);') + cur.execute(f'insert into t values (generate_series(1,{num_rows}));') + for i in range(num_writes): + cur.execute(f'update t set i = {i + num_rows} WHERE i = {i};') + + # Read the table + with env.record_duration('read'): + for i in range(num_reads): + cur.execute('select * from t;') + cur.fetchall() diff --git a/vendor/postgres b/vendor/postgres index a13fe64a3e..9a9459a7f9 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit a13fe64a3eff1743ff17141a2e6057f5103829f0 +Subproject commit 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e diff --git a/workspace_hack/Cargo.toml 
b/workspace_hack/Cargo.toml index f178b5b766..92877faef7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -14,29 +14,40 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } +fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } +futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } +futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hex = { version = "0.4", features = ["alloc", "serde", "std"] } +hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } +itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } num-integer = { version = "0.1", default-features = false, features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "std"] } -prost = { version = "0.9", features = ["prost-derive", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tokio-util = { version = "0.7", features = ["codec", "io"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std"] } [build-dependencies] +ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", 
"vec_map"] } @@ -46,7 +57,7 @@ indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -prost = { version = "0.9", features = ["prost-derive", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }