Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-13 23:50:36 +00:00

Compare commits: `fixing-dur` ... `bojan-repl` (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | d67fb28a59 | |
| | faa15e32ee | |
| | 8dbd6313e5 | |
@@ -7,7 +7,7 @@ RELEASE=${RELEASE:-false}
# look at docker hub for latest tag for neon docker image
if [ "${RELEASE}" = "true" ]; then
echo "search latest relase tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
fi
else
echo "search latest dev tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1

@@ -15,4 +15,3 @@ console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
safekeeper_enable_s3_offload = true
|
||||
|
||||
@@ -4,8 +4,8 @@ zenith-us-stage-ps-2 console_region_id=27
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-2 console_region_id=27
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
zenith-us-stage-sk-5 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
@@ -16,4 +16,3 @@ console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

@@ -584,7 +584,7 @@ jobs:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade zenith-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-release:
|
||||
|
||||
@@ -6,8 +6,8 @@ image:

settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
authEndpoint: "http://console-staging.local:9095/management/api/v2"

# -- Additional labels for zenith-proxy pods
podLabels:
@@ -21,7 +20,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
external-dns.alpha.kubernetes.io/hostname: *.cloud.stage.neon.tech

metrics:
enabled: true

323 Cargo.lock generated
@@ -113,49 +113,6 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core",
|
||||
"bitflags",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"itoa 1.0.1",
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"mime",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.64"
|
||||
@@ -363,15 +320,6 @@ dependencies = [
|
||||
"textwrap 0.14.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cmake"
|
||||
version = "0.1.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "combine"
|
||||
version = "4.6.3"
|
||||
@@ -382,18 +330,6 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "comfy-table"
|
||||
version = "5.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e"
|
||||
dependencies = [
|
||||
"crossterm",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
@@ -590,31 +526,6 @@ dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
version = "0.23.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"crossterm_winapi",
|
||||
"libc",
|
||||
"mio",
|
||||
"parking_lot 0.12.0",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm_winapi"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crypto-common"
|
||||
version = "0.1.3"
|
||||
@@ -782,9 +693,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "etcd-client"
|
||||
version = "0.9.1"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954"
|
||||
checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118"
|
||||
dependencies = [
|
||||
"http",
|
||||
"prost",
|
||||
@@ -792,26 +703,9 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tower",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"etcd-client",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fail"
|
||||
version = "0.5.0"
|
||||
@@ -1096,12 +990,6 @@ dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
@@ -1167,12 +1055,6 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-range-header"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29"
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.6.0"
|
||||
@@ -1438,12 +1320,6 @@ version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb"
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
version = "0.9.1"
|
||||
@@ -1574,24 +1450,6 @@ dependencies = [
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neon_local"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 3.0.14",
|
||||
"comfy-table",
|
||||
"control_plane",
|
||||
"git-version",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
"safekeeper",
|
||||
"serde_json",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.1"
|
||||
@@ -1764,6 +1622,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -1774,7 +1633,6 @@ dependencies = [
|
||||
"daemonize",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
@@ -1792,7 +1650,8 @@ dependencies = [
|
||||
"pprof",
|
||||
"rand",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"rusoto_core",
|
||||
"rusoto_s3",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1804,6 +1663,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util 0.7.0",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -2062,16 +1922,6 @@ version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.19"
|
||||
@@ -2103,9 +1953,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.10.3"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f"
|
||||
checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive",
|
||||
@@ -2113,14 +1963,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prost-build"
|
||||
version = "0.10.3"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846"
|
||||
checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"cfg-if",
|
||||
"cmake",
|
||||
"heck 0.4.0",
|
||||
"heck",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
@@ -2135,9 +1983,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.10.1"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc"
|
||||
checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools",
|
||||
@@ -2148,9 +1996,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.10.1"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68"
|
||||
checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost",
|
||||
@@ -2166,7 +2014,6 @@ dependencies = [
|
||||
"bytes",
|
||||
"clap 3.0.14",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown",
|
||||
"hex",
|
||||
"hmac 0.12.1",
|
||||
@@ -2323,9 +2170,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.5.5"
|
||||
version = "1.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
|
||||
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@@ -2347,23 +2194,6 @@ version = "0.6.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
|
||||
[[package]]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"rusoto_core",
|
||||
"rusoto_s3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-util 0.7.0",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remove_dir_all"
|
||||
version = "0.5.3"
|
||||
@@ -2463,9 +2293,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rusoto_core"
|
||||
version = "0.48.0"
|
||||
version = "0.47.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2"
|
||||
checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64",
|
||||
@@ -2488,9 +2318,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rusoto_credential"
|
||||
version = "0.48.0"
|
||||
version = "0.47.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05"
|
||||
checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"chrono",
|
||||
@@ -2506,9 +2336,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rusoto_s3"
|
||||
version = "0.48.0"
|
||||
version = "0.47.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d"
|
||||
checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -2519,9 +2349,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rusoto_signature"
|
||||
version = "0.48.0"
|
||||
version = "0.47.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272"
|
||||
checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
@@ -2617,9 +2447,8 @@ dependencies = [
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"etcd_broker",
|
||||
"etcd-client",
|
||||
"fs2",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
@@ -2629,7 +2458,8 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"rusoto_core",
|
||||
"rusoto_s3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
@@ -2834,17 +2664,6 @@ dependencies = [
|
||||
"signal-hook-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-mio"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"mio",
|
||||
"signal-hook",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.0"
|
||||
@@ -2934,25 +2753,6 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.23.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.23.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
|
||||
dependencies = [
|
||||
"heck 0.3.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.4.1"
|
||||
@@ -2984,21 +2784,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.92"
|
||||
version = "1.0.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52"
|
||||
checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sync_wrapper"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.38"
|
||||
@@ -3292,13 +3086,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.7.2"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb"
|
||||
checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum",
|
||||
"base64",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
@@ -3314,7 +3107,7 @@ dependencies = [
|
||||
"prost-derive",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util 0.7.0",
|
||||
"tokio-util 0.6.9",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -3324,11 +3117,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.7.2"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1"
|
||||
checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757"
|
||||
dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build",
|
||||
"quote",
|
||||
@@ -3355,25 +3147,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-http"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-range-header",
|
||||
"pin-project-lite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.1"
|
||||
@@ -3815,22 +3588,13 @@ dependencies = [
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 2.34.0",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"hashbrown",
|
||||
"hex",
|
||||
"hyper",
|
||||
"indexmap",
|
||||
"itoa 0.4.8",
|
||||
"libc",
|
||||
"log",
|
||||
"memchr",
|
||||
@@ -3844,7 +3608,6 @@ dependencies = [
|
||||
"serde",
|
||||
"syn",
|
||||
"tokio",
|
||||
"tokio-util 0.7.0",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
]
|
||||
@@ -3873,6 +3636,22 @@ dependencies = [
|
||||
"chrono",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zenith"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 3.0.14",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
"safekeeper",
|
||||
"serde_json",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.5.2"
|
||||
|
||||
@@ -6,7 +6,7 @@ members = [
"proxy",
"safekeeper",
"workspace_hack",
"neon_local",
"zenith",
"libs/*",
]

63 README.md
@@ -49,30 +49,32 @@ make -j5
|
||||
```sh
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
|
||||
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
|
||||
> ./target/debug/zenith init
|
||||
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
|
||||
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
|
||||
created main branch
|
||||
pageserver init succeeded
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at 'localhost:64000' in '.zenith'
|
||||
Pageserver started
|
||||
initializing for sk 1 for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
||||
initializing for single for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
|
||||
Safekeeper started
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
|
||||
```
|
||||
|
||||
4. Now it is possible to connect to postgres and run some queries:
|
||||
@@ -92,25 +94,18 @@ postgres=# select * from t;
|
||||
5. And create branches and run postgres on them:
|
||||
```sh
|
||||
# create branch named migration_check
|
||||
> ./target/debug/neon_local timeline branch --branch-name migration_check
|
||||
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
|
||||
> ./target/debug/zenith timeline branch --branch-name migration_check
|
||||
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
|
||||
|
||||
# check branches tree
|
||||
> ./target/debug/neon_local timeline list
|
||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||
> ./target/debug/zenith timeline list
|
||||
main [5b014a9e41b4b63ce1a1febc04503636]
|
||||
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
|
||||
|
||||
# start postgres on that branch
|
||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
|
||||
|
||||
# check the new list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
|
||||
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
> ./target/debug/zenith pg start migration_check
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
|
||||
waiting for server to start.... done
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
@@ -123,20 +118,12 @@ postgres=# select * from t;
|
||||
|
||||
postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
|
||||
# check that the new change doesn't affect the 'main' postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
6. If you want to run tests afterwards (see below), you have to stop all the running pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
|
||||
```sh
|
||||
> ./target/debug/neon_local stop
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
@@ -136,20 +136,13 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
} else {
|
||||
info!("role name: '{}'", &name);
|
||||
info!("role name {}", &name);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.quote());
|
||||
info!("role create query: '{}'", &query);
|
||||
info!("role create query {}", &query);
|
||||
info_print!(" -> create");
|
||||
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
||||
name.quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
|
||||
info_print!("\n");
|
||||
|
||||
@@ -63,10 +63,6 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub broker_endpoints: Option<String>,
|
||||
|
||||
/// A prefix to all to any key when pushing/polling etcd from a node.
|
||||
#[serde(default)]
|
||||
pub broker_etcd_prefix: Option<String>,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
#[serde(default)]
|
||||
|
||||
@@ -77,7 +77,6 @@ pub struct SafekeeperNode {
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
|
||||
broker_endpoints: Option<String>,
|
||||
broker_etcd_prefix: Option<String>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
@@ -95,7 +94,6 @@ impl SafekeeperNode {
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
broker_endpoints: env.broker_endpoints.clone(),
|
||||
broker_etcd_prefix: env.broker_etcd_prefix.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,9 +143,6 @@ impl SafekeeperNode {
|
||||
if let Some(ref ep) = self.broker_endpoints {
|
||||
cmd.args(&["--broker-endpoints", ep]);
|
||||
}
|
||||
if let Some(prefix) = self.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
|
||||
@@ -167,9 +167,6 @@ impl PageServerNode {
|
||||
);
|
||||
}
|
||||
|
||||
// echo the captured output of the init command
|
||||
println!("{}", String::from_utf8_lossy(&init_output.stdout));
|
||||
|
||||
Ok(initial_timeline_id)
|
||||
}
|
||||
|
||||
@@ -189,6 +186,8 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
|
||||
let repo_path = self.repo_path();
|
||||
let mut args = vec!["-D", repo_path.to_str().unwrap()];
|
||||
|
||||
@@ -196,11 +195,9 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
@@ -460,12 +457,3 @@ impl PageServerNode {
|
||||
Ok(timeline_info_response)
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
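The helper removed in the hunk above, `fill_aws_secrets_vars`, forwards `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` from the parent environment into the spawned pageserver only when they are actually set, so S3 credentials never have to be written into the config file. A minimal standalone sketch of that env-forwarding pattern (binary name and arguments are placeholders, not taken from this diff):

```rust
use std::process::Command;

/// Copy selected variables from the parent environment into the child process
/// only when they are set, mirroring the `fill_aws_secrets_vars` helper shown above.
fn forward_env_vars<'a>(cmd: &'a mut Command, keys: &[&str]) -> &'a mut Command {
    for key in keys {
        if let Ok(value) = std::env::var(key) {
            cmd.env(key, value);
        }
    }
    cmd
}

fn main() -> std::io::Result<()> {
    // Hypothetical binary path and flag, used here only to show the call shape.
    let mut cmd = Command::new("pageserver");
    forward_env_vars(&mut cmd, &["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]);
    let status = cmd.arg("--daemonize").status()?;
    println!("exited with: {status}");
    Ok(())
}
```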
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
S3. WAL that has been archived to S3 can be removed from the
safekeepers, so the safekeepers don't need a lot of disk space.

```

+----------------+
+-----> | WAL safekeeper |
| +----------------+
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
\
\
\
\ +--------+
\ | |
+------> | S3 |
| |
+--------+
\ +--------+
\ | |
+--> | S3 |
| |
+--------+


```
Every WAL safekeeper holds a section of WAL, and a VCL value.
The WAL can be divided into three portions:

```

VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
```


Note that all this WAL kept in a safekeeper is a contiguous section.
This is different from Aurora: In Aurora, there can be holes in the

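One way to read the figure above is that two boundaries define the three portions: an archive horizon (everything at or below it has been offloaded to S3) and the VCL (everything at or below it is complete on the safekeeper). The following sketch encodes only that reading; the names and the plain `u64` LSN type are stand-ins for this note, not the repository's types:

```rust
/// Illustrative classification of a WAL position against the two boundaries
/// implied by the figure: archive horizon and VCL.
#[derive(Debug, PartialEq, Eq)]
enum WalPortion {
    Archived,
    Completed,
    InFlight,
}

fn classify(lsn: u64, archive_horizon: u64, vcl: u64) -> WalPortion {
    if lsn <= archive_horizon {
        WalPortion::Archived
    } else if lsn <= vcl {
        WalPortion::Completed
    } else {
        WalPortion::InFlight
    }
}

fn main() {
    // With archive horizon 100 and VCL 200, position 150 is completed but not
    // yet archived, while 250 is still in flight.
    assert_eq!(classify(150, 100, 200), WalPortion::Completed);
    assert_eq!(classify(250, 100, 200), WalPortion::InFlight);
    println!("classification sketch ok");
}
```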
@@ -6,6 +6,7 @@ If there's no such file during `init` phase of the server, it creates the file i
|
||||
There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
|
||||
the values in the config file, if any are specified for the same key and get into the final config during init phase.
|
||||
|
||||
|
||||
### Config example
|
||||
|
||||
```toml
|
||||
@@ -34,9 +35,9 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error

Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and

- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`

- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`

### Config values

@@ -56,7 +57,7 @@ but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
@@ -71,7 +72,7 @@ The unit is # of bytes.
|
||||
|
||||
Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### compaction_target_size
|
||||
|
||||
@@ -162,12 +163,16 @@ bucket_region = 'eu-north-1'
|
||||
# Optional, pageserver uses entire bucket if the prefix is not specified.
|
||||
prefix_in_bucket = '/some/prefix/'
|
||||
|
||||
# Access key to connect to the bucket ("login" part of the credentials)
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
# Secret access key to connect to the bucket ("password" part of the credentials)
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
|
||||
# S3 API query limit to avoid getting errors/throttling from AWS.
|
||||
concurrency_limit = 100
|
||||
```
|
||||
|
||||
If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials.
|
||||
|
||||
###### General remote storage configuration
|
||||
|
||||
Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
|
||||
@@ -178,12 +183,13 @@ Besides, there are parameters common for all types of remote storage that can be
```toml
[remote_storage]
# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
max_concurrent_syncs = 50
max_concurrent_timelines_sync = 50

# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
max_sync_errors = 10
```
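For illustration, these two knobs can be modelled with serde defaults mirroring the `DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS` / `DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS` constants that appear in `remote_storage/src/lib.rs` later in this diff (50 and 10). The struct below is a sketch for this note, not the pageserver's actual configuration type:

```rust
use serde::Deserialize;

// Defaults matching the constants shown further down in this diff.
fn default_max_concurrent_syncs() -> usize {
    50
}
fn default_max_sync_errors() -> u32 {
    10
}

/// Sketch of the `[remote_storage]` knobs documented above.
#[derive(Debug, Deserialize)]
struct RemoteStorageSettings {
    #[serde(default = "default_max_concurrent_syncs")]
    max_concurrent_syncs: usize,
    #[serde(default = "default_max_sync_errors")]
    max_sync_errors: u32,
}

fn main() {
    // An empty `[remote_storage]` table falls back to the documented defaults.
    let settings: RemoteStorageSettings = toml::from_str("").unwrap();
    println!(
        "max_concurrent_syncs={} max_sync_errors={}",
        settings.max_concurrent_syncs, settings.max_sync_errors
    );
}
```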

## safekeeper

TODO

@@ -1,17 +0,0 @@
|
||||
[package]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
etcd-client = "0.9.0"
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio = "1"
|
||||
tracing = "0.1"
|
||||
thiserror = "1"
|
||||
@@ -1,344 +0,0 @@
|
||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||
//! Intended to connect services to each other, not to store their data.
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
fmt::Display,
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use regex::{Captures, Regex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
pub use etcd_client::*;
|
||||
|
||||
use tokio::{sync::mpsc, task::JoinHandle};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
||||
};
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
struct SafekeeperTimeline {
|
||||
safekeeper_id: ZNodeId,
|
||||
info: SkTimelineInfo,
|
||||
}
|
||||
|
||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub s3_wal_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connection_string: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BrokerError {
|
||||
#[error("Etcd client error: {0}. Context: {1}")]
|
||||
EtcdClient(etcd_client::Error, String),
|
||||
#[error("Error during parsing etcd data: {0}")]
|
||||
ParsingError(String),
|
||||
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
/// A way to control the data retrieval from a certain subscription.
|
||||
pub struct SkTimelineSubscription {
|
||||
safekeeper_timeline_updates:
|
||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
|
||||
kind: SkTimelineSubscriptionKind,
|
||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||
watcher: Watcher,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscription {
|
||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
||||
pub async fn fetch_data(
|
||||
&mut self,
|
||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
|
||||
self.safekeeper_timeline_updates.recv().await
|
||||
}
|
||||
|
||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||
self.watcher.cancel().await.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!(
|
||||
"Failed to cancel timeline subscription, kind: {:?}",
|
||||
self.kind
|
||||
),
|
||||
)
|
||||
})?;
|
||||
self.watcher_handle.await.map_err(|e| {
|
||||
BrokerError::InternalError(format!(
|
||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
||||
self.kind
|
||||
))
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
/// The subscription kind to the timeline updates from safekeeper.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct SkTimelineSubscriptionKind {
|
||||
broker_prefix: String,
|
||||
kind: SubscriptionKind,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscriptionKind {
|
||||
pub fn all(broker_prefix: String) -> Self {
|
||||
Self {
|
||||
broker_prefix,
|
||||
kind: SubscriptionKind::All,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self {
|
||||
Self {
|
||||
broker_prefix,
|
||||
kind: SubscriptionKind::Tenant(tenant),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||
Self {
|
||||
broker_prefix,
|
||||
kind: SubscriptionKind::Timeline(timeline),
|
||||
}
|
||||
}
|
||||
|
||||
fn watch_regex(&self) -> Regex {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => Regex::new(&format!(
|
||||
r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_prefix
|
||||
))
|
||||
.expect("wrong regex for 'everything' subscription"),
|
||||
SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_prefix
|
||||
))
|
||||
.expect("wrong regex for 'tenant' subscription"),
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
|
||||
self.broker_prefix
|
||||
))
|
||||
.expect("wrong regex for 'timeline' subscription"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||
pub fn watch_key(&self) -> String {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => self.broker_prefix.to_string(),
|
||||
SubscriptionKind::Tenant(tenant_id) => {
|
||||
format!("{}/{tenant_id}/safekeeper", self.broker_prefix)
|
||||
}
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => format!(
|
||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
||||
self.broker_prefix
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
enum SubscriptionKind {
|
||||
/// Get every timeline update.
|
||||
All,
|
||||
/// Get certain tenant timelines' updates.
|
||||
Tenant(ZTenantId),
|
||||
/// Get certain timeline updates.
|
||||
Timeline(ZTenantTimelineId),
|
||||
}
|
||||
|
||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||
/// Stops and returns `Err` on any error during etcd communication.
|
||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||
/// exiting normally in such cases.
|
||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
||||
client: &mut Client,
|
||||
subscription: SkTimelineSubscriptionKind,
|
||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
||||
|
||||
let (watcher, mut stream) = client
|
||||
.watch(
|
||||
subscription.watch_key(),
|
||||
Some(WatchOptions::new().with_prefix()),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
||||
|
||||
let subscription_kind = subscription.kind;
|
||||
let regex = subscription.watch_regex();
|
||||
let watcher_handle = tokio::spawn(async move {
|
||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||
"Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
|
||||
)))? {
|
||||
if resp.canceled() {
|
||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||
break;
|
||||
}
|
||||
|
||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
|
||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
||||
|
||||
|
||||
let events = resp.events();
|
||||
debug!("Processing {} events", events.len());
|
||||
|
||||
for event in events {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(new_etcd_kv) = event.kv() {
|
||||
let new_kv_version = new_etcd_kv.version();
|
||||
|
||||
match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) {
|
||||
Ok(Some((zttid, timeline))) => {
|
||||
match timeline_updates
|
||||
.entry(zttid)
|
||||
.or_default()
|
||||
.entry(timeline.safekeeper_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
||||
if old_etcd_kv_version < new_kv_version {
|
||||
o.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
});
|
||||
|
||||
Ok(SkTimelineSubscription {
|
||||
kind: subscription,
|
||||
safekeeper_timeline_updates,
|
||||
watcher_handle,
|
||||
watcher,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_etcd_key_value(
|
||||
subscription_kind: SubscriptionKind,
|
||||
regex: &Regex,
|
||||
kv: &KeyValue,
|
||||
) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
|
||||
let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
|
||||
})?) {
|
||||
caps
|
||||
} else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let (zttid, safekeeper_id) = match subscription_kind {
|
||||
SubscriptionKind::All => (
|
||||
ZTenantTimelineId::new(
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Tenant(tenant_id) => (
|
||||
ZTenantTimelineId::new(
|
||||
tenant_id,
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Timeline(zttid) => (
|
||||
zttid,
|
||||
ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
};
|
||||
|
||||
let info_str = kv.value_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
|
||||
})?;
|
||||
Ok(Some((
|
||||
zttid,
|
||||
SafekeeperTimeline {
|
||||
safekeeper_id,
|
||||
info: serde_json::from_str(info_str).map_err(|e| {
|
||||
BrokerError::ParsingError(format!(
|
||||
"Failed to parse '{info_str}' as safekeeper timeline info: {e}"
|
||||
))
|
||||
})?,
|
||||
},
|
||||
)))
|
||||
}
|
||||
|
||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: Display,
|
||||
{
|
||||
let capture_match = caps
|
||||
.get(index)
|
||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||
.as_str();
|
||||
capture_match.parse().map_err(|e| {
|
||||
format!(
|
||||
"Failed to parse {} from {capture_match}: {e}",
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
})
|
||||
}
|
||||
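The module removed above exposes a small subscription API: build a `SkTimelineSubscriptionKind`, call `subscribe_to_safekeeper_timeline_updates`, drain batches with `fetch_data`, and stop with `cancel`. A hedged usage sketch of a consumer of that (removed) API; the etcd endpoint, broker prefix, and single-batch loop are placeholders:

```rust
use etcd_broker::{
    subscribe_to_safekeeper_timeline_updates, Client, SkTimelineSubscriptionKind,
};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Placeholder endpoint; `Client` is re-exported from etcd-client by this module.
    let mut client = Client::connect(["http://127.0.0.1:2379"], None).await?;

    // Watch every tenant/timeline under a hypothetical broker prefix.
    let kind = SkTimelineSubscriptionKind::all("neon".to_string());
    let mut subscription = subscribe_to_safekeeper_timeline_updates(&mut client, kind).await?;

    // Each batch maps a tenant/timeline id to per-safekeeper `SkTimelineInfo`.
    if let Some(updates) = subscription.fetch_data().await {
        println!("received updates for {} timeline(s)", updates.len());
    }

    // Cancel the etcd watch and wait for the background poller task to exit.
    subscription.cancel().await?;
    Ok(())
}
```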
@@ -8,7 +8,6 @@
|
||||
#![allow(deref_nullptr)]
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
@@ -38,21 +37,3 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
|
||||
let diff = id1.wrapping_sub(id2) as i32;
|
||||
diff < 0
|
||||
}
|
||||
|
||||
// Check if page is not yet initialized (port of Postgres PageIsInit() macro)
|
||||
pub fn page_is_new(pg: &[u8]) -> bool {
|
||||
pg[14] == 0 && pg[15] == 0 // pg_upper == 0
|
||||
}
|
||||
|
||||
// ExtractLSN from page header
|
||||
pub fn page_get_lsn(pg: &[u8]) -> Lsn {
|
||||
Lsn(
|
||||
((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32)
|
||||
| u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
|
||||
pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
}
|
||||
|
||||
@@ -89,12 +89,7 @@ impl WalStreamDecoder {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("long header deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
@@ -111,12 +106,7 @@ impl WalStreamDecoder {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("header deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
@@ -198,13 +188,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec =
|
||||
XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("xlog record deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
|
||||
@@ -15,7 +15,7 @@ use crate::XLogPageHeaderData;
|
||||
use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -28,8 +28,6 @@ use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
@@ -146,7 +144,7 @@ fn find_end_of_wal_segment(
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
start_offset: usize, // start reading at this point
|
||||
) -> anyhow::Result<u32> {
|
||||
) -> Result<u32> {
|
||||
// step back to the beginning of the page to read it in...
|
||||
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
|
||||
let mut contlen: usize = 0;
|
||||
@@ -274,7 +272,7 @@ pub fn find_end_of_wal(
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
start_lsn: Lsn, // start reading WAL at this point or later
|
||||
) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
|
||||
) -> Result<(XLogRecPtr, TimeLineID)> {
|
||||
let mut high_segno: XLogSegNo = 0;
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
@@ -356,19 +354,19 @@ pub fn main() {
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
|
||||
pub fn from_slice(buf: &[u8]) -> XLogRecord {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogRecord::des(buf)
|
||||
XLogRecord::des(buf).unwrap()
|
||||
}
|
||||
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogRecord, DeserializeError> {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogRecord::des_from(&mut buf.reader())
|
||||
XLogRecord::des_from(&mut buf.reader()).unwrap()
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use utils::bin_ser::LeSer;
|
||||
Ok(self.ser()?.into())
|
||||
self.ser().unwrap().into()
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
@@ -378,35 +376,35 @@ impl XLogRecord {
|
||||
}
|
||||
|
||||
impl XLogPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogPageHeaderData, DeserializeError> {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogPageHeaderData::des_from(&mut buf.reader())
|
||||
XLogPageHeaderData::des_from(&mut buf.reader()).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogLongPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogLongPageHeaderData, DeserializeError> {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogLongPageHeaderData::des_from(&mut buf.reader())
|
||||
XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap()
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use utils::bin_ser::LeSer;
|
||||
self.ser().map(|b| b.into())
|
||||
self.ser().unwrap().into()
|
||||
}
|
||||
}
|
||||
|
||||
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use utils::bin_ser::LeSer;
|
||||
Ok(self.ser()?.into())
|
||||
self.ser().unwrap().into()
|
||||
}
|
||||
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, DeserializeError> {
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
|
||||
use utils::bin_ser::LeSer;
|
||||
CheckPoint::des(buf)
|
||||
Ok(CheckPoint::des(buf)?)
|
||||
}
|
||||
|
||||
/// Update next XID based on provided new_xid and stored epoch.
|
||||
@@ -444,7 +442,7 @@ impl CheckPoint {
|
||||
// Generate new, empty WAL segment.
|
||||
// We need this segment to start compute node.
|
||||
//
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
|
||||
@@ -464,12 +462,12 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
|
||||
let hdr_bytes = hdr.encode()?;
|
||||
let hdr_bytes = hdr.encode();
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
Ok(seg_buf.freeze())
|
||||
seg_buf.freeze()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
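The hunks above move `XLogRecord::from_slice`/`from_bytes`, the page-header `from_bytes` helpers, and `CheckPoint::encode`/`decode` between two API shapes: one that surfaces `DeserializeError`/`SerializeError` to the caller (who can wrap it into a `WalDecodeError`, as the decoder hunks do) and one that unwraps internally. A small standalone sketch of that trade-off, using a toy record instead of the real `postgres_ffi` structs:

```rust
/// Toy record with the two decode shapes discussed above.
#[derive(Debug)]
struct ToyRecord {
    xl_tot_len: u32,
    xl_xid: u32,
}

#[derive(Debug)]
struct DecodeError(&'static str);

/// Result-returning shape: the caller decides how to handle short or corrupt input.
fn decode_checked(buf: &[u8]) -> Result<ToyRecord, DecodeError> {
    if buf.len() < 8 {
        return Err(DecodeError("buffer too short for toy record"));
    }
    Ok(ToyRecord {
        xl_tot_len: u32::from_le_bytes(buf[0..4].try_into().unwrap()),
        xl_xid: u32::from_le_bytes(buf[4..8].try_into().unwrap()),
    })
}

/// Panicking shape: convenient at call sites, but corrupt input aborts the task.
fn decode_or_panic(buf: &[u8]) -> ToyRecord {
    decode_checked(buf).expect("toy record deserialization failed")
}

fn main() {
    let bytes = [32u8, 0, 0, 0, 7, 0, 0, 0];
    // Checked path: a truncated buffer becomes an error the caller can report.
    if let Err(e) = decode_checked(&bytes[..2]) {
        println!("short buffer rejected: {}", e.0);
    }
    // Unchecked path works only on well-formed input.
    let rec = decode_or_panic(&bytes);
    println!("decoded xid {}", rec.xl_xid);
}
```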
@@ -1,20 +0,0 @@
|
||||
[package]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
tracing = "0.1.27"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
async-trait = "0.1"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
@@ -1,232 +0,0 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//!
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::{S3Bucket, S3ObjectKey},
|
||||
};
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, which has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the given buffered writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the given buffered writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
|
||||
}
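To make the trait's shape concrete, here is a minimal sketch of a generic caller that backs up one local file. It is illustrative only; the use of `tokio::fs::File` and its `metadata()` call to obtain the upload size is an assumption about how callers satisfy the `from_size_bytes` requirement.

```rust
use std::path::Path;

use anyhow::Context;

/// Upload a single local file through any `RemoteStorage` implementation
/// (e.g. `LocalFs` or `S3Bucket`).
async fn backup_one_file<S: RemoteStorage>(storage: &S, local_path: &Path) -> anyhow::Result<()> {
    // Map the local path to the storage-specific object id.
    let remote_id = storage.remote_object_id(local_path)?;
    let file = tokio::fs::File::open(local_path)
        .await
        .with_context(|| format!("Failed to open {}", local_path.display()))?;
    // S3 PUT requires the exact content length up front (see the trait comment above).
    let size_bytes = file.metadata().await?.len() as usize;
    storage.upload(file, size_bytes, &remote_id, None).await
}
```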
|
||||
|
||||
/// TODO kb
|
||||
pub enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn new(
|
||||
working_directory: PathBuf,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
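For reference, a minimal construction sketch for the factory above, using the local-filesystem backend. The directory paths are placeholder values, and the defaults are the constants declared earlier in this module.

```rust
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf;

fn local_storage_client() -> anyhow::Result<GenericRemoteStorage> {
    let config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS).unwrap(),
        max_sync_errors: NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
        // Placeholder root directory for the "remote" side.
        storage: RemoteStorageKind::LocalFs(PathBuf::from("/tmp/neon_remote_storage")),
    };
    // Placeholder pageserver working directory.
    GenericRemoteStorage::new(PathBuf::from("/tmp/neon_workdir"), &config)
}
```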
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
/// Immutable, cannot be changed once the file is created.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
||||
pub max_concurrent_syncs: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls; we must not exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
}
|
||||
}
|
||||
libs/utils/build.rs (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
fn main() {
|
||||
println!("cargo:rerun-if-env-changed=GIT_VERSION");
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::error::ApiError;
|
||||
use hyper::{body::HttpBody, Body, Request};
|
||||
use hyper::{Body, Request};
|
||||
use routerify::ext::RequestExt;
|
||||
|
||||
pub fn get_request_param<'a>(
|
||||
@@ -31,10 +31,3 @@ pub fn parse_request_param<T: FromStr>(
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
|
||||
match request.body_mut().data().await {
|
||||
Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,44 +54,31 @@ pub mod nonblock;
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
/// * building locally from git repo
|
||||
/// * building in CI from git repo
|
||||
/// * building in docker (either in CI or locally)
|
||||
///
|
||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
/// So everything besides the docker build is covered by the git_version crate, and docker uses a `GIT_VERSION` argument to get the required value.
|
||||
/// It takes the variable from the build process env and puts it into the rustc env, and we can then retrieve it here with the env! macro.
|
||||
/// The git version received from the environment variable is used as a fallback in the git_version invocation.
|
||||
/// And to avoid rerunning the build script on every recompilation, we use the rerun-if-env-changed option.
|
||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
///
|
||||
/// Why not use a build script to get the git commit sha directly, without a proc macro from a different crate?
|
||||
/// Caching and workspaces complicate that: if `utils` is not
|
||||
/// recompiled due to caching, the version may become outdated.
|
||||
/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
|
||||
/// so if we changed the index state git_version will pick that up and rerun the macro.
|
||||
///
|
||||
/// Note that with git_version the prefix is `git:`, and for the git version from env it is `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
#[macro_export]
|
||||
macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &str = git_version::git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!(
|
||||
"git-env:",
|
||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
||||
),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
|
||||
};
|
||||
}
|
||||
// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
//
|
||||
// we have several cases:
|
||||
// * building locally from git repo
|
||||
// * building in CI from git repo
|
||||
// * building in docker (either in CI or locally)
|
||||
//
|
||||
// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
// So everything besides the docker build is covered by the git_version crate.
|
||||
// For docker, an environment variable is used to pass the git version, which is then retrieved by the build script (build.rs).
|
||||
// It takes the variable from the build process env and puts it into the rustc env, and we can then retrieve it here with the env! macro.
|
||||
// The git version received from the environment variable is used as a fallback in the git_version invocation.
|
||||
// And to avoid rerunning the build script on every recompilation, we use the rerun-if-env-changed option.
|
||||
// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
//
|
||||
// Why not use a build script to get the git commit sha directly, without a proc macro from a different crate?
|
||||
// Caching and workspaces complicate that: if `utils` is not
|
||||
// recompiled due to caching, the version may become outdated.
|
||||
// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
|
||||
// so if we changed the index state git_version will pick that up and rerun the macro.
|
||||
//
|
||||
// Note that with git_version the prefix is `git:`, and for the git version from env it is `git-env:`.
|
||||
use git_version::git_version;
|
||||
pub const GIT_VERSION: &str = git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!("git-env:", env!("GIT_VERSION")),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
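Both versions above feed the same usage pattern in the binaries touched later in this diff. A minimal sketch of the macro-based form follows; the binary itself is a placeholder, while `utils::project_git_version` and the `GIT_VERSION` constant name mirror the real call sites in this comparison.

```rust
use utils::project_git_version;

// Expands to `const GIT_VERSION: &str = ...`, using the `GIT_VERSION` env var as a fallback.
project_git_version!(GIT_VERSION);

fn main() {
    // The pageserver binaries in this diff log the same constant on startup.
    println!("example-binary version: {GIT_VERSION}");
}
```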
|
||||
|
||||
@@ -433,12 +433,7 @@ impl PostgresBackend {
|
||||
// full cause of the error, not just the top-level context + its trace.
|
||||
// We don't want to send that in the ErrorResponse though,
|
||||
// because it's not relevant to the compute node logs.
|
||||
if query_string.starts_with("callmemaybe") {
|
||||
// FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed
|
||||
error!("query handler for '{}' failed: {}", query_string, e);
|
||||
} else {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
}
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
// TODO: untangle convoluted control flow
|
||||
if e.to_string().contains("failed to run") {
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
|
||||
@@ -224,7 +224,7 @@ impl fmt::Display for ZTenantTimelineId {
|
||||
|
||||
// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
|
||||
// by the console.
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct ZNodeId(pub u64);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[features]
|
||||
# It is simpler infra-wise to have failpoints enabled by default
|
||||
# It shouldn't affect perf in any way because failpoints
|
||||
# It shouldnt affect perf in any way because failpoints
|
||||
# are not placed in hot code paths
|
||||
default = ["failpoints"]
|
||||
profiling = ["pprof"]
|
||||
@@ -25,6 +25,7 @@ lazy_static = "1.4.0"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
@@ -52,12 +53,14 @@ nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
async-trait = "0.1"
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
|
||||
|
||||
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for a given user. Also note that the bucket name does not contain any protocol when used on AWS.
|
||||
For local S3 installations, refer to their documentation for the name format and credentials.
|
||||
@@ -155,9 +155,11 @@ or
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
prefix_in_bucket = '/test_prefix/'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
|
||||
TODO: Sharding
|
||||
--------------------
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, ensure, Context, Result};
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
@@ -323,8 +323,7 @@ impl<'a> Basebackup<'a> {
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
|
||||
@@ -7,9 +7,7 @@ use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::page_cache;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
|
||||
@@ -20,18 +20,17 @@ use utils::{
|
||||
http::endpoint,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
GIT_VERSION,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{GIT_VERSION} profiling:{} failpoints:{}",
|
||||
"{} profiling:{} failpoints:{}",
|
||||
GIT_VERSION,
|
||||
cfg!(feature = "profiling"),
|
||||
fail::has_failpoints()
|
||||
)
|
||||
@@ -218,7 +217,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
|
||||
|
||||
info!("version: {GIT_VERSION}");
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -288,7 +287,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
false,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
@@ -302,7 +301,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint thread",
|
||||
true,
|
||||
false,
|
||||
move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
|
||||
)?;
|
||||
|
||||
|
||||
pageserver/src/bin/replay.rs (new file, 75 lines)
@@ -0,0 +1,75 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres_ffi::{pg_constants::WAL_SEGMENT_SIZE, waldecoder::WalStreamDecoder};
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use tokio::net::TcpStream;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
|
||||
struct PageServiceApi {
|
||||
stream: TcpStream,
|
||||
}
|
||||
|
||||
impl PageServiceApi {
|
||||
async fn connect(tenant: &ZTenantId, timeline: &ZTimelineId, connstr: &str) -> Result<Self> {
|
||||
let mut stream = TcpStream::connect("localhost:15000").await?;
|
||||
|
||||
// Connect to pageserver
|
||||
// TODO read host, port, dbname, user from command line
|
||||
let (client, conn) = tokio_postgres::Config::new()
|
||||
.host("127.0.0.1")
|
||||
.port(15000)
|
||||
.dbname("postgres")
|
||||
.user("zenith_admin")
|
||||
.connect_raw(&mut stream, tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
let init_query = format!("callmemaybe {} {} {}", tenant, timeline, connstr);
|
||||
tokio::select! {
|
||||
_ = conn => panic!("connection closed during callmemaybe"),
|
||||
_ = client.query(init_query.as_str(), &[]) => (),
|
||||
};
|
||||
|
||||
Ok(Self { stream })
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
use clap::{App, Arg};
|
||||
let arg_matches = App::new("Replay")
|
||||
.arg(
|
||||
Arg::new("tenant")
|
||||
.long("tenant")
|
||||
.takes_value(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("timeline")
|
||||
.long("timeline")
|
||||
.takes_value(true)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let partial_path = "/home/bojan/tmp/sk_wal";
|
||||
let startpos = Lsn(23761464); // I got this by grepping sk log for "restart decoder"
|
||||
let xlogoff: usize = startpos.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
let mut decoder = WalStreamDecoder::new(startpos);
|
||||
let bytes = std::fs::read(partial_path)?;
|
||||
decoder.feed_bytes(&bytes[xlogoff..(xlogoff+10000)]);
|
||||
|
||||
while let Some((lsn, rec)) = decoder.poll_decode()? {
|
||||
println!("lsn: {}", lsn);
|
||||
}
|
||||
|
||||
// TODO start replication server, get connstr
|
||||
|
||||
let tenant = ZTenantId::from_str(arg_matches.value_of("tenant").unwrap())?;
|
||||
let timeline = ZTimelineId::from_str(arg_matches.value_of("timeline").unwrap())?;
|
||||
let connstr = "lol";
|
||||
let mut api = PageServiceApi::connect(&tenant, &timeline, connstr).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,9 +6,7 @@ use clap::{App, Arg};
|
||||
use pageserver::layered_repository::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use utils::{lsn::Lsn, GIT_VERSION};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith update metadata utility")
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config};
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -34,6 +33,18 @@ pub mod defaults {
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, which has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -304,6 +315,67 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_timelines_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls; we must not exceed those.
|
||||
/// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
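As a concrete reference for the credential fields added here, a minimal construction sketch. Every value is a placeholder taken from the test fixtures further down in this file, not real credentials.

```rust
use std::num::NonZeroUsize;

fn sample_s3_config() -> S3Config {
    S3Config {
        bucket_name: "some-sample-bucket".to_string(),
        bucket_region: "eu-north-1".to_string(),
        prefix_in_bucket: Some("/test_prefix/".to_string()),
        // New in this branch: static credentials; both stay `None` when IAM-based access is used.
        access_key_id: Some("SOMEKEYAAAAASADSAH*#".to_string()),
        secret_access_key: Some("SOMEsEcReTsd292v".to_string()),
        endpoint: None,
        concurrency_limit: NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT)
            .unwrap(),
    }
}
```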
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
@@ -451,21 +523,21 @@ impl PageServerConf {
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
|
||||
let max_concurrent_syncs = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
||||
let max_concurrent_timelines_sync = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_timelines_sync", toml)?
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC),
|
||||
)
|
||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
||||
.context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?;
|
||||
|
||||
let max_sync_errors = NonZeroU32::new(
|
||||
parse_optional_integer("max_sync_errors", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
@@ -480,6 +552,16 @@ impl PageServerConf {
|
||||
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
access_key_id: toml
|
||||
.get("access_key_id")
|
||||
.map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
|
||||
.transpose()?,
|
||||
secret_access_key: toml
|
||||
.get("secret_access_key")
|
||||
.map(|secret_access_key| {
|
||||
parse_toml_string("secret_access_key", secret_access_key)
|
||||
})
|
||||
.transpose()?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
||||
@@ -497,7 +579,7 @@ impl PageServerConf {
|
||||
};
|
||||
|
||||
Ok(RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_timelines_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
})
|
||||
@@ -725,11 +807,11 @@ pg_distrib_dir='{}'
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
|
||||
max_concurrent_timelines_sync: NonZeroUsize::new(
|
||||
defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
@@ -747,25 +829,29 @@ pg_distrib_dir='{}'
|
||||
let bucket_name = "some-sample-bucket".to_string();
|
||||
let bucket_region = "eu-north-1".to_string();
|
||||
let prefix_in_bucket = "test_prefix".to_string();
|
||||
let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
|
||||
let secret_access_key = "SOMEsEcReTsd292v".to_string();
|
||||
let endpoint = "http://localhost:5000".to_string();
|
||||
let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
|
||||
let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap();
|
||||
let max_sync_errors = NonZeroU32::new(222).unwrap();
|
||||
let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
|
||||
|
||||
let identical_toml_declarations = &[
|
||||
format!(
|
||||
r#"[remote_storage]
|
||||
max_concurrent_syncs = {max_concurrent_syncs}
|
||||
max_concurrent_timelines_sync = {max_concurrent_timelines_sync}
|
||||
max_sync_errors = {max_sync_errors}
|
||||
bucket_name = '{bucket_name}'
|
||||
bucket_region = '{bucket_region}'
|
||||
prefix_in_bucket = '{prefix_in_bucket}'
|
||||
access_key_id = '{access_key_id}'
|
||||
secret_access_key = '{secret_access_key}'
|
||||
endpoint = '{endpoint}'
|
||||
concurrency_limit = {s3_concurrency_limit}"#
|
||||
),
|
||||
format!(
|
||||
"remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
|
||||
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
|
||||
"remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
|
||||
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
|
||||
),
|
||||
];
|
||||
|
||||
@@ -788,11 +874,13 @@ pg_distrib_dir='{}'
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_timelines_sync,
|
||||
max_sync_errors,
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: bucket_name.clone(),
|
||||
bucket_region: bucket_region.clone(),
|
||||
access_key_id: Some(access_key_id.clone()),
|
||||
secret_access_key: Some(secret_access_key.clone()),
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
|
||||
@@ -3,16 +3,17 @@ use std::sync::Arc;
|
||||
use anyhow::{Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TimelineCreateRequest,
|
||||
};
|
||||
use crate::config::RemoteStorageKind;
|
||||
use crate::remote_storage::{
|
||||
download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket,
|
||||
};
|
||||
use crate::repository::Repository;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
@@ -36,6 +37,11 @@ struct State {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -51,7 +57,14 @@ impl State {
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
|
||||
.map(|storage_config| match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
@@ -260,14 +273,14 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index; release the lock before the potentially lengthy download operation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
let new_timeline = match try_download_index_part_data(state, sync_id).await {
|
||||
let new_timeline = match try_download_shard_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
@@ -296,32 +309,35 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn try_download_index_part_data(
|
||||
async fn try_download_shard_data(
|
||||
state: &State,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let index_part = match state.remote_storage.as_ref() {
|
||||
let shard = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, local_storage, sync_id).await
|
||||
download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
|
||||
download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
|
||||
.with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?;
|
||||
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
RemoteTimeline::from_index_part(&timeline_path, shard)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {sync_id}")
|
||||
format!(
|
||||
"Failed to convert index shard into remote timeline for timeline {}",
|
||||
sync_id
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -274,7 +274,7 @@ fn import_control_file<R: Repository>(
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
|
||||
@@ -20,8 +20,8 @@ use tracing::*;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
@@ -34,9 +34,10 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex};
|
||||
use crate::repository::{
|
||||
GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter,
|
||||
};
|
||||
@@ -47,7 +48,6 @@ use crate::virtual_file::VirtualFile;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{page_cache, storage_sync};
|
||||
|
||||
use metrics::{
|
||||
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec,
|
||||
@@ -74,7 +74,6 @@ pub mod metadata;
|
||||
mod par_fsync;
|
||||
mod storage_layer;
|
||||
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use ephemeral_file::is_ephemeral_file;
|
||||
use filename::{DeltaFileName, ImageFileName};
|
||||
@@ -82,7 +81,6 @@ use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
use inmemory_layer::InMemoryLayer;
|
||||
use layer_map::LayerMap;
|
||||
use layer_map::SearchResult;
|
||||
use postgres_ffi::xlog_utils::to_pg_timestamp;
|
||||
use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
@@ -91,7 +89,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds",
|
||||
"pageserver_storage_time",
|
||||
"Time spent on storage operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
@@ -101,8 +99,8 @@ lazy_static! {
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
"pageserver_getpage_reconstruct_time",
|
||||
"Time spent on storage operations",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
@@ -110,13 +108,13 @@ lazy_static! {
|
||||
|
||||
lazy_static! {
|
||||
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"materialize_page_cache_hits",
|
||||
"Number of cache hits from materialized page cache",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"wait_lsn_time",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
@@ -136,12 +134,12 @@ lazy_static! {
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
lazy_static! {
|
||||
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_num_persistent_files_created",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_persistent_bytes_written",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
@@ -395,22 +393,9 @@ impl Repository for LayeredRepository {
|
||||
|
||||
fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
// check that there are no child timelines, because detach will remove files, which would break child branches
|
||||
// FIXME this can still be violated because we do not guarantee
|
||||
// that all ancestors are downloaded/attached to the same pageserver
|
||||
let num_children = timelines
|
||||
.iter()
|
||||
.filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id))
|
||||
.count();
|
||||
|
||||
ensure!(
|
||||
num_children == 0,
|
||||
"Cannot detach timeline which has child timelines"
|
||||
);
|
||||
|
||||
ensure!(
|
||||
timelines.remove(&timeline_id).is_some(),
|
||||
"Cannot detach timeline {timeline_id} that is not available locally"
|
||||
"cannot detach timeline {timeline_id} that is not available locally"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -430,7 +415,7 @@ impl Repository for LayeredRepository {
|
||||
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
|
||||
Entry::Vacant(entry) => {
|
||||
// we need to get metadata of a timeline, another option is to pass it along with Downloaded status
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
|
||||
let metadata = Self::load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
|
||||
// finally we make newly downloaded timeline visible to repository
|
||||
entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
|
||||
},
|
||||
@@ -457,7 +442,7 @@ enum LayeredTimelineEntry {
|
||||
impl LayeredTimelineEntry {
|
||||
fn timeline_id(&self) -> ZTimelineId {
|
||||
match self {
|
||||
LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id,
|
||||
LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid,
|
||||
LayeredTimelineEntry::Unloaded { id, .. } => *id,
|
||||
}
|
||||
}
|
||||
@@ -617,17 +602,21 @@ impl LayeredRepository {
|
||||
|
||||
fn load_local_timeline(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timelineid: ZTimelineId,
|
||||
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> anyhow::Result<Arc<LayeredTimeline>> {
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenant_id)
|
||||
.context("failed to load metadata")?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
|
||||
let ancestor = metadata
|
||||
.ancestor_timeline()
|
||||
.map(|ancestor_timeline_id| {
|
||||
trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id);
|
||||
trace!(
|
||||
"loading {}'s ancestor {}",
|
||||
timelineid,
|
||||
&ancestor_timeline_id
|
||||
);
|
||||
self.get_timeline_load_internal(ancestor_timeline_id, timelines)
|
||||
})
|
||||
.transpose()
|
||||
@@ -641,7 +630,7 @@ impl LayeredRepository {
|
||||
Arc::clone(&self.tenant_conf),
|
||||
metadata,
|
||||
ancestor,
|
||||
timeline_id,
|
||||
timelineid,
|
||||
self.tenant_id,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
self.upload_layers,
|
||||
@@ -774,6 +763,17 @@ impl LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<TimelineMetadata> {
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
info!("loading metadata from {}", path.display());
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
}
|
||||
|
||||
//
|
||||
// How garbage collection works:
|
||||
//
|
||||
@@ -900,8 +900,8 @@ pub struct LayeredTimeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
|
||||
layers: RwLock<LayerMap>,
|
||||
|
||||
@@ -1175,50 +1175,50 @@ impl LayeredTimeline {
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<LayeredTimelineEntry>,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
upload_layers: bool,
|
||||
) -> LayeredTimeline {
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
.unwrap();
|
||||
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
|
||||
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
.unwrap();
|
||||
let flush_time_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
"layer flush",
|
||||
&tenant_id.to_string(),
|
||||
&timeline_id.to_string(),
|
||||
&tenantid.to_string(),
|
||||
&timelineid.to_string(),
|
||||
])
|
||||
.unwrap();
|
||||
let compact_time_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
"compact",
|
||||
&tenant_id.to_string(),
|
||||
&timeline_id.to_string(),
|
||||
&tenantid.to_string(),
|
||||
&timelineid.to_string(),
|
||||
])
|
||||
.unwrap();
|
||||
let create_images_time_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
"create images",
|
||||
&tenant_id.to_string(),
|
||||
&timeline_id.to_string(),
|
||||
&tenantid.to_string(),
|
||||
&timelineid.to_string(),
|
||||
])
|
||||
.unwrap();
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
.unwrap();
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
.unwrap();
|
||||
|
||||
LayeredTimeline {
|
||||
conf,
|
||||
tenant_conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
timelineid,
|
||||
tenantid,
|
||||
layers: RwLock::new(LayerMap::default()),
|
||||
|
||||
walredo_mgr,
|
||||
@@ -1270,7 +1270,7 @@ impl LayeredTimeline {
|
||||
|
||||
// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||
// structs representing all files on disk
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
|
||||
|
||||
for direntry in fs::read_dir(timeline_path)? {
|
||||
let direntry = direntry?;
|
||||
@@ -1282,7 +1282,7 @@ impl LayeredTimeline {
|
||||
if imgfilename.lsn > disk_consistent_lsn {
|
||||
warn!(
|
||||
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, disk_consistent_lsn
|
||||
imgfilename, self.timelineid, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(direntry.path())?;
|
||||
@@ -1290,7 +1290,7 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
let layer =
|
||||
ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
|
||||
ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
@@ -1305,7 +1305,7 @@ impl LayeredTimeline {
|
||||
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, disk_consistent_lsn
|
||||
deltafilename, self.timelineid, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(direntry.path())?;
|
||||
@@ -1313,7 +1313,7 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
let layer =
|
||||
DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
|
||||
DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
@@ -1495,7 +1495,7 @@ impl LayeredTimeline {
|
||||
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
|
||||
// We should look at the key to determine if it's a cacheable object
|
||||
let (lsn, read_guard) =
|
||||
cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
|
||||
cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?;
|
||||
let img = Bytes::from(read_guard.to_vec());
|
||||
Some((lsn, img))
|
||||
}
|
||||
@@ -1507,15 +1507,15 @@ impl LayeredTimeline {
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
|
||||
self.timeline_id,
|
||||
self.timelineid,
|
||||
self.get_ancestor_timeline_id(),
|
||||
)
|
||||
})?
|
||||
.ensure_loaded()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}",
|
||||
self.timeline_id,
|
||||
"Ancestor timeline is not is not loaded. Timeline id: {} Ancestor id {:?}",
|
||||
self.timelineid,
|
||||
self.get_ancestor_timeline_id(),
|
||||
)
|
||||
})?;
|
||||
@@ -1552,12 +1552,12 @@ impl LayeredTimeline {
|
||||
|
||||
trace!(
|
||||
"creating layer for write at {}/{} for record at {}",
|
||||
self.timeline_id,
|
||||
self.timelineid,
|
||||
start_lsn,
|
||||
lsn
|
||||
);
|
||||
let new_layer =
|
||||
InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
|
||||
InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?;
|
||||
let layer_rc = Arc::new(new_layer);
|
||||
|
||||
layers.open_layer = Some(Arc::clone(&layer_rc));
|
||||
@@ -1631,8 +1631,8 @@ impl LayeredTimeline {
|
||||
let self_clone = Arc::clone(self);
|
||||
thread_mgr::spawn(
|
||||
thread_mgr::ThreadKind::LayerFlushThread,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
Some(self.tenantid),
|
||||
Some(self.timelineid),
|
||||
"layer flush thread",
|
||||
false,
|
||||
move || self_clone.flush_frozen_layers(false),
|
||||
@@ -1701,7 +1701,7 @@ impl LayeredTimeline {
|
||||
// them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
new_delta_path.clone(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
self.conf.timeline_path(&self.timelineid, &self.tenantid),
|
||||
])?;
|
||||
fail_point!("checkpoint-before-sync");
|
||||
|
||||
@@ -1773,8 +1773,8 @@ impl LayeredTimeline {
|
||||
|
||||
LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
@@ -1783,11 +1783,11 @@ impl LayeredTimeline {
|
||||
PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
HashSet::from([new_delta_path]),
|
||||
Some(metadata),
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
new_delta_path,
|
||||
metadata,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1838,8 +1838,7 @@ impl LayeredTimeline {
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
|
||||
// Define partitioning schema if needed
|
||||
if let Ok(pgdir) =
|
||||
tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
|
||||
if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenantid, self.timelineid)
|
||||
{
|
||||
let (partitioning, lsn) = pgdir.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
@@ -1848,21 +1847,11 @@ impl LayeredTimeline {
|
||||
let timer = self.create_images_time_histo.start_timer();
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len());
|
||||
for part in partitioning.parts.iter() {
|
||||
if self.time_for_new_image_layer(part, lsn)? {
|
||||
let new_path = self.create_image_layer(part, lsn)?;
|
||||
layer_paths_to_upload.insert(new_path);
|
||||
self.create_image_layer(part, lsn)?;
|
||||
}
|
||||
}
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
None,
|
||||
);
|
||||
}
|
||||
timer.stop_and_record();
|
||||
|
||||
// 3. Compact
|
||||
@@ -1883,7 +1872,7 @@ impl LayeredTimeline {
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
for (img_range, last_img) in image_coverage {
|
||||
let img_lsn = if let Some(last_img) = last_img {
|
||||
let img_lsn = if let Some(ref last_img) = last_img {
|
||||
last_img.get_lsn_range().end
|
||||
} else {
|
||||
Lsn(0)
|
||||
@@ -1904,11 +1893,11 @@ impl LayeredTimeline {
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<PathBuf> {
|
||||
fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> {
|
||||
let img_range =
|
||||
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
||||
let mut image_layer_writer =
|
||||
ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?;
|
||||
ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?;
|
||||
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
@@ -1931,17 +1920,16 @@ impl LayeredTimeline {
|
||||
// and fsync them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
image_layer.path(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
self.conf.timeline_path(&self.timelineid, &self.tenantid),
|
||||
])?;
|
||||
|
||||
// FIXME: Do we need to do something to upload it to remote storage here?
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let new_path = image_layer.path();
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
drop(layers);
|
||||
|
||||
Ok(new_path)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compact_level0(&self, target_file_size: u64) -> Result<()> {
|
||||
@@ -2007,8 +1995,8 @@ impl LayeredTimeline {
|
||||
if writer.is_none() {
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
key,
|
||||
lsn_range.clone(),
|
||||
)?);
|
||||
@@ -2026,7 +2014,7 @@ impl LayeredTimeline {
|
||||
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
|
||||
|
||||
// also sync the directory
|
||||
layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||
layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid));
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
@@ -2036,38 +2024,18 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
|
||||
for l in new_layers {
|
||||
new_layer_paths.insert(l.path());
|
||||
layers.insert_historic(Arc::new(l));
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
// delete the old ones
|
||||
let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len());
|
||||
for l in level0_deltas {
|
||||
l.delete()?;
|
||||
if let Some(path) = l.local_path() {
|
||||
layer_paths_do_delete.insert(path);
|
||||
}
|
||||
layers.remove_historic(l);
|
||||
layers.remove_historic(l.clone());
|
||||
}
|
||||
drop(layers);
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
new_layer_paths,
|
||||
None,
|
||||
);
|
||||
storage_sync::schedule_layer_delete(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_do_delete,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2120,60 +2088,17 @@ impl LayeredTimeline {
|
||||
let cutoff = gc_info.cutoff;
|
||||
let pitr = gc_info.pitr;
|
||||
|
||||
// Calculate pitr cutoff point.
|
||||
// If we cannot determine a cutoff LSN, be conservative and don't GC anything.
|
||||
let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn();
|
||||
|
||||
if let Ok(timeline) =
|
||||
tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
|
||||
{
|
||||
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
||||
// If we don't have enough data to convert to LSN,
|
||||
// play safe and don't remove any layers.
|
||||
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
||||
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
||||
|
||||
match timeline.find_lsn_for_timestamp(pitr_timestamp)? {
|
||||
LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
debug!("future({})", lsn);
|
||||
pitr_cutoff_lsn = cutoff;
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
}
|
||||
}
|
||||
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
||||
}
|
||||
} else if cfg!(test) {
|
||||
// We don't have local timeline in mocked cargo tests.
|
||||
// So, just ignore pitr_interval setting in this case.
|
||||
pitr_cutoff_lsn = cutoff;
|
||||
}
|
||||
|
||||
let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn);
|
||||
|
||||
// Nothing to GC. Return early.
|
||||
if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff {
|
||||
info!(
|
||||
"Nothing to GC for timeline {}. cutoff_lsn {}",
|
||||
self.timeline_id, new_gc_cutoff
|
||||
);
|
||||
result.elapsed = now.elapsed()?;
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered();
|
||||
let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
|
||||
|
||||
// We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
|
||||
// See branch_timeline() for details.
|
||||
*self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff;
|
||||
*self.latest_gc_cutoff_lsn.write().unwrap() = cutoff;
|
||||
|
||||
info!("GC starting");
|
||||
|
||||
debug!("retain_lsns: {:?}", retain_lsns);
|
||||
|
||||
let mut layers_to_remove = Vec::new();
|
||||
let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
|
||||
// Scan all on-disk layers in the timeline.
|
||||
//
|
||||
@@ -2207,18 +2132,30 @@ impl LayeredTimeline {
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// 2. It is newer than PiTR cutoff point?
|
||||
if l.get_lsn_range().end > pitr_cutoff_lsn {
|
||||
debug!(
|
||||
"keeping {} because it's newer than pitr_cutoff_lsn {}",
|
||||
l.filename().display(),
|
||||
pitr_cutoff_lsn
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
// 2. It is newer than PiTR interval?
|
||||
// We use modification time of layer file to estimate update time.
|
||||
// This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill.
|
||||
// It is not expected that users will need high precision here. And this estimation
|
||||
// is conservative: modification time of file is always newer than actual time of version
|
||||
// creation. So it is safe for users.
|
||||
// TODO A possible "bloat" issue still persists here.
|
||||
// If modification time changes because of layer upload/download, we will keep these files
|
||||
// longer than necessary.
|
||||
// https://github.com/neondatabase/neon/issues/1554
|
||||
//
|
||||
if let Ok(metadata) = fs::metadata(&l.filename()) {
|
||||
let last_modified = metadata.modified()?;
|
||||
if now.duration_since(last_modified)? < pitr {
|
||||
debug!(
|
||||
"keeping {} because it's modification time {:?} is newer than PITR {:?}",
|
||||
l.filename().display(),
|
||||
last_modified,
|
||||
pitr
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Is it needed by a child branch?
|
||||
// NOTE With that we would keep data that
|
||||
// might be referenced by child branches forever.
|
||||
@@ -2272,22 +2209,11 @@ impl LayeredTimeline {
|
||||
// Actually delete the layers from disk and remove them from the map.
|
||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||
// while iterating it. BTreeMap::retain() would be another option)
|
||||
let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len());
|
||||
for doomed_layer in layers_to_remove {
|
||||
doomed_layer.delete()?;
|
||||
if let Some(path) = doomed_layer.local_path() {
|
||||
layer_paths_to_delete.insert(path);
|
||||
}
|
||||
layers.remove_historic(doomed_layer);
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
layers.remove_historic(doomed_layer.clone());
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_delete(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_delete,
|
||||
);
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
|
||||
result.elapsed = now.elapsed()?;
|
||||
@@ -2353,8 +2279,8 @@ impl LayeredTimeline {
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
let cache = page_cache::get();
|
||||
cache.memorize_materialized_page(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
key,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
@@ -2436,26 +2362,6 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
bail!("couldn't find an unused backup number for {:?}", path)
|
||||
}
|
||||
|
||||
fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that are specific to the layered storage format.
|
||||
///
|
||||
@@ -2490,19 +2396,9 @@ pub mod tests {
|
||||
|
||||
let err = harness.try_load().err().expect("should fail");
|
||||
assert_eq!(err.to_string(), "failed to load local metadata");
|
||||
|
||||
let mut found_error_message = false;
|
||||
let mut err_source = err.source();
|
||||
while let Some(source) = err_source {
|
||||
if source.to_string() == "metadata checksum mismatch" {
|
||||
found_error_message = true;
|
||||
break;
|
||||
}
|
||||
err_source = source.source();
|
||||
}
|
||||
assert!(
|
||||
found_error_message,
|
||||
"didn't find the corrupted metadata error"
|
||||
assert_eq!(
|
||||
err.source().unwrap().to_string(),
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -23,7 +23,6 @@ distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between.
|
||||
|
||||
```
|
||||
Cloud Storage Page Server Safekeeper
|
||||
L1 L0 Memory WAL
|
||||
|
||||
@@ -38,7 +37,6 @@ Cloud Storage Page Server Safekeeper
|
||||
+----+----+ +----+----+ | | |
|
||||
|EEEE| |EEEE|EEEE| +---+-----+
|
||||
+----+ +----+----+
|
||||
```
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
@@ -49,7 +47,7 @@ the same page and relation close to each other.
|
||||
From the page server memory, whenever enough WAL has been accumulated, it is flushed
|
||||
to disk into a new L0 layer file, and the memory is released.
|
||||
|
||||
When enough L0 files have been accumulated, they are merged together and sliced
|
||||
When enough L0 files have been accumulated, they are merged together rand sliced
|
||||
per key-space, producing a new set of files where each file contains a more
|
||||
narrow key range, but larger LSN range.
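To make the merge-and-slice step concrete, here is a minimal sketch of slicing a merged, key-ordered stream of delta entries into chunks of roughly a target size, cutting only on key boundaries. It is an illustration only, not the pageserver's actual `compact_level0` code (which writes through `DeltaLayerWriter`, as the hunks elsewhere in this diff show); the types and the size heuristic are assumptions.

```
// Hypothetical types for illustration only; the real code uses Key, Lsn and
// Value from the pageserver crate.
type Key = u64;
type Lsn = u64;

/// Slice a key-ordered stream of (key, lsn, value) entries into chunks of
/// roughly `target_file_size` bytes, starting a new chunk only when the key
/// changes so that every output file owns a contiguous key range and all
/// versions of a key land in the same file.
fn slice_by_key_space(
    entries: impl Iterator<Item = (Key, Lsn, Vec<u8>)>,
    target_file_size: usize,
) -> Vec<Vec<(Key, Lsn, Vec<u8>)>> {
    let mut files = Vec::new();
    let mut current = Vec::new();
    let mut current_size = 0usize;
    let mut prev_key: Option<Key> = None;

    for (key, lsn, value) in entries {
        // Only cut a new file on a key transition once the size budget is used up.
        if current_size >= target_file_size && prev_key.map_or(false, |p| p != key) {
            files.push(std::mem::take(&mut current));
            current_size = 0;
        }
        current_size += value.len();
        prev_key = Some(key);
        current.push((key, lsn, value));
    }
    if !current.is_empty() {
        files.push(current);
    }
    files
}
```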
|
||||
|
||||
@@ -123,7 +121,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
`.zenith/tenants/<tenantid>/timelines`.
|
||||
.zenith/tenants/<tenantid>/timelines.
|
||||
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
@@ -132,11 +130,8 @@ range of LSN.
|
||||
|
||||
image file:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
start key end key LSN
|
||||
```
|
||||
|
||||
|
||||
The first parts define the key range that the layer covers. See
|
||||
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
|
||||
@@ -145,10 +140,8 @@ delta file:
|
||||
|
||||
Delta files are named similarly, but they cover a range of LSNs:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
start key end key start LSN end LSN
|
||||
```
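As a rough illustration of the two name formats above, the following sketch splits a layer file name into its key range and LSN part(s). This is not the pageserver's actual parser (the ImageFileName and DeltaFileName structs mentioned elsewhere in this diff handle that); the hex fields are kept as opaque strings here.

```
/// Split a layer file name of the form
///   STARTKEY-ENDKEY__LSN                (image layer)
///   STARTKEY-ENDKEY__STARTLSN-ENDLSN    (delta layer)
/// into (start_key, end_key, start_lsn, end_lsn).
/// For an image layer start_lsn == end_lsn.
fn split_layer_file_name(name: &str) -> Option<(&str, &str, &str, &str)> {
    let (keys, lsns) = name.split_once("__")?;
    let (start_key, end_key) = keys.split_once('-')?;
    match lsns.split_once('-') {
        Some((start_lsn, end_lsn)) => Some((start_key, end_key, start_lsn, end_lsn)),
        None => Some((start_key, end_key, lsns, lsns)),
    }
}
```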
|
||||
|
||||
A delta file contains all the key-values in the key-range that were updated in
|
||||
the LSN range. If a key has not been modified, there is no trace of it in the
|
||||
@@ -158,9 +151,7 @@ delta layer.
|
||||
A delta layer file can cover a part of the overall key space, as in the previous
|
||||
example, or the whole key range like this:
|
||||
|
||||
```
|
||||
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
|
||||
```
|
||||
|
||||
A file that covers the whole key range is called a L0 file (Level 0), while a
|
||||
file that covers only part of the key range is called a L1 file. The "level" of
|
||||
@@ -177,9 +168,7 @@ version, and how branching and GC works is still valid.
|
||||
|
||||
The full path of a delta file looks like this:
|
||||
|
||||
```
|
||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
```
|
||||
|
||||
For simplicity, the examples below use a simplified notation for the
|
||||
paths. The tenant ID is left out, the timeline ID is replaced with
|
||||
@@ -188,10 +177,8 @@ with a human-readable table name. The LSNs are also shorter. For
|
||||
example, a base image file at LSN 100 and a delta file between 100-200
|
||||
for 'orders' table on 'main' branch is represented like this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
```
|
||||
|
||||
|
||||
# Creating layer files
|
||||
@@ -201,14 +188,12 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||
of WAL is currently at LSN 250. In this starting situation, you would
|
||||
have these files on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
In addition to those files, the recent changes between LSN 200 and the
|
||||
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||
@@ -239,7 +224,6 @@ If the customers table is modified later, a new file is created for it
|
||||
at the next checkpoint. The new file will cover the "gap" from the
|
||||
last layer file, so the LSN ranges are always contiguous:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -252,7 +236,6 @@ last layer file, so the LSN ranges are always contiguous:
|
||||
main/customers_200
|
||||
main/customers_200_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
## Reading page versions
|
||||
|
||||
@@ -276,18 +259,15 @@ involves replaying any WAL records applicable to the page between LSNs
|
||||
|
||||
Imagine that a child branch is created at LSN 250:
|
||||
|
||||
```
|
||||
@250
|
||||
----main--+-------------------------->
|
||||
\
|
||||
+---child-------------->
|
||||
```
|
||||
|
||||
|
||||
Then, the 'orders' table is updated differently on the 'main' and
|
||||
'child' branches. You now have this situation on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -302,7 +282,6 @@ Then, the 'orders' table is updated differently on the 'main' and
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
Because the 'customers' table hasn't been modified on the child
|
||||
branch, there is no file for it there. If you request a page for it on
|
||||
@@ -315,7 +294,6 @@ is linear, and the request's LSN identifies unambiguously which file
|
||||
you need to look at. For example, the history for the 'orders' table
|
||||
on the 'main' branch consists of these files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -323,12 +301,10 @@ on the 'main' branch consists of these files:
|
||||
main/orders_300
|
||||
main/orders_300_400
|
||||
main/orders_400
|
||||
```
|
||||
|
||||
And from the 'child' branch's point of view, it consists of these
|
||||
files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -337,7 +313,6 @@ files:
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
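The paragraph below describes how the branch point decides which history is consulted. As a hedged sketch (a simplified stand-in, not the real timeline/ancestor handling shown in the layered_repository hunks above), the lookup can be thought of as walking towards the ancestor until the requested LSN is above the branch point:

```
// Hypothetical, simplified timeline for illustration.
struct Timeline<'a> {
    name: &'a str,
    ancestor: Option<(&'a Timeline<'a>, u64)>, // (parent, branch point LSN)
}

/// Walk towards the ancestor until we find the timeline whose own history
/// covers `lsn`, i.e. the request LSN is above the branch point.
fn timeline_for_lsn<'a>(mut tline: &'a Timeline<'a>, lsn: u64) -> &'a str {
    while let Some((parent, branch_lsn)) = tline.ancestor {
        if lsn > branch_lsn {
            break; // the child's own layers cover this LSN
        }
        tline = parent; // otherwise keep reading from the parent's history
    }
    tline.name
}

fn main() {
    let main = Timeline { name: "main", ancestor: None };
    let child = Timeline { name: "child", ancestor: Some((&main, 250)) };
    // LSN 275 is above the branch point, so the child's own files are used;
    // LSN 225 falls below it and resolves to the parent's files.
    assert_eq!(timeline_for_lsn(&child, 275), "child");
    assert_eq!(timeline_for_lsn(&child, 225), "main");
}
```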
|
||||
|
||||
The branch metadata includes the point where the child branch was
|
||||
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||
@@ -370,7 +345,6 @@ Let's look at the single branch scenario again. Imagine that the end
|
||||
of the branch is LSN 525, so that the GC horizon is currently at
|
||||
525-150 = 375
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -383,13 +357,11 @@ of the branch is LSN 525, so that the GC horizon is currently at
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
We can remove the following files because the end LSNs of those files are
|
||||
older than GC horizon 375, and there are more recent layer files for the
|
||||
table:
|
||||
|
||||
```
|
||||
main/orders_100 DELETE
|
||||
main/orders_100_200 DELETE
|
||||
main/orders_200 DELETE
|
||||
@@ -402,9 +374,8 @@ table:
|
||||
main/customers_100 DELETE
|
||||
main/customers_100_200 DELETE
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
'main/customers_200' is old enough, but it cannot be
|
||||
'main/customers_100_200' is old enough, but it cannot be
|
||||
removed because there is no newer layer file for the table.
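A hedged sketch of that retention rule (simplified; the real GC in this diff also considers branch points, retain_lsns and the PITR interval): a layer may be dropped only when it ends before the GC horizon *and* a newer layer for the same table exists.

```
// Simplified stand-in for a layer; the real GC works on Layer trait objects.
struct LayerInfo {
    end_lsn: u64,
}

/// A layer may be garbage-collected only if it ends before the GC horizon
/// *and* a newer layer exists for the same key range; otherwise it still
/// holds the latest version and must be kept.
fn can_gc(layer: &LayerInfo, gc_horizon: u64, newer_layers_for_same_range: usize) -> bool {
    layer.end_lsn < gc_horizon && newer_layers_for_same_range > 0
}

fn main() {
    // main/orders_100_200 with horizon 375 and a newer main/orders_200 present:
    assert!(can_gc(&LayerInfo { end_lsn: 200 }, 375, 1));
    // main/customers_200 has no newer layer, so it is kept despite its age:
    assert!(!can_gc(&LayerInfo { end_lsn: 200 }, 375, 0));
}
```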
|
||||
|
||||
Things get slightly more complicated with multiple branches. All of
|
||||
@@ -413,7 +384,6 @@ retain older snapshot files that are still needed by child branches.
|
||||
For example, if a child branch is created at LSN 150, and the 'customers'
|
||||
table is updated on the branch, you would have these files:
|
||||
|
||||
```
|
||||
main/orders_100 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_100_200 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_200 DELETE
|
||||
@@ -428,7 +398,6 @@ table is updated on the branch, you would have these files:
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
child/customers_150_300 DELETE
|
||||
child/customers_300 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
|
||||
be removed, even though they are older than the GC horizon, because
|
||||
@@ -438,7 +407,6 @@ and 'main/orders_200_300' can still be removed.
|
||||
If 'orders' is modified later on the 'child' branch, we will create a
|
||||
new base image and delta file for it on the child:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
|
||||
@@ -451,7 +419,6 @@ new base image and delta file for it on the child:
|
||||
child/customers_300
|
||||
child/orders_150_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
After this, the 'main/orders_100' and 'main/orders_100_200' files could
|
||||
be removed. They are no longer needed by the child branch, because there
|
||||
@@ -467,7 +434,6 @@ Describe GC and checkpoint interval settings.
|
||||
In principle, each relation can be checkpointed separately, i.e. the
|
||||
LSN ranges of the files don't need to line up. So this would be legal:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -480,7 +446,6 @@ LSN ranges of the files don't need to line up. So this would be legal:
|
||||
main/customers_250
|
||||
main/customers_250_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
However, the code currently always checkpoints all relations together.
|
||||
So that situation doesn't arise in practice.
|
||||
@@ -503,13 +468,11 @@ does that. It could be useful, however, as a transient state when
|
||||
garbage collecting around branch points, or explicit recovery
|
||||
points. For example, if we start with this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/orders_200_300
|
||||
main/orders_300
|
||||
```
|
||||
|
||||
And there is a branch or explicit recovery point at LSN 150, we could
|
||||
replace 'main/orders_100_200' with 'main/orders_150' to keep a
|
||||
|
||||
@@ -38,6 +38,10 @@ use crate::walrecord;
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
@@ -45,7 +49,6 @@ use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
@@ -215,10 +218,6 @@ impl Layer for DeltaLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -251,9 +250,6 @@ impl Layer for DeltaLayer {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
@@ -362,28 +358,6 @@ impl Layer for DeltaLayer {
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
@@ -392,10 +366,34 @@ impl Layer for DeltaLayer {
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
let mut desc = String::new();
|
||||
match cursor.read_blob(blob_ref.pos()) {
|
||||
Ok(buf) => {
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len()).unwrap();
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " READ ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
|
||||
@@ -125,10 +125,6 @@ impl Layer for ImageLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
@@ -85,10 +85,6 @@ impl Layer for InMemoryLayer {
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
@@ -211,7 +207,7 @@ impl Layer for InMemoryLayer {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
|
||||
@@ -132,15 +132,17 @@ impl LayerMap {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
trace!(
|
||||
"found layer {} for request on {key} at {end_lsn}",
|
||||
"found layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
}
|
||||
// this layer's end LSN is smaller than the requested point. If there's
|
||||
// nothing newer, this is what we need to return. Remember this.
|
||||
if let Some(old_candidate) = &latest_delta {
|
||||
if let Some(ref old_candidate) = latest_delta {
|
||||
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
@@ -150,8 +152,10 @@ impl LayerMap {
|
||||
}
|
||||
if let Some(l) = latest_delta {
|
||||
trace!(
|
||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||
"found (old) layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
@@ -162,13 +166,17 @@ impl LayerMap {
|
||||
layer: l,
|
||||
}))
|
||||
} else if let Some(l) = latest_img {
|
||||
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
||||
trace!(
|
||||
"found img layer and no deltas for request on {} at {}",
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
trace!("no layer found for request on {key} at {end_lsn}");
|
||||
trace!("no layer found for request on {} at {}", key, end_lsn);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -186,6 +194,7 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
#[allow(dead_code)]
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let len_before = self.historic_layers.len();
|
||||
|
||||
@@ -244,7 +253,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
|
||||
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
}
|
||||
|
||||
|
||||
@@ -105,9 +105,6 @@ pub trait Layer: Send + Sync {
|
||||
/// log messages, even though they're never not on disk.)
|
||||
fn filename(&self) -> PathBuf;
|
||||
|
||||
/// If a layer has a corresponding file on a local filesystem, return its absolute path.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
|
||||
@@ -9,8 +9,8 @@ pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod reltag;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
@@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
"pageserver_live_connections_count",
|
||||
"Number of live network connections",
|
||||
&["pageserver_connection_kind"]
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
@@ -43,14 +44,11 @@ use crate::CheckpointConfig;
|
||||
use metrics::{register_histogram_vec, HistogramVec};
|
||||
use postgres_ffi::xlog_utils::to_pg_timestamp;
|
||||
|
||||
use postgres_ffi::pg_constants;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -59,7 +57,6 @@ enum PagestreamBeMessage {
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -84,13 +81,6 @@ struct PagestreamGetPageRequest {
|
||||
blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamExistsResponse {
|
||||
exists: bool,
|
||||
@@ -111,11 +101,6 @@ struct PagestreamErrorResponse {
|
||||
message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeResponse {
|
||||
db_size: i64,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
@@ -157,11 +142,6 @@ impl PagestreamFeMessage {
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
@@ -192,10 +172,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -325,7 +301,7 @@ const TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
lazy_static! {
|
||||
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"pageserver_smgr_query_time",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
TIME_BUCKETS.into()
|
||||
@@ -391,11 +367,6 @@ impl PageServerHandler {
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_db_size", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_db_size_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
};
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
@@ -516,32 +487,6 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_db_size_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
|
||||
let mut total_blocks: i64 = 0;
|
||||
|
||||
for rel in all_rels {
|
||||
if rel.forknum == 0 {
|
||||
let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
|
||||
total_blocks += n_blocks as i64;
|
||||
}
|
||||
}
|
||||
|
||||
let db_size = total_blocks * pg_constants::BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
db_size,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
@@ -795,9 +740,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
// Use tenant's pitr setting
|
||||
let pitr = repo.get_pitr_interval();
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
|
||||
412
pageserver/src/remote_storage.rs
Normal file
@@ -0,0 +1,412 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! There are a few components the storage machinery consists of:
|
||||
//! * [`RemoteStorage`] trait, a CRUD-like generic abstraction used for adapting external storages, with a few implementations:
|
||||
//! * [`local_fs`] allows using the local file system as an external storage
|
||||
//! * [`s3_bucket`] uses an AWS S3 bucket as an external storage
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//! Synchronization internals are split into submodules
|
||||
//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
|
||||
//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
|
||||
//!
|
||||
//! * public API to interact with the external world:
|
||||
//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
|
||||
//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue new upload and download tasks,
|
||||
//! to be processed by the async loop
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
|
||||
//!
|
||||
//! +------------------------+ +--------->-------+
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (enqueue timeline sync task) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (apply new timeline sync states) | |
|
||||
//! +------------------------+ +---------<-------+
|
||||
//! |
|
||||
//! |
|
||||
//! CRUD layer file operations |
|
||||
//! (upload/download/delete/list, etc.) |
|
||||
//! V
|
||||
//! +------------------------+
|
||||
//! | |
|
||||
//! | [`RemoteStorage`] impl |
|
||||
//! | |
|
||||
//! | pageserver assumes it |
|
||||
//! | owns exclusive write |
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! The loop inits the storage connection and checks which files are stored remotely.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
|
||||
//! query their downloads later if they are accessed.
|
||||
//!
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
|
||||
//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
|
||||
//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
//!
|
||||
//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
|
||||
//!
|
||||
//! The storage logic considers `image` to be a set of local files (layers), fully representing a certain timeline at a given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
|
||||
//! A timeline can change its state by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! Yet a timeline cannot alter already existing files, and cannot remove them either: only a GC process is capable of removing unused files.
|
||||
//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
|
||||
//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
|
||||
//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded; the local metadata file will be overwritten
|
||||
//! when the newer image is downloaded
|
||||
//!
|
||||
//! Pageserver maintains a remote file structure similar to the local one: all layer files are uploaded with the same names under the same directory structure.
|
||||
//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
|
||||
//! This file gets read to populate the cache if the remote timeline data is missing from it, and gets updated after every successful download.
|
||||
//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expensive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
|
||||
//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files.
|
||||
//!
|
||||
//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
|
||||
//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and is loaded on demand only,
|
||||
//! when a new timeline is scheduled for the download.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. Multiple pageservers may be separated within the same storage, if supported
|
||||
//! (i.e. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
|
||||
//!
|
||||
//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
|
||||
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
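For orientation, a minimal sketch of how the checkpoint path in `layered_repository.rs` drives this module's public API in this revision: after a new delta layer is flushed, its path and the fresh metadata are queued for upload. Only the call shape of `schedule_timeline_checkpoint_upload` is taken from the hunks above; the guard flag and the surrounding function are assumptions for illustration.

```
// Sketch only; mirrors the call seen in the layered_repository hunk.
use std::path::PathBuf;

use crate::layered_repository::metadata::TimelineMetadata;
use crate::remote_storage::schedule_timeline_checkpoint_upload;
use utils::zid::{ZTenantId, ZTimelineId};

fn after_checkpoint(
    upload_enabled: bool, // assumed flag; the real code checks its own config/state
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
    new_delta_path: PathBuf,
    metadata: TimelineMetadata,
) {
    if upload_enabled {
        // Enqueue the freshly written layer plus metadata for the sync loop.
        schedule_timeline_checkpoint_upload(tenantid, timelineid, new_delta_path, metadata);
    }
}
```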
|
||||
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
ffi, fs,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::io;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use self::storage_sync::TEMP_DOWNLOAD_EXTENSION;
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::S3Bucket,
|
||||
storage_sync::{
|
||||
download_index_part,
|
||||
index::{IndexPart, RemoteIndex, RemoteTimeline},
|
||||
schedule_timeline_checkpoint_upload, schedule_timeline_download,
|
||||
},
|
||||
};
|
||||
use crate::{
|
||||
config::{PageServerConf, RemoteStorageKind},
|
||||
layered_repository::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
},
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
/// after comparing local and remote timeline state.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum LocalTimelineInitStatus {
|
||||
/// The timeline has every remote layer present locally.
|
||||
/// There could be some layers requiring uploading,
|
||||
/// but this does not block the timeline from any user interaction.
|
||||
LocallyComplete,
|
||||
/// A timeline has some files remotely that are not present locally and need downloading.
|
||||
/// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
|
||||
/// so the data needs to be downloaded first before the timeline can be used.
|
||||
NeedsSync,
|
||||
}
|
||||
|
||||
type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
|
||||
|
||||
/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
|
||||
/// Successful initialization includes the case when the sync loop is not started, in which case the startup data is still returned,
|
||||
/// to simplify the receiving code.
|
||||
pub struct SyncStartupData {
|
||||
pub remote_index: RemoteIndex,
|
||||
pub local_timeline_init_statuses: LocalTimelineInitStatuses,
|
||||
}
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external storage configuration is given, no thread or storage initialization is done.
|
||||
/// Along with that, it scans local and remote tenant files (if the sync is enabled) to check the initial timeline states.
|
||||
pub fn start_local_timeline_sync(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let local_timeline_files = local_tenant_timeline_files(config)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
match &config.remote_storage_config {
|
||||
Some(storage_config) => match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
storage_config.max_concurrent_timelines_sync,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
},
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
S3Bucket::new(s3_config, &config.workdir)?,
|
||||
storage_config.max_concurrent_timelines_sync,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
},
|
||||
}
|
||||
.context("Failed to spawn the storage sync thread"),
|
||||
None => {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
|
||||
for (ZTenantTimelineId { tenant_id, timeline_id }, _) in
|
||||
local_timeline_files
|
||||
{
|
||||
local_timeline_init_statuses
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
|
||||
}
|
||||
Ok(SyncStartupData {
|
||||
local_timeline_init_statuses,
|
||||
remote_index: RemoteIndex::empty(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &tenants_dir_entry {
|
||||
Ok(tenants_dir_entry) => {
|
||||
match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
|
||||
Ok(collected_files) => {
|
||||
local_tenant_timeline_files.extend(collected_files.into_iter())
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
tenants_dir_entry,
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
tenants_dir_entry,
|
||||
tenants_dir.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut timelines = HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
|
||||
for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir entry for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
})? {
|
||||
match timelines_dir_entry {
|
||||
Ok(timelines_dir_entry) => {
|
||||
let timeline_path = timelines_dir_entry.path();
|
||||
match collect_timeline_files(&timeline_path) {
|
||||
Ok((timeline_id, metadata, timeline_files)) => {
|
||||
timelines.insert(
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
(metadata, timeline_files),
|
||||
);
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to process timeline dir contents at '{}', reason: {:?}",
|
||||
timeline_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list timelines for entry tenant {}, reason: {:?}",
|
||||
tenant_id, e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(timelines)
|
||||
}
|
||||
|
||||
// discover timeline files and extract timeline metadata
|
||||
// NOTE: ephemeral files are excluded from the list
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
|
||||
let mut timeline_files = HashSet::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTimelineId>()
|
||||
.context("Could not parse timeline id out of the timeline dir name")?;
|
||||
let timeline_dir_entries =
|
||||
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
|
||||
for entry in timeline_dir_entries {
|
||||
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
|
||||
timeline_metadata_path = Some(entry_path);
|
||||
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
|
||||
debug!("skipping ephemeral file {}", entry_path.display());
|
||||
continue;
|
||||
} else if entry_path.extension().and_then(ffi::OsStr::to_str)
|
||||
== Some(TEMP_DOWNLOAD_EXTENSION)
|
||||
{
|
||||
info!("removing temp download file at {}", entry_path.display());
|
||||
fs::remove_file(&entry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp download file at {}",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
timeline_files.insert(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
|
||||
// then attach is lost. There would be no retries for that,
|
||||
// initial collect will fail because there is no metadata.
|
||||
// We either need to start the download if we see an empty dir after restart, or the attach caller should
|
||||
// be aware of that and retry attach if awaits_download for timeline switched from true to false
|
||||
// but the timeline didn't appear locally.
|
||||
// Check what happens with remote index in that case.
|
||||
let timeline_metadata_path = match timeline_metadata_path {
|
||||
Some(path) => path,
|
||||
None => bail!("No metadata file found in the timeline directory"),
|
||||
};
|
||||
let metadata = TimelineMetadata::from_bytes(
|
||||
&fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
|
||||
)
|
||||
.context("Failed to parse timeline metadata file bytes")?;
|
||||
|
||||
Ok((timeline_id, metadata, timeline_files))
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the given buffered writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
|
||||
}
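For orientation, here is a minimal sketch of how a caller might drive any `RemoteStorage` implementation through the trait above: upload a local file, then stream it back. It is an illustration of the trait's contract, not code from this change; the function name and flow are hypothetical, and `io` refers to `tokio::io` as in the surrounding code.

async fn upload_then_download<S: RemoteStorage>(
    storage: &S,
    local_file: &std::path::Path,
) -> anyhow::Result<()> {
    // Map the local path to its remote counterpart; this fails if the path lies outside the workdir.
    let remote_path = storage.storage_path(local_file)?;

    // Upload: the byte length must be known up front, since the S3 PUT request requires it.
    let source = tokio::fs::File::open(local_file).await?;
    let size_bytes = source.metadata().await?.len() as usize;
    storage.upload(source, size_bytes, &remote_path, None).await?;

    // Download the same entry into an in-memory buffer; the stored metadata, if any, is returned.
    let mut buffer = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
    let _metadata = storage.download(&remote_path, &mut buffer).await?;
    Ok(())
}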
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
/// Immutable, cannot be changed once the file is created.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Local filesystem acting as a remote storage.
|
||||
//! Multiple API users can use the same "storage" of this kind by using different storage roots.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! This storage is used in tests, but can also be used in cases when a certain persistent
|
||||
//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
@@ -17,18 +17,18 @@ use tokio::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use crate::path_with_suffix_extension;
|
||||
use crate::remote_storage::storage_sync::path_with_suffix_extension;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
pub struct LocalFs {
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
pageserver_workdir: &'static Path,
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
@@ -38,15 +38,15 @@ impl LocalFs {
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
working_directory,
|
||||
storage_root: root,
|
||||
pageserver_workdir,
|
||||
root,
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.storage_root.join(path))
|
||||
} else if path.starts_with(&self.storage_root) {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
@@ -85,30 +85,30 @@ impl LocalFs {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
type RemoteObjectId = PathBuf;
|
||||
type StoragePath = PathBuf;
|
||||
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
|
||||
Ok(self.storage_root.join(
|
||||
strip_path_prefix(&self.working_directory, local_path)
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.storage_root, storage_path)
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.working_directory.join(relative_path))
|
||||
Ok(self.pageserver_workdir.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
get_all_files(&self.storage_root).await
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
get_all_files(&self.root).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
@@ -194,7 +194,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
@@ -229,9 +229,9 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
@@ -288,7 +288,7 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
@@ -354,30 +354,29 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use tempfile::tempdir;
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
|
||||
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -387,7 +386,7 @@ mod pure_tests {
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
@@ -397,16 +396,16 @@ mod pure_tests {
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root,
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &workdir);
|
||||
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(workdir.to_str().unwrap()));
|
||||
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
@@ -415,7 +414,7 @@ mod pure_tests {
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
@@ -425,28 +424,29 @@ mod pure_tests {
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = workdir.join("timelines").join("some_timeline").join(name);
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(&storage_root.join(local_path.strip_prefix(&workdir)?))
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("metadata");
|
||||
let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
|
||||
let local_metadata_path = repo_harness
|
||||
.timeline_path(&TIMELINE_ID)
|
||||
.join(METADATA_FILE_NAME);
|
||||
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
@@ -472,10 +472,11 @@ mod pure_tests {
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("local_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: tempdir()?.path().to_owned(),
|
||||
storage_root,
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
@@ -487,19 +488,16 @@ mod pure_tests {
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
working_directory: workdir,
|
||||
storage_root,
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.remote_object_id(&original_path)?;
|
||||
let storage_path = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -514,17 +512,18 @@ mod pure_tests {
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use std::{collections::HashMap, io::Write};
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let (file, size) = create_file_for_upload(
|
||||
&storage.working_directory.join("whatever"),
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
@@ -539,14 +538,14 @@ mod fs_tests {
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -557,16 +556,17 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
|
||||
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
|
||||
Ok(storage)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
@@ -597,15 +597,14 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("download_file_range_positive")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
|
||||
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.download_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
@@ -621,7 +620,7 @@ mod fs_tests {
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
.download_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
@@ -643,7 +642,7 @@ mod fs_tests {
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
.download_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
@@ -665,7 +664,7 @@ mod fs_tests {
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
.download_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
@@ -690,17 +689,16 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("download_file_range_negative")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_byte_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.download_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
@@ -714,7 +712,7 @@ mod fs_tests {
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.download_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
@@ -729,11 +727,10 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("delete_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
@@ -751,8 +748,7 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
@@ -760,7 +756,7 @@ mod fs_tests {
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
@@ -784,7 +780,7 @@ mod fs_tests {
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let partial_download_metadata = storage
|
||||
.download_byte_range(
|
||||
.download_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
@@ -809,16 +805,16 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
workdir: &Path,
|
||||
harness: &RepoHarness<'_>,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = workdir.join("timelines").join("some_timeline");
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
|
||||
let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
|
||||
let from_path = storage.working_directory.join(name);
|
||||
let from_path = storage.pageserver_workdir.join(name);
|
||||
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
|
||||
storage.upload(file, size, &storage_path, metadata).await?;
|
||||
Ok(storage_path)
|
||||
@@ -1,7 +1,7 @@
|
||||
//! AWS S3 storage wrapper around `rusoto` library.
|
||||
//!
|
||||
//! Respects `prefix_in_bucket` property from [`S3Config`],
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! allowing multiple pageservers to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
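As a rough sketch of how that isolation works (illustrative only; the prefixes and paths below are made up, and the helper merely mirrors the key derivation in `storage_path` further down), two pageservers configured with different `prefix_in_bucket` values derive disjoint object keys for the same relative layer path:

fn example_key(prefix_in_bucket: &str, relative_path: &std::path::Path) -> String {
    // Join the prefix and every path segment with '/', as S3Bucket::storage_path does below.
    let mut key = prefix_in_bucket.to_string();
    for segment in relative_path {
        key.push('/');
        key.push_str(&segment.to_string_lossy());
    }
    key
}

// example_key("pageserver-1", Path::new("timelines/some_timeline/layer")) == "pageserver-1/timelines/some_timeline/layer"
// example_key("pageserver-2", Path::new("timelines/some_timeline/layer")) == "pageserver-2/timelines/some_timeline/layer"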
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -19,13 +19,16 @@ use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{strip_path_prefix, RemoteStorage, S3Config};
|
||||
use crate::{
|
||||
config::S3Config,
|
||||
remote_storage::{strip_path_prefix, RemoteStorage},
|
||||
};
|
||||
|
||||
use super::StorageMetadata;
|
||||
|
||||
const S3_PREFIX_SEPARATOR: char = '/';
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
@@ -33,7 +36,11 @@ impl S3ObjectKey {
|
||||
&self.0
|
||||
}
|
||||
|
||||
fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
|
||||
fn download_destination(
|
||||
&self,
|
||||
pageserver_workdir: &Path,
|
||||
prefix_to_strip: Option<&str>,
|
||||
) -> PathBuf {
|
||||
let path_without_prefix = match prefix_to_strip {
|
||||
Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
|
||||
panic!(
|
||||
@@ -44,9 +51,9 @@ impl S3ObjectKey {
|
||||
None => &self.0,
|
||||
};
|
||||
|
||||
workdir.join(
|
||||
pageserver_workdir.join(
|
||||
path_without_prefix
|
||||
.split(S3_PREFIX_SEPARATOR)
|
||||
.split(S3_FILE_SEPARATOR)
|
||||
.collect::<PathBuf>(),
|
||||
)
|
||||
}
|
||||
@@ -54,7 +61,7 @@ impl S3ObjectKey {
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
workdir: PathBuf,
|
||||
pageserver_workdir: &'static Path,
|
||||
client: S3Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
@@ -66,7 +73,7 @@ pub struct S3Bucket {
|
||||
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
@@ -82,11 +89,8 @@ impl S3Bucket {
|
||||
.context("Failed to parse the s3 region from config")?,
|
||||
};
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
|
||||
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
|
||||
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
|
||||
|
||||
let client = if access_key_id.is_none() && secret_access_key.is_none() {
|
||||
let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
|
||||
{
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
@@ -94,8 +98,8 @@ impl S3Bucket {
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new_minimal(
|
||||
access_key_id.unwrap_or_default(),
|
||||
secret_access_key.unwrap_or_default(),
|
||||
aws_config.access_key_id.clone().unwrap_or_default(),
|
||||
aws_config.secret_access_key.clone().unwrap_or_default(),
|
||||
),
|
||||
region,
|
||||
)
|
||||
@@ -103,12 +107,12 @@ impl S3Bucket {
|
||||
|
||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||
let mut prefix = prefix;
|
||||
while prefix.starts_with(S3_PREFIX_SEPARATOR) {
|
||||
while prefix.starts_with(S3_FILE_SEPARATOR) {
|
||||
prefix = &prefix[1..]
|
||||
}
|
||||
|
||||
let mut prefix = prefix.to_string();
|
||||
while prefix.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
while prefix.ends_with(S3_FILE_SEPARATOR) {
|
||||
prefix.pop();
|
||||
}
|
||||
prefix
|
||||
@@ -116,7 +120,7 @@ impl S3Bucket {
|
||||
|
||||
Ok(Self {
|
||||
client,
|
||||
workdir,
|
||||
pageserver_workdir,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
@@ -126,23 +130,24 @@ impl S3Bucket {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
type RemoteObjectId = S3ObjectKey;
|
||||
type StoragePath = S3ObjectKey;
|
||||
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
|
||||
let relative_path = strip_path_prefix(&self.workdir, local_path)?;
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
|
||||
let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in relative_path {
|
||||
key.push(S3_PREFIX_SEPARATOR);
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref()))
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path
|
||||
.download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref()))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
@@ -182,7 +187,7 @@ impl RemoteStorage for S3Bucket {
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
@@ -207,7 +212,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let _guard = self
|
||||
@@ -232,9 +237,9 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
@@ -269,7 +274,7 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
@@ -288,30 +293,34 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn download_destination() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let local_path = workdir.join("one").join("two").join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&workdir)?;
|
||||
let repo_harness = RepoHarness::create("download_destination")?;
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let key = S3ObjectKey(format!(
|
||||
"{}{}",
|
||||
S3_PREFIX_SEPARATOR,
|
||||
S3_FILE_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&S3_PREFIX_SEPARATOR.to_string()),
|
||||
.join(&S3_FILE_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
key.download_destination(&workdir, None),
|
||||
"Download destination should consist of s3 path joined with the workdir prefix"
|
||||
key.download_destination(&repo_harness.conf.workdir, None),
|
||||
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -319,21 +328,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &workdir.join(segment_1).join(segment_2);
|
||||
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
|
||||
|
||||
let storage = dummy_storage(workdir);
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let expected_key = S3ObjectKey(format!(
|
||||
"{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}",
|
||||
"{}{SEPARATOR}{}{SEPARATOR}{}",
|
||||
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
|
||||
segment_1,
|
||||
segment_2,
|
||||
SEPARATOR = S3_FILE_SEPARATOR,
|
||||
));
|
||||
|
||||
let actual_key = storage
|
||||
.remote_object_id(local_path)
|
||||
.storage_path(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
@@ -348,7 +360,7 @@ mod tests {
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
@@ -358,10 +370,10 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let error_message = storage_path_error(&storage, &workdir);
|
||||
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
@@ -375,7 +387,7 @@ mod tests {
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
@@ -389,17 +401,20 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
let timeline_dir = workdir.join("timelines").join("test_timeline");
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("not a metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
|
||||
s3_key.download_destination(
|
||||
&repo_harness.conf.workdir,
|
||||
storage.prefix_in_bucket.as_deref()
|
||||
),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
@@ -407,11 +422,14 @@ mod tests {
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("metadata"),
|
||||
&relative_timeline_path.join(METADATA_FILE_NAME),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
|
||||
s3_key.download_destination(
|
||||
&repo_harness.conf.workdir,
|
||||
storage.prefix_in_bucket.as_deref()
|
||||
),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
@@ -423,15 +441,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(workdir);
|
||||
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let key = dummy_storage.remote_object_id(&original_path)?;
|
||||
let key = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -442,9 +457,9 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(workdir: PathBuf) -> S3Bucket {
|
||||
fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket {
|
||||
S3Bucket {
|
||||
workdir,
|
||||
pageserver_workdir,
|
||||
client: S3Client::new("us-east-1".parse().unwrap()),
|
||||
bucket_name: "dummy-bucket".to_string(),
|
||||
prefix_in_bucket: Some("dummy_prefix/".to_string()),
|
||||
@@ -456,7 +471,7 @@ mod tests {
|
||||
S3ObjectKey(relative_file_path.iter().fold(
|
||||
prefix.unwrap_or_default().to_string(),
|
||||
|mut path_string, segment| {
|
||||
path_string.push(S3_PREFIX_SEPARATOR);
|
||||
path_string.push(S3_FILE_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
},
|
||||
pageserver/src/remote_storage/storage_sync.rs (1663 lines): file diff suppressed because it is too large.
@@ -1,223 +0,0 @@
|
||||
//! Timeline synchronization logic to delete a bulk of a timeline's remote files from the remote storage.
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tracing::{debug, error, info};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use crate::remote_storage::{
|
||||
storage_sync::{SyncQueue, SyncTask},
|
||||
RemoteStorage,
|
||||
};
|
||||
|
||||
use super::{LayersDeletion, SyncData};
|
||||
|
||||
/// Attempts to remove the timeline layers from the remote storage.
|
||||
/// If the task had not adjusted the metadata before, the deletion will fail.
|
||||
pub(super) async fn delete_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
sync_queue: &SyncQueue,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut delete_data: SyncData<LayersDeletion>,
|
||||
) -> bool
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
if !delete_data.data.deletion_registered {
|
||||
error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
return false;
|
||||
}
|
||||
|
||||
if delete_data.data.layers_to_delete.is_empty() {
|
||||
info!("No layers to delete, skipping");
|
||||
return true;
|
||||
}
|
||||
|
||||
let layers_to_delete = delete_data
|
||||
.data
|
||||
.layers_to_delete
|
||||
.drain()
|
||||
.collect::<Vec<_>>();
|
||||
debug!("Layers to delete: {layers_to_delete:?}");
|
||||
info!("Deleting {} timeline layers", layers_to_delete.len());
|
||||
|
||||
let mut delete_tasks = layers_to_delete
|
||||
.into_iter()
|
||||
.map(|local_layer_path| async {
|
||||
let storage_path = match storage.storage_path(&local_layer_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(path) => path,
|
||||
Err(e) => return Err((e, local_layer_path)),
|
||||
};
|
||||
|
||||
match storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
}) {
|
||||
Ok(()) => Ok(local_layer_path),
|
||||
Err(e) => Err((e, local_layer_path)),
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errored = false;
|
||||
while let Some(deletion_result) = delete_tasks.next().await {
|
||||
match deletion_result {
|
||||
Ok(local_layer_path) => {
|
||||
debug!(
|
||||
"Successfully deleted layer {} for timeline {sync_id}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.deleted_layers.insert(local_layer_path);
|
||||
}
|
||||
Err((e, local_layer_path)) => {
|
||||
errored = true;
|
||||
error!(
|
||||
"Failed to delete layer {} for timeline {sync_id}: {e:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.layers_to_delete.insert(local_layer_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if errored {
|
||||
debug!("Reenqueuing failed delete task for timeline {sync_id}");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
}
|
||||
errored
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashSet, num::NonZeroUsize};
|
||||
|
||||
use itertools::Itertools;
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline_negative() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline_negative")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: 1,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::new(),
|
||||
deletion_registered: false,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
!deleted,
|
||||
"Should not start the deletion for task with delete metadata unregistered"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "c", "d"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.storage_path(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
assert_eq!(
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_str| layer_str.to_string())
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
"Expect to have all layer files remotely before deletion"
|
||||
);
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: current_retries,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::from([
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("something_different"),
|
||||
]),
|
||||
deletion_registered: true,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
assert!(deleted, "Should be able to delete timeline files");
|
||||
|
||||
assert_eq!(
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["b".to_string(), "d".to_string()],
|
||||
"Expect to have only non-deleted files remotely"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,6 @@ use std::{collections::HashSet, fmt::Debug, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::{path_with_suffix_extension, RemoteStorage};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
@@ -12,13 +11,18 @@ use tokio::{
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::metadata_path,
|
||||
remote_storage::{
|
||||
storage_sync::{path_with_suffix_extension, sync_queue, SyncTask},
|
||||
RemoteStorage,
|
||||
},
|
||||
};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersDownload, SyncData, SyncQueue,
|
||||
SyncData, TimelineDownload,
|
||||
};
|
||||
|
||||
pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
@@ -31,19 +35,17 @@ pub async fn download_index_part<P, S>(
|
||||
) -> anyhow::Result<IndexPart>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
let part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut index_part_bytes = Vec::new();
|
||||
storage
|
||||
.download(&part_storage_path, &mut index_part_bytes)
|
||||
@@ -74,7 +76,7 @@ pub(super) enum DownloadedTimeline {
|
||||
FailedAndRescheduled,
|
||||
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
|
||||
/// Initial download successful.
|
||||
Successful(SyncData<LayersDownload>),
|
||||
Successful(SyncData<TimelineDownload>),
|
||||
}
|
||||
|
||||
/// Attempts to download all given timeline's layers.
|
||||
@@ -85,14 +87,13 @@ pub(super) enum DownloadedTimeline {
|
||||
pub(super) async fn download_timeline_layers<'a, P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a S,
|
||||
sync_queue: &'a SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut download_data: SyncData<LayersDownload>,
|
||||
mut download_data: SyncData<TimelineDownload>,
|
||||
) -> DownloadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let remote_timeline = match remote_timeline {
|
||||
Some(remote_timeline) => {
|
||||
@@ -119,11 +120,6 @@ where
|
||||
debug!("Layers to download: {layers_to_download:?}");
|
||||
info!("Downloading {} timeline layers", layers_to_download.len());
|
||||
|
||||
if layers_to_download.is_empty() {
|
||||
info!("No layers to download after filtering, skipping");
|
||||
return DownloadedTimeline::Successful(download_data);
|
||||
}
|
||||
|
||||
let mut download_tasks = layers_to_download
|
||||
.into_iter()
|
||||
.map(|layer_desination_path| async move {
|
||||
@@ -134,7 +130,7 @@ where
|
||||
);
|
||||
} else {
|
||||
let layer_storage_path = storage
|
||||
.remote_object_id(&layer_desination_path)
|
||||
.storage_path(&layer_desination_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
@@ -250,7 +246,7 @@ where
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed download task for timeline {sync_id}");
|
||||
download_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
||||
sync_queue::push(sync_id, SyncTask::Download(download_data));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
} else {
|
||||
info!("Successfully downloaded all layers");
|
||||
@@ -264,21 +260,20 @@ async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::{BTreeSet, HashSet},
|
||||
num::NonZeroUsize,
|
||||
};
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
@@ -286,14 +281,9 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn download_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -301,7 +291,7 @@ mod tests {
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.remote_object_id(&local_path)?;
|
||||
let remote_path = storage.storage_path(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
@@ -328,12 +318,11 @@ mod tests {
|
||||
let download_data = match download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
Some(&remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
current_retries,
|
||||
LayersDownload {
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
|
||||
},
|
||||
),
|
||||
@@ -385,19 +374,17 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn download_timeline_negatives() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline_negatives")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
|
||||
let empty_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
LayersDownload {
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
@@ -416,12 +403,11 @@ mod tests {
|
||||
let already_downloading_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
Some(¬_expecting_download_remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
LayersDownload {
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
@@ -443,10 +429,7 @@ mod tests {
|
||||
let harness = RepoHarness::create("test_download_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
@@ -467,7 +450,7 @@ mod tests {
|
||||
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let storage_path = storage.remote_object_id(&local_index_part_path)?;
|
||||
let storage_path = storage.storage_path(&local_index_part_path)?;
|
||||
fs::create_dir_all(storage_path.parent().unwrap()).await?;
|
||||
fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::{
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Ok};
|
||||
use anyhow::{Context, Ok};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
@@ -113,7 +113,7 @@ impl RemoteTimelineIndex {
|
||||
awaits_download: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.timeline_entry_mut(id)
|
||||
.ok_or_else(|| anyhow!("unknown timeline sync {id}"))?
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
|
||||
.awaits_download = awaits_download;
|
||||
Ok(())
|
||||
}
|
||||
@@ -147,13 +147,6 @@ impl RemoteTimeline {
|
||||
self.missing_layers.extend(upload_failures.into_iter());
|
||||
}
|
||||
|
||||
pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
|
||||
self.timeline_layers
|
||||
.retain(|layer| !layers_to_remove.contains(layer));
|
||||
self.missing_layers
|
||||
.retain(|layer| !layers_to_remove.contains(layer));
|
||||
}
|
||||
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self) -> &HashSet<PathBuf> {
|
||||
&self.timeline_layers
|
||||
@@ -4,19 +4,20 @@ use std::{fmt::Debug, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::RemoteStorage;
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::metadata_path,
|
||||
remote_storage::{
|
||||
storage_sync::{index::RemoteTimeline, sync_queue, SyncTask},
|
||||
RemoteStorage,
|
||||
},
|
||||
};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersUpload, SyncData, SyncQueue,
|
||||
};
|
||||
use crate::{
|
||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||
};
|
||||
use super::{index::IndexPart, SyncData, TimelineUpload};
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<P, S>(
|
||||
@@ -27,7 +28,7 @@ pub(super) async fn upload_index_part<P, S>(
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
.context("Failed to serialize index part file into bytes")?;
|
||||
@@ -37,15 +38,12 @@ where
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let index_part_storage_path =
|
||||
storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(
|
||||
@@ -66,7 +64,11 @@ pub(super) enum UploadedTimeline {
|
||||
/// Upload failed due to some error, the upload task is rescheduled for another retry.
|
||||
FailedAndRescheduled,
|
||||
/// No issues happened during the upload, all task files were put into the remote storage.
|
||||
Successful(SyncData<LayersUpload>),
|
||||
Successful(SyncData<TimelineUpload>),
|
||||
/// No failures happened during the upload, but some files were removed locally before the upload task completed
|
||||
/// (could happen due to retries, for instance, if GC happens in the interim).
|
||||
/// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file.
|
||||
SuccessfulAfterLocalFsUpdate(SyncData<TimelineUpload>),
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
@@ -75,20 +77,16 @@ pub(super) enum UploadedTimeline {
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
sync_queue: &SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut upload_data: SyncData<LayersUpload>,
|
||||
mut upload_data: SyncData<TimelineUpload>,
|
||||
) -> UploadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let upload = &mut upload_data.data;
|
||||
let new_upload_lsn = upload
|
||||
.metadata
|
||||
.as_ref()
|
||||
.map(|meta| meta.disk_consistent_lsn());
|
||||
let new_upload_lsn = upload.metadata.disk_consistent_lsn();
|
||||
|
||||
let already_uploaded_layers = remote_timeline
|
||||
.map(|timeline| timeline.stored_files())
|
||||
@@ -101,14 +99,9 @@ where
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if layers_to_upload.is_empty() {
|
||||
info!("No layers to upload after filtering, aborting");
|
||||
return UploadedTimeline::Successful(upload_data);
|
||||
}
|
||||
|
||||
debug!("Layers to upload: {layers_to_upload:?}");
|
||||
info!(
|
||||
"Uploading {} timeline layers, new lsn: {new_upload_lsn:?}",
|
||||
"Uploading {} timeline layers, new lsn: {new_upload_lsn}",
|
||||
layers_to_upload.len(),
|
||||
);
|
||||
|
||||
@@ -116,7 +109,7 @@ where
|
||||
.into_iter()
|
||||
.map(|source_path| async move {
|
||||
let storage_path = storage
|
||||
.remote_object_id(&source_path)
|
||||
.storage_path(&source_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
@@ -163,6 +156,7 @@ where
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errors_happened = false;
|
||||
let mut local_fs_updated = false;
|
||||
while let Some(upload_result) = upload_tasks.next().await {
|
||||
match upload_result {
|
||||
Ok(uploaded_path) => {
|
||||
@@ -179,16 +173,7 @@ where
|
||||
errors_happened = true;
|
||||
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
|
||||
} else {
|
||||
// We have run the upload sync task, but the file we wanted to upload is gone.
|
||||
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to
|
||||
// retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and
|
||||
// run compaction/gc threads, removing redundant files from disk.
|
||||
// It's not good to pause GC/compaction because of those and we would rather skip such uploads.
|
||||
//
|
||||
// Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance).
|
||||
// We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet.
|
||||
// This will create "missing" layers and make data inconsistent.
|
||||
// Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result.
|
||||
local_fs_updated = true;
|
||||
upload.layers_to_upload.remove(&source_path);
|
||||
warn!(
|
||||
"Missing locally a layer file {} scheduled for upload, skipping",
|
||||
@@ -203,8 +188,11 @@ where
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed upload task for timeline {sync_id}");
|
||||
upload_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Upload(upload_data));
|
||||
sync_queue::push(sync_id, SyncTask::Upload(upload_data));
|
||||
UploadedTimeline::FailedAndRescheduled
|
||||
} else if local_fs_updated {
|
||||
info!("Successfully uploaded all layers, some local layers were removed during the upload");
|
||||
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data)
|
||||
} else {
|
||||
info!("Successfully uploaded all layers");
|
||||
UploadedTimeline::Successful(upload_data)
|
||||
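A minimal, self-contained sketch (not from the commits above) of how a caller can react to the three upload outcomes in this hunk; the enum and payload are simplified stand-ins for `UploadedTimeline` / `SyncData<TimelineUpload>`.

// Editorial sketch, not part of the diff.
#[derive(Debug)]
enum Outcome {
    FailedAndRescheduled,
    Successful(u32),                   // retry count carried by the task
    SuccessfulAfterLocalFsUpdate(u32),
}

fn handle(outcome: Outcome) {
    match outcome {
        // Already re-enqueued with retries += 1; nothing more to do here.
        Outcome::FailedAndRescheduled => {}
        // Every scheduled layer reached remote storage; publish the task data as-is.
        Outcome::Successful(retries) => println!("register upload, retries = {retries}"),
        // Some files vanished locally (e.g. GC ran meanwhile): discard the task's
        // metadata and re-read the current one from disk before publishing.
        Outcome::SuccessfulAfterLocalFsUpdate(retries) => {
            println!("reload metadata from disk, then register; retries = {retries}")
        }
    }
}

fn main() {
    handle(Outcome::SuccessfulAfterLocalFsUpdate(3));
}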
@@ -218,21 +206,20 @@ enum UploadError {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::{BTreeSet, HashSet},
|
||||
num::NonZeroUsize,
|
||||
};
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
|
||||
use remote_storage::LocalFs;
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::{upload_index_part, *};
|
||||
@@ -240,21 +227,15 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn regular_layer_upload() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("regular_layer_upload")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a", "b"];
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let mut timeline_upload =
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
timeline_upload.metadata = None;
|
||||
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
@@ -262,7 +243,6 @@ mod tests {
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
@@ -298,8 +278,8 @@ mod tests {
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata, None,
|
||||
"Successful upload without metadata should not have it returned either"
|
||||
upload.metadata, metadata,
|
||||
"Successful upload should not chage its metadata"
|
||||
);
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
@@ -327,11 +307,10 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("layer_upload_after_local_fs_update")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a1", "b1"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
let current_retries = 5;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
|
||||
@@ -353,7 +332,6 @@ mod tests {
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
@@ -361,7 +339,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let upload_data = match upload_result {
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data,
|
||||
wrong_result => panic!(
|
||||
"Expected a successful after local fs upload for timeline, but got: {wrong_result:?}"
|
||||
),
|
||||
@@ -389,8 +367,7 @@ mod tests {
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata,
|
||||
Some(metadata),
|
||||
upload.metadata, metadata,
|
||||
"Successful upload should not chage its metadata"
|
||||
);
|
||||
|
||||
@@ -420,7 +397,7 @@ mod tests {
|
||||
let harness = RepoHarness::create("test_upload_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::layered_repository::metadata::TimelineMetadata;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::remote_storage::RemoteIndex;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::{bail, Result};
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,228 +0,0 @@
|
||||
//! Timeline synchronization logic to delete a bulk of the timeline's remote files from the remote storage.
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::storage_sync::{SyncQueue, SyncTask};
|
||||
use remote_storage::RemoteStorage;
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{LayersDeletion, SyncData};
|
||||
|
||||
/// Attempts to remove the timeline layers from the remote storage.
|
||||
/// If the task had not adjusted the metadata before, the deletion will fail.
|
||||
pub(super) async fn delete_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
sync_queue: &SyncQueue,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut delete_data: SyncData<LayersDeletion>,
|
||||
) -> bool
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
if !delete_data.data.deletion_registered {
|
||||
error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
return false;
|
||||
}
|
||||
|
||||
if delete_data.data.layers_to_delete.is_empty() {
|
||||
info!("No layers to delete, skipping");
|
||||
return true;
|
||||
}
|
||||
|
||||
let layers_to_delete = delete_data
|
||||
.data
|
||||
.layers_to_delete
|
||||
.drain()
|
||||
.collect::<Vec<_>>();
|
||||
debug!("Layers to delete: {layers_to_delete:?}");
|
||||
info!("Deleting {} timeline layers", layers_to_delete.len());
|
||||
|
||||
let mut delete_tasks = layers_to_delete
|
||||
.into_iter()
|
||||
.map(|local_layer_path| async {
|
||||
let storage_path =
|
||||
match storage
|
||||
.remote_object_id(&local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(path) => path,
|
||||
Err(e) => return Err((e, local_layer_path)),
|
||||
};
|
||||
|
||||
match storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
}) {
|
||||
Ok(()) => Ok(local_layer_path),
|
||||
Err(e) => Err((e, local_layer_path)),
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errored = false;
|
||||
while let Some(deletion_result) = delete_tasks.next().await {
|
||||
match deletion_result {
|
||||
Ok(local_layer_path) => {
|
||||
debug!(
|
||||
"Successfully deleted layer {} for timeline {sync_id}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.deleted_layers.insert(local_layer_path);
|
||||
}
|
||||
Err((e, local_layer_path)) => {
|
||||
errored = true;
|
||||
error!(
|
||||
"Failed to delete layer {} for timeline {sync_id}: {e:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.layers_to_delete.insert(local_layer_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if errored {
|
||||
debug!("Reenqueuing failed delete task for timeline {sync_id}");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
}
|
||||
errored
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashSet, num::NonZeroUsize};
|
||||
|
||||
use itertools::Itertools;
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
|
||||
};
|
||||
use remote_storage::LocalFs;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline_negative() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline_negative")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: 1,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::new(),
|
||||
deletion_registered: false,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
!deleted,
|
||||
"Should not start the deletion for task with delete metadata unregistered"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("delete_timeline")?;
|
||||
let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "c", "d"];
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.remote_object_id(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
assert_eq!(
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_str| layer_str.to_string())
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
"Expect to have all layer files remotely before deletion"
|
||||
);
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: current_retries,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::from([
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("something_different"),
|
||||
]),
|
||||
deletion_registered: true,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
assert!(deleted, "Should be able to delete timeline files");
|
||||
|
||||
assert_eq!(
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["b".to_string(), "d".to_string()],
|
||||
"Expect to have only non-deleted files remotely"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,9 +4,8 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::remote_storage::{self, LocalTimelineInitStatus, RemoteIndex, SyncStartupData};
|
||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -78,9 +77,6 @@ pub enum TenantState {
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
// The tenant cannot be accessed anymore for any reason except graceful shutdown.
|
||||
Stopping,
|
||||
|
||||
// Something went wrong loading the tenant state
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
@@ -89,7 +85,6 @@ impl fmt::Display for TenantState {
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
TenantState::Idle => f.write_str("Idle"),
|
||||
TenantState::Stopping => f.write_str("Stopping"),
|
||||
TenantState::Broken => f.write_str("Broken"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -101,24 +96,9 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = storage_sync::start_local_timeline_sync(conf)
|
||||
} = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
if let Err(err) =
|
||||
init_local_repository(conf, tenant_id, local_timeline_init_statuses, &remote_index)
|
||||
{
|
||||
// Report the error, but continue with the startup for other tenants. An error
|
||||
// loading a tenant is serious, but it's better to complete the startup and
|
||||
// serve other tenants, than fail completely.
|
||||
error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
|
||||
let mut m = tenants_state::write_tenants();
|
||||
if let Some(tenant) = m.get_mut(&tenant_id) {
|
||||
tenant.state = TenantState::Broken;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
init_local_repositories(conf, local_timeline_init_statuses, &remote_index)?;
|
||||
Ok(remote_index)
|
||||
}
|
||||
|
||||
@@ -162,13 +142,8 @@ pub fn shutdown_all_tenants() {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let mut tenantids = Vec::new();
|
||||
for (tenantid, tenant) in m.iter_mut() {
|
||||
match tenant.state {
|
||||
TenantState::Active | TenantState::Idle | TenantState::Stopping => {
|
||||
tenant.state = TenantState::Stopping;
|
||||
tenantids.push(*tenantid)
|
||||
}
|
||||
TenantState::Broken => {}
|
||||
}
|
||||
tenant.state = TenantState::Stopping;
|
||||
tenantids.push(*tenantid)
|
||||
}
|
||||
drop(m);
|
||||
|
||||
@@ -269,7 +244,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"Compactor thread",
|
||||
false,
|
||||
true,
|
||||
move || crate::tenant_threads::compact_loop(tenant_id),
|
||||
)?;
|
||||
|
||||
@@ -278,7 +253,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"GC thread",
|
||||
false,
|
||||
true,
|
||||
move || crate::tenant_threads::gc_loop(tenant_id),
|
||||
)
|
||||
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
|
||||
@@ -294,10 +269,6 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
TenantState::Stopping => {
|
||||
// don't re-activate it if it's being stopped
|
||||
}
|
||||
|
||||
TenantState::Broken => {
|
||||
// cannot activate
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -398,37 +369,38 @@ pub fn list_tenants() -> Vec<TenantInfo> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn init_local_repository(
|
||||
fn init_local_repositories(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
|
||||
local_timeline_init_statuses: HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> anyhow::Result<(), anyhow::Error> {
|
||||
// initialize local tenant
|
||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
// initialize local tenant
|
||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||
|
||||
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
||||
so skipped for adding into repository until sync is finished"
|
||||
);
|
||||
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
||||
so skipped for adding into repository until sync is finished"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
apply_timeline_remote_sync_status_updates(&repo, status_updates)
|
||||
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
apply_timeline_remote_sync_status_updates(&repo, status_updates)
|
||||
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
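A small runnable sketch (not from the commits above) of the nested map shape consumed by `init_local_repositories` in the hunk just shown; tenant/timeline ids and statuses are invented strings so the example stays self-contained.

// Editorial sketch, not part of the diff.
use std::collections::HashMap;

fn main() {
    // tenant id -> (timeline id -> init status), flattened to strings here.
    let mut statuses: HashMap<String, HashMap<String, &'static str>> = HashMap::new();
    statuses
        .entry("tenant-a".to_string())
        .or_default()
        .insert("timeline-1".to_string(), "LocallyComplete");
    statuses
        .entry("tenant-a".to_string())
        .or_default()
        .insert("timeline-2".to_string(), "NeedsSync");

    for (tenant_id, timelines) in &statuses {
        for (timeline_id, status) in timelines {
            match *status {
                // Locally complete timelines are registered in the repository right away.
                "LocallyComplete" => println!("register {timeline_id} for {tenant_id}"),
                // Timelines that need sync are skipped until the storage sync finishes.
                _ => println!("skip {timeline_id} for {tenant_id} until synced"),
            }
        }
    }
}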
|
||||
|
||||
@@ -23,8 +23,8 @@ use utils::{
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::TimelineMetadata,
|
||||
remote_storage::RemoteIndex,
|
||||
repository::{LocalTimelineState, Repository},
|
||||
storage_sync::index::RemoteIndex,
|
||||
tenant_config::TenantConfOpt,
|
||||
DatadirTimeline, RepositoryImpl,
|
||||
};
|
||||
|
||||
@@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"pageserver_io_time",
|
||||
"Time spent in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
@@ -43,8 +43,8 @@ lazy_static! {
|
||||
}
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
"pageserver_io_size",
|
||||
"Amount of bytes",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
@@ -21,10 +21,8 @@
|
||||
//! redo Postgres process, but some records it can handle directly with
|
||||
//! bespoke Rust code.
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use postgres_ffi::{page_is_new, page_set_lsn};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
@@ -84,7 +82,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
|
||||
) -> Result<()> {
|
||||
let mut modification = timeline.begin_modification(lsn);
|
||||
|
||||
let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?;
|
||||
let mut decoded = decode_wal_record(recdata);
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
@@ -253,7 +251,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
|
||||
|
||||
// If checkpoint data was updated, store the new version in the repository
|
||||
if self.checkpoint_modified {
|
||||
let new_checkpoint_bytes = self.checkpoint.encode()?;
|
||||
let new_checkpoint_bytes = self.checkpoint.encode();
|
||||
|
||||
modification.put_checkpoint(new_checkpoint_bytes)?;
|
||||
self.checkpoint_modified = false;
|
||||
@@ -305,14 +303,8 @@ impl<'a, R: Repository> WalIngest<'a, R> {
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, lsn)
|
||||
}
|
||||
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
|
||||
self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
|
||||
} else {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
//!
|
||||
//! Functions for parsing WAL records.
|
||||
//!
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
@@ -10,7 +9,6 @@ use postgres_ffi::{BlockNumber, OffsetNumber};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
|
||||
/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
|
||||
@@ -505,7 +503,7 @@ impl XlMultiXactTruncate {
|
||||
// block data
|
||||
// ...
|
||||
// main data
|
||||
pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeError> {
|
||||
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
@@ -516,7 +514,7 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
|
||||
// 1. Parse XLogRecord struct
|
||||
|
||||
// FIXME: assume little-endian here
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf)?;
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
trace!(
|
||||
"decode_wal_record xl_rmid = {} xl_info = {}",
|
||||
@@ -744,32 +742,34 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
|
||||
assert_eq!(buf.remaining(), main_data_len as usize);
|
||||
}
|
||||
|
||||
Ok(DecodedWALRecord {
|
||||
DecodedWALRecord {
|
||||
xl_xid: xlogrec.xl_xid,
|
||||
xl_info: xlogrec.xl_info,
|
||||
xl_rmid: xlogrec.xl_rmid,
|
||||
record,
|
||||
blocks,
|
||||
main_data_offset,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Build a human-readable string to describe a WAL record
|
||||
///
|
||||
/// For debugging purposes
|
||||
pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result<String, DeserializeError> {
|
||||
pub fn describe_wal_record(rec: &ZenithWalRecord) -> String {
|
||||
match rec {
|
||||
ZenithWalRecord::Postgres { will_init, rec } => Ok(format!(
|
||||
"will_init: {}, {}",
|
||||
will_init,
|
||||
describe_postgres_wal_record(rec)?
|
||||
)),
|
||||
_ => Ok(format!("{:?}", rec)),
|
||||
ZenithWalRecord::Postgres { will_init, rec } => {
|
||||
format!(
|
||||
"will_init: {}, {}",
|
||||
will_init,
|
||||
describe_postgres_wal_record(rec)
|
||||
)
|
||||
}
|
||||
_ => format!("{:?}", rec),
|
||||
}
|
||||
}
|
||||
|
||||
fn describe_postgres_wal_record(record: &Bytes) -> Result<String, DeserializeError> {
|
||||
fn describe_postgres_wal_record(record: &Bytes) -> String {
|
||||
// TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this.
|
||||
// Maybe use the postgres wal redo process, the same used for replaying WAL records?
|
||||
// Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly,
|
||||
@@ -782,7 +782,7 @@ fn describe_postgres_wal_record(record: &Bytes) -> Result<String, DeserializeErr
|
||||
// 1. Parse XLogRecord struct
|
||||
|
||||
// FIXME: assume little-endian here
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf)?;
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
let unknown_str: String;
|
||||
|
||||
@@ -830,5 +830,5 @@ fn describe_postgres_wal_record(record: &Bytes) -> Result<String, DeserializeErr
|
||||
}
|
||||
};
|
||||
|
||||
Ok(String::from(result))
|
||||
String::from(result)
|
||||
}
|
||||
|
||||
@@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
// each tenant.
|
||||
lazy_static! {
|
||||
static ref WAL_REDO_TIME: Histogram =
|
||||
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
|
||||
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"pageserver_wal_redo_wait_time",
|
||||
"Time spent waiting for access to the WAL redo process"
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
"pageserver_wal_records_replayed",
|
||||
"Number of WAL records replayed"
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
poetry.lock (generated file; 30 changed lines)
@@ -822,7 +822,7 @@ python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "moto"
|
||||
version = "3.1.7"
|
||||
version = "3.0.4"
|
||||
description = "A library that allows your python tests to easily mock out the boto library"
|
||||
category = "main"
|
||||
optional = false
|
||||
@@ -844,7 +844,6 @@ importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
|
||||
Jinja2 = ">=2.10.1"
|
||||
jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
|
||||
MarkupSafe = "!=2.0.0a1"
|
||||
pyparsing = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""}
|
||||
python-dateutil = ">=2.1,<3.0.0"
|
||||
python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
|
||||
pytz = "*"
|
||||
@@ -856,7 +855,7 @@ werkzeug = "*"
|
||||
xmltodict = "*"
|
||||
|
||||
[package.extras]
|
||||
all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools"]
|
||||
all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools"]
|
||||
apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"]
|
||||
apigatewayv2 = ["PyYAML (>=5.1)"]
|
||||
appsync = ["graphql-core"]
|
||||
@@ -865,16 +864,14 @@ batch = ["docker (>=2.5.1)"]
|
||||
cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"]
|
||||
cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"]
|
||||
ds = ["sshpubkeys (>=3.1.0)"]
|
||||
dynamodb = ["docker (>=2.5.1)"]
|
||||
dynamodb2 = ["docker (>=2.5.1)"]
|
||||
dynamodbstreams = ["docker (>=2.5.1)"]
|
||||
ec2 = ["sshpubkeys (>=3.1.0)"]
|
||||
efs = ["sshpubkeys (>=3.1.0)"]
|
||||
glue = ["pyparsing (>=3.0.0)"]
|
||||
iotdata = ["jsondiff (>=1.1.2)"]
|
||||
route53resolver = ["sshpubkeys (>=3.1.0)"]
|
||||
s3 = ["PyYAML (>=5.1)"]
|
||||
server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools", "flask", "flask-cors"]
|
||||
server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools", "flask", "flask-cors"]
|
||||
ssm = ["PyYAML (>=5.1)", "dataclasses"]
|
||||
xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
|
||||
|
||||
@@ -1071,17 +1068,6 @@ python-versions = ">=3.6"
|
||||
py = "*"
|
||||
pytest = ">=3.10"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-lazy-fixture"
|
||||
version = "0.6.3"
|
||||
description = "It helps to use fixtures in pytest.mark.parametrize"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[package.dependencies]
|
||||
pytest = ">=3.2.5"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-xdist"
|
||||
version = "2.5.0"
|
||||
@@ -1375,7 +1361,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.7"
|
||||
content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e"
|
||||
content-hash = "58762accad4122026c650fa43421a900546e89f9908e2268410e7b11cc8c6c4e"
|
||||
|
||||
[metadata.files]
|
||||
aiopg = [
|
||||
@@ -1693,8 +1679,8 @@ mccabe = [
|
||||
{file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
|
||||
]
|
||||
moto = [
|
||||
{file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"},
|
||||
{file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"},
|
||||
{file = "moto-3.0.4-py2.py3-none-any.whl", hash = "sha256:79646213d8438385182f4eea79e28725f94b3d0d3dc9a3eda81db47e0ebef6cc"},
|
||||
{file = "moto-3.0.4.tar.gz", hash = "sha256:168b8a3cb4dd8a6df8e51d582761cefa9657b9f45ac7e1eb24dae394ebc9e000"},
|
||||
]
|
||||
mypy = [
|
||||
{file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"},
|
||||
@@ -1869,10 +1855,6 @@ pytest-forked = [
|
||||
{file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"},
|
||||
{file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"},
|
||||
]
|
||||
pytest-lazy-fixture = [
|
||||
{file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
|
||||
{file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
|
||||
]
|
||||
pytest-xdist = [
|
||||
{file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"},
|
||||
{file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"},
|
||||
|
||||
@@ -33,7 +33,6 @@ tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-rustls = "0.23.0"
|
||||
url = "2.2.2"
|
||||
git-version = "0.3.5"
|
||||
|
||||
utils = { path = "../libs/utils" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
|
||||
@@ -117,7 +117,7 @@ async fn get_auth_info(
|
||||
let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?;
|
||||
|
||||
url.query_pairs_mut()
|
||||
.append_pair("project", cluster)
|
||||
.append_pair("cluster", cluster)
|
||||
.append_pair("role", user);
|
||||
|
||||
// TODO: use a proper logger
|
||||
@@ -141,7 +141,7 @@ async fn wake_compute(
|
||||
cluster: &str,
|
||||
) -> Result<(String, u16), ConsoleAuthError> {
|
||||
let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?;
|
||||
url.query_pairs_mut().append_pair("project", cluster);
|
||||
url.query_pairs_mut().append_pair("cluster", cluster);
|
||||
|
||||
// TODO: use a proper logger
|
||||
println!("cplane request: {}", url);
|
||||
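A tiny runnable sketch (not from the commits above) of the control-plane request shape built in the hunk just shown, using one of the two query-parameter spellings it toggles between ("project" vs "cluster"); the endpoint host and cluster name are made up, and `url::Url` is the same type `reqwest::Url` re-exports.

// Editorial sketch, not part of the diff.
use url::Url;

fn main() -> anyhow::Result<()> {
    let auth_endpoint = "http://console.local/api";
    let mut url = Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?;
    // Append the cluster identifier exactly as the proxy does for its query string.
    url.query_pairs_mut().append_pair("cluster", "quiet-sea-123456");
    assert_eq!(
        url.as_str(),
        "http://console.local/api/proxy_wake_compute?cluster=quiet-sea-123456"
    );
    Ok(())
}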
|
||||
@@ -25,9 +25,7 @@ use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use std::{future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use utils::GIT_VERSION;
|
||||
|
||||
/// Flattens `Result<Result<T>>` into `Result<T>`.
|
||||
async fn flatten_err(
|
||||
@@ -126,7 +124,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
}));
|
||||
|
||||
println!("Version: {GIT_VERSION}");
|
||||
println!("Version: {}", GIT_VERSION);
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting http on {}", http_address);
|
||||
|
||||
@@ -22,7 +22,6 @@ boto3 = "^1.20.40"
|
||||
boto3-stubs = "^1.20.40"
|
||||
moto = {version = "^3.0.0", extras = ["server"]}
|
||||
backoff = "^1.11.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
yapf = "==0.31.0"
|
||||
|
||||
@@ -24,18 +24,18 @@ walkdir = "2"
|
||||
url = "2.2.2"
|
||||
signal-hook = "0.3.10"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
serde_with = {version = "1.12.0"}
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
etcd-client = "0.8.3"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
git-version = "0.3.5"
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -17,19 +17,16 @@ use url::{ParseError, Url};
|
||||
use safekeeper::control_file::{self};
|
||||
use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
||||
use safekeeper::remove_wal;
|
||||
use safekeeper::timeline::GlobalTimelines;
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use safekeeper::{broker, callmemaybe};
|
||||
use safekeeper::{http, s3_offload};
|
||||
use utils::{
|
||||
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
|
||||
zid::ZNodeId,
|
||||
http::endpoint, logging, shutdown::exit_now, signals, tcp_listener, zid::ZNodeId, GIT_VERSION,
|
||||
};
|
||||
|
||||
const LOCK_FILE_NAME: &str = "safekeeper.lock";
|
||||
const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> Result<()> {
|
||||
metrics::set_common_metrics_prefix("safekeeper");
|
||||
@@ -112,20 +109,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("broker-etcd-prefix")
|
||||
.long("broker-etcd-prefix")
|
||||
.takes_value(true)
|
||||
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enable-s3-offload")
|
||||
.long("enable-s3-offload")
|
||||
.takes_value(true)
|
||||
.default_value("true")
|
||||
.default_missing_value("true")
|
||||
.help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("dump-control-file") {
|
||||
@@ -135,7 +118,7 @@ fn main() -> Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut conf = SafeKeeperConf::default();
|
||||
let mut conf: SafeKeeperConf = Default::default();
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
// change into the data directory.
|
||||
@@ -179,16 +162,6 @@ fn main() -> Result<()> {
|
||||
let collected_ep: Result<Vec<Url>, ParseError> = addr.split(',').map(Url::parse).collect();
|
||||
conf.broker_endpoints = Some(collected_ep?);
|
||||
}
|
||||
if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") {
|
||||
conf.broker_etcd_prefix = prefix.to_string();
|
||||
}
|
||||
|
||||
// Seems like there is no better way to accept bool values explicitly in clap.
|
||||
conf.s3_offload_enabled = arg_matches
|
||||
.value_of("enable-s3-offload")
|
||||
.unwrap()
|
||||
.parse()
|
||||
.context("failed to parse bool enable-s3-offload bool")?;
|
||||
|
||||
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
||||
}
|
||||
@@ -196,7 +169,7 @@ fn main() -> Result<()> {
|
||||
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: bool) -> Result<()> {
|
||||
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
||||
|
||||
info!("version: {GIT_VERSION}");
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(LOCK_FILE_NAME);
|
||||
@@ -252,8 +225,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel();
|
||||
GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx);
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
@@ -282,12 +253,13 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
||||
);
|
||||
}
|
||||
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let conf_cloned = conf.clone();
|
||||
let safekeeper_thread = thread::Builder::new()
|
||||
.name("Safekeeper thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = wal_service::thread_main(conf_cloned, pg_listener);
|
||||
let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx);
|
||||
if let Err(e) = thread_result {
|
||||
info!("safekeeper thread terminated: {}", e);
|
||||
}
|
||||
@@ -301,7 +273,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
||||
.name("callmemaybe thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx);
|
||||
let thread_result = callmemaybe::thread_main(conf_cloned, rx);
|
||||
if let Err(e) = thread_result {
|
||||
error!("callmemaybe thread terminated: {}", e);
|
||||
}
|
||||
|
||||
@@ -1,22 +1,61 @@
|
||||
//! Communication with etcd, providing safekeeper peers and pageserver coordination.
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
use anyhow::Error;
|
||||
use anyhow::Result;
|
||||
use etcd_broker::Client;
|
||||
use etcd_broker::PutOptions;
|
||||
use etcd_broker::SkTimelineSubscriptionKind;
|
||||
use etcd_client::Client;
|
||||
use etcd_client::EventType;
|
||||
use etcd_client::PutOptions;
|
||||
use etcd_client::WatchOptions;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{runtime, time::sleep};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
|
||||
use utils::zid::{ZNodeId, ZTenantTimelineId};
|
||||
use crate::{safekeeper::Term, timeline::GlobalTimelines, SafeKeeperConf};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||
const LEASE_TTL_SEC: i64 = 5;
|
||||
// TODO: add global zenith installation ID.
|
||||
const ZENITH_PREFIX: &str = "zenith";
|
||||
|
||||
/// Published data about safekeeper. Fields made optional for easy migrations.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SafekeeperInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<Term>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub s3_wal_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
}
|
||||
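A self-contained sketch (not from the commits above) of what this broker payload looks like on the wire. `MiniInfo` is a cut-down stand-in for `SafekeeperInfo` with a plain `u64` in place of `Lsn`; with the real `Lsn` type the string would be its `Display` form, and all concrete values here are invented.

// Editorial sketch, not part of the diff.
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

#[serde_as]
#[derive(Debug, Deserialize, Serialize)]
struct MiniInfo {
    last_log_term: Option<u64>,
    // DisplayFromStr serializes the inner value via Display and parses it back
    // via FromStr, so this field travels as a JSON string, not a number.
    #[serde_as(as = "Option<DisplayFromStr>")]
    #[serde(default)]
    flush_lsn: Option<u64>,
}

fn main() -> anyhow::Result<()> {
    let info = MiniInfo {
        last_log_term: Some(3),
        flush_lsn: Some(23_761_288),
    };
    // Prints: {"last_log_term":3,"flush_lsn":"23761288"}
    println!("{}", serde_json::to_string(&info)?);
    Ok(())
}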
|
||||
pub fn thread_main(conf: SafeKeeperConf) {
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
@@ -32,21 +71,22 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
||||
});
|
||||
}
|
||||
|
||||
/// Key to per timeline per safekeeper data.
|
||||
fn timeline_safekeeper_path(
|
||||
broker_prefix: String,
|
||||
zttid: ZTenantTimelineId,
|
||||
sk_id: ZNodeId,
|
||||
) -> String {
|
||||
/// Prefix to timeline related data.
|
||||
fn timeline_path(zttid: &ZTenantTimelineId) -> String {
|
||||
format!(
|
||||
"{}/{sk_id}",
|
||||
SkTimelineSubscriptionKind::timeline(broker_prefix, zttid).watch_key()
|
||||
"{}/{}/{}",
|
||||
ZENITH_PREFIX, zttid.tenant_id, zttid.timeline_id
|
||||
)
|
||||
}
|
||||
|
||||
/// Key to per timeline per safekeeper data.
|
||||
fn timeline_safekeeper_path(zttid: &ZTenantTimelineId, sk_id: ZNodeId) -> String {
|
||||
format!("{}/safekeeper/{}", timeline_path(zttid), sk_id)
|
||||
}
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?;
|
||||
async fn push_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||
let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?;
|
||||
|
||||
// Get and maintain lease to automatically delete obsolete data
|
||||
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
||||
@@ -58,17 +98,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
// is under plain mutex. That's ok, all this code is not performance
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
for zttid in GlobalTimelines::get_active_timelines() {
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
||||
let sk_info = tli.get_public_info(&conf)?;
|
||||
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||
for zttid in &active_tlis {
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
|
||||
let sk_info = tli.get_public_info();
|
||||
let put_opts = PutOptions::new().with_lease(lease.id());
|
||||
client
|
||||
.put(
|
||||
timeline_safekeeper_path(
|
||||
conf.broker_etcd_prefix.clone(),
|
||||
zttid,
|
||||
conf.my_id,
|
||||
),
|
||||
timeline_safekeeper_path(zttid, conf.my_id),
|
||||
serde_json::to_string(&sk_info)?,
|
||||
Some(put_opts),
|
||||
)
|
||||
@@ -91,31 +128,45 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||
let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?;
|
||||
|
||||
let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates(
|
||||
&mut client,
|
||||
SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to subscribe for safekeeper info")?;
|
||||
|
||||
lazy_static! {
|
||||
static ref TIMELINE_SAFEKEEPER_RE: Regex =
|
||||
Regex::new(r"^zenith/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$")
|
||||
.unwrap();
|
||||
}
|
||||
let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?;
|
||||
loop {
|
||||
match subscription.fetch_data().await {
|
||||
Some(new_info) => {
|
||||
for (zttid, sk_info) in new_info {
|
||||
// note: there are blocking operations below, but it's considered fine for now
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
||||
for (safekeeper_id, info) in sk_info {
|
||||
tli.record_safekeeper_info(&info, safekeeper_id)?
|
||||
let wo = WatchOptions::new().with_prefix();
|
||||
// TODO: subscribe only to my timelines
|
||||
let (_, mut stream) = client.watch(ZENITH_PREFIX, Some(wo)).await?;
|
||||
while let Some(resp) = stream.message().await? {
|
||||
if resp.canceled() {
|
||||
bail!("watch canceled");
|
||||
}
|
||||
|
||||
for event in resp.events() {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(kv) = event.kv() {
|
||||
if let Some(caps) = TIMELINE_SAFEKEEPER_RE.captures(kv.key_str()?) {
|
||||
let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let zttid = ZTenantTimelineId::new(tenant_id, timeline_id);
|
||||
let safekeeper_id = ZNodeId(caps.get(3).unwrap().as_str().parse()?);
|
||||
let value_str = kv.value_str()?;
|
||||
match serde_json::from_str::<SafekeeperInfo>(value_str) {
|
||||
Ok(safekeeper_info) => {
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
||||
tli.record_safekeeper_info(&safekeeper_info, safekeeper_id)?
|
||||
}
|
||||
}
|
||||
Err(err) => warn!(
|
||||
"failed to deserialize safekeeper info {}: {}",
|
||||
value_str, err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
debug!("timeline updates sender closed, aborting the pull loop");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
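A runnable sketch (not from the commits above) that round-trips the etcd key layout used by the pull loop just shown. The ids are invented; note the pattern in the hunk captures only a single digit for the node id (`[[:digit:]]`), so this sketch uses `[[:digit:]]+` to let multi-digit safekeeper ids parse as well.

// Editorial sketch, not part of the diff.
use regex::Regex;

fn main() {
    let key = "zenith/11223344556677881122334455667788/aabbccddeeff00112233445566778899/safekeeper/12";
    let re = Regex::new(
        r"^zenith/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$",
    )
    .unwrap();
    let caps = re.captures(key).expect("key should match the layout");
    let tenant_id = caps.get(1).unwrap().as_str();
    let timeline_id = caps.get(2).unwrap().as_str();
    let node_id: u64 = caps.get(3).unwrap().as_str().parse().unwrap();
    println!("tenant={tenant_id} timeline={timeline_id} safekeeper={node_id}");
}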
|
||||
@@ -103,43 +103,6 @@ pub struct SafeKeeperStateV3 {
|
||||
pub wal_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SafeKeeperStateV4 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: ZTenantId,
|
||||
/// Zenith timelineid
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: ZTimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Part of WAL acknowledged by quorum and available locally. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// First LSN not yet offloaded to s3. Useful to persist to avoid finding
|
||||
/// out offloading progress on boot.
|
||||
pub s3_wal_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// Peers and their state as we remember it. Knowing peers themselves is
|
||||
// fundamental; but state is saved here only for informational purposes and
|
||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
// place to have less file version upgrades).
|
||||
pub peers: Peers,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
// migrate to storing full term history
if version == 1 {
@@ -162,8 +125,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
wal_seg_size: oldstate.server.wal_seg_size,
},
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: Lsn(0),
local_start_lsn: Lsn(0),
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
@@ -185,8 +146,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: Lsn(0),
local_start_lsn: Lsn(0),
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
@@ -208,37 +167,12 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: Lsn(0),
local_start_lsn: Lsn(0),
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
// migrate to having timeline_start_lsn
} else if version == 4 {
info!("reading safekeeper control file version {}", version);
let oldstate = SafeKeeperStateV4::des(&buf[..buf.len()])?;
let server = ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
};
return Ok(SafeKeeperState {
tenant_id: oldstate.tenant_id,
timeline_id: oldstate.timeline_id,
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: Lsn(0),
local_start_lsn: Lsn(0),
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.peer_horizon_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
}
bail!("unsupported safekeeper control file version {}", version)
}

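The migration above is only reached from the loading path. A rough sketch of how a caller might dispatch on the stored format version, reusing the `SK_MAGIC` and `SK_FORMAT_VERSION` constants that appear later in this diff; `des_current` is a hypothetical stand-in for deserializing the current `SafeKeeperState` layout, so this is not the actual control_file.rs code:

// Hedged sketch of the loading side of the control file format.
use anyhow::{bail, Result};
use byteorder::{LittleEndian, ReadBytesExt};

fn load_control_file(mut buf: &[u8]) -> Result<SafeKeeperState> {
    let magic = buf.read_u32::<LittleEndian>()?;
    if magic != SK_MAGIC {
        bail!("bad control file magic {:x}", magic);
    }
    let version = buf.read_u32::<LittleEndian>()?;
    if version == SK_FORMAT_VERSION {
        return des_current(buf); // hypothetical: deserialize the current format
    }
    // Older on-disk formats are migrated field by field, as shown above.
    upgrade_control_file(buf, version)
}
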
@@ -21,6 +21,9 @@ use utils::{
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
};

use crate::callmemaybe::CallmeEvent;
use tokio::sync::mpsc::UnboundedSender;

/// Safekeeper handler of postgres commands
pub struct SafekeeperPostgresHandler {
pub conf: SafeKeeperConf,
@@ -30,6 +33,8 @@ pub struct SafekeeperPostgresHandler {
pub ztimelineid: Option<ZTimelineId>,
pub timeline: Option<Arc<Timeline>>,
pageserver_connstr: Option<String>,
//sender to communicate with callmemaybe thread
pub tx: UnboundedSender<CallmeEvent>,
}

/// Parsed Postgres command.
@@ -135,7 +140,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
}

impl SafekeeperPostgresHandler {
pub fn new(conf: SafeKeeperConf) -> Self {
pub fn new(conf: SafeKeeperConf, tx: UnboundedSender<CallmeEvent>) -> Self {
SafekeeperPostgresHandler {
conf,
appname: None,
@@ -143,6 +148,7 @@ impl SafekeeperPostgresHandler {
ztimelineid: None,
timeline: None,
pageserver_connstr: None,
tx,
}
}

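This change threads a callmemaybe sender through every connection handler instead of having the timeline own it. A self-contained sketch of that pattern, with tokio assumed as the runtime and a stand-in `CallmeEvent` enum (the real one lives in `crate::callmemaybe` and has different variants):

// Hedged sketch of the handler/callmemaybe wiring: one unbounded channel, the
// sender cloned into every connection handler, the receiver drained by a single
// background task.
use tokio::sync::mpsc;

enum CallmeEvent {
    Pause(String),
    Resume(String),
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<CallmeEvent>();

    // The callmemaybe task owns the receiving end and drains events.
    let callmemaybe = tokio::spawn(async move {
        while let Some(event) = rx.recv().await {
            match event {
                CallmeEvent::Pause(key) => println!("pause {key}"),
                CallmeEvent::Resume(key) => println!("resume {key}"),
            }
        }
    });

    // Each connection handler gets its own clone of the sender, mirroring
    // SafekeeperPostgresHandler::new(conf, tx) above.
    let handler_tx = tx.clone();
    handler_tx.send(CallmeEvent::Pause("tenant/timeline".into())).unwrap();

    // Dropping all senders lets the receiving loop finish.
    drop(handler_tx);
    drop(tx);
    callmemaybe.await.unwrap();
}
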
@@ -1,22 +1,21 @@
|
||||
use etcd_broker::SkTimelineInfo;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::Serializer;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::broker::SafekeeperInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::safekeeper::TermHistory;
|
||||
use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult};
|
||||
use crate::timeline::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
use utils::{
|
||||
http::{
|
||||
endpoint,
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
request::{ensure_no_body, parse_request_param},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
lsn::Lsn,
|
||||
@@ -70,10 +69,6 @@ struct TimelineStatus {
|
||||
timeline_id: ZTimelineId,
|
||||
acceptor_state: AcceptorStateStatus,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
timeline_start_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
local_start_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
commit_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
s3_wal_lsn: Lsn,
|
||||
@@ -107,8 +102,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
tenant_id: zttid.tenant_id,
|
||||
timeline_id: zttid.timeline_id,
|
||||
acceptor_state: acc_state,
|
||||
timeline_start_lsn: state.timeline_start_lsn,
|
||||
local_start_lsn: state.local_start_lsn,
|
||||
commit_lsn: inmem.commit_lsn,
|
||||
s3_wal_lsn: inmem.s3_wal_lsn,
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
@@ -131,51 +124,13 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
json_response(StatusCode::CREATED, ())
|
||||
}
|
||||
|
||||
/// Deactivates the timeline and removes its data directory.
///
/// It does not try to stop any processing of the timeline; there is no such code at the time of writing.
/// However, it tries to check whether the timeline was active and report it to the caller just in case.
/// Note that this information is inaccurate:
/// 1. There is a race condition between checking the timeline for activity and actual directory deletion.
/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing.
async fn timeline_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let zttid = ZTenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
ensure_no_body(&mut request).await?;
json_response(
StatusCode::OK,
GlobalTimelines::delete_force(get_conf(&request), &zttid).map_err(ApiError::from_err)?,
)
}

/// Deactivates all timelines for the tenant and removes its data directory.
/// See `timeline_delete_force_handler`.
async fn tenant_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id = parse_request_param(&request, "tenant_id")?;
ensure_no_body(&mut request).await?;
json_response(
StatusCode::OK,
GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id)
.map_err(ApiError::from_err)?
.iter()
.map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp))
.collect::<HashMap<String, TimelineDeleteForceResult>>(),
)
}

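These handlers are wired to DELETE routes further down (`/v1/tenant/:tenant_id/timeline/:timeline_id` and `/v1/tenant/:tenant_id`). A hedged sketch of invoking the timeline variant from a client, assuming the safekeeper's default HTTP listen address 127.0.0.1:7676 and the `reqwest` (blocking + json features) and `serde_json` crates; none of this client code is part of the diff:

// Hedged sketch: force-delete one timeline and read back TimelineDeleteForceResult,
// which serializes roughly as {"dir_existed": true, "was_active": false}.
fn delete_timeline_force(tenant_id: &str, timeline_id: &str) -> anyhow::Result<serde_json::Value> {
    let url = format!("http://127.0.0.1:7676/v1/tenant/{tenant_id}/timeline/{timeline_id}");
    let resp = reqwest::blocking::Client::new().delete(url).send()?;
    Ok(resp.json()?)
}
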
/// Used only in tests to hand craft required data.
|
||||
async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let zttid = ZTenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
|
||||
let safekeeper_info: SafekeeperInfo = json_request(&mut request).await?;
|
||||
|
||||
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
||||
tli.record_safekeeper_info(&safekeeper_info, ZNodeId(1))?;
|
||||
@@ -194,11 +149,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
timeline_status_handler,
|
||||
)
|
||||
.post("/v1/timeline", timeline_create_handler)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_force_handler,
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
|
||||
// for tests
|
||||
.post(
|
||||
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
|
||||
|
||||
@@ -95,7 +95,7 @@ pub fn handle_json_ctrl(
|
||||
/// by sending ProposerGreeting with default server.wal_seg_size.
|
||||
fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> {
|
||||
let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting {
|
||||
protocol_version: 2, // current protocol
|
||||
protocol_version: 1, // current protocol
|
||||
pg_version: 0, // unknown
|
||||
proposer_id: [0u8; 16],
|
||||
system_id: 0,
|
||||
@@ -124,7 +124,6 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L
|
||||
term,
|
||||
start_streaming_at: lsn,
|
||||
term_history: history,
|
||||
timeline_start_lsn: Lsn(0),
|
||||
});
|
||||
|
||||
spg.timeline.get().process_msg(&proposer_elected_request)?;
|
||||
@@ -239,13 +238,13 @@ fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
xl_crc: 0, // crc will be calculated later
|
||||
};
|
||||
|
||||
let header_bytes = header.encode().expect("failed to encode header");
|
||||
let header_bytes = header.encode();
|
||||
let crc = crc32c_append(0, &data);
|
||||
let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
let mut wal: Vec<u8> = Vec::new();
|
||||
wal.extend_from_slice(&header.encode().expect("failed to encode header"));
|
||||
wal.extend_from_slice(&header.encode());
|
||||
wal.extend_from_slice(&data);
|
||||
|
||||
// WAL start position must be aligned at 8 bytes,
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use url::Url;
|
||||
|
||||
use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId};
|
||||
use utils::zid::{ZNodeId, ZTenantTimelineId};
|
||||
|
||||
pub mod broker;
|
||||
pub mod callmemaybe;
|
||||
@@ -27,11 +27,10 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_NEON_BROKER_PREFIX: &str = "neon";
|
||||
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(1);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -52,17 +51,12 @@ pub struct SafeKeeperConf {
pub recall_period: Duration,
pub my_id: ZNodeId,
pub broker_endpoints: Option<Vec<Url>>,
pub broker_etcd_prefix: String,
pub s3_offload_enabled: bool,
}

impl SafeKeeperConf {
pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf {
self.workdir.join(tenant_id.to_string())
}

pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf {
self.tenant_dir(&zttid.tenant_id)
self.workdir
.join(zttid.tenant_id.to_string())
.join(zttid.timeline_id.to_string())
}
}
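
Either way the resulting on-disk layout is the same: a timeline's data lives at `<workdir>/<tenant_id>/<timeline_id>`, with both ids rendered as hex strings; the only difference between the two variants shown is whether the path goes through `tenant_dir` or is joined directly (the placeholder path here is illustrative, not a real deployment path).
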
@@ -82,8 +76,6 @@ impl Default for SafeKeeperConf {
|
||||
recall_period: defaults::DEFAULT_RECALL_PERIOD,
|
||||
my_id: ZNodeId(0),
|
||||
broker_endpoints: None,
|
||||
broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(),
|
||||
s3_offload_enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
|
||||
use bytes::BytesMut;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use tracing::*;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
@@ -27,6 +28,8 @@ use utils::{
|
||||
sock_split::ReadStream,
|
||||
};
|
||||
|
||||
use crate::callmemaybe::CallmeEvent;
|
||||
|
||||
pub struct ReceiveWalConn<'pg> {
|
||||
/// Postgres connection
|
||||
pg_backend: &'pg mut PostgresBackend,
|
||||
@@ -88,9 +91,10 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
// Register the connection and defer unregister.
|
||||
spg.timeline
|
||||
.get()
|
||||
.on_compute_connect(self.pageserver_connstr.as_ref())?;
|
||||
.on_compute_connect(self.pageserver_connstr.as_ref(), &spg.tx)?;
|
||||
let _guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(spg.timeline.get()),
|
||||
callmemaybe_tx: spg.tx.clone(),
|
||||
};
|
||||
|
||||
let mut next_msg = Some(next_msg);
|
||||
@@ -190,10 +194,13 @@ impl ProposerPollStream {
|
||||
|
||||
struct ComputeConnectionGuard {
|
||||
timeline: Arc<Timeline>,
|
||||
callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||
}
|
||||
|
||||
impl Drop for ComputeConnectionGuard {
|
||||
fn drop(&mut self) {
|
||||
self.timeline.on_compute_disconnect().unwrap();
|
||||
self.timeline
|
||||
.on_compute_disconnect(&self.callmemaybe_tx)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
||||
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||
for zttid in &active_tlis {
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
|
||||
if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) {
|
||||
if let Err(e) = tli.remove_old_wal() {
|
||||
warn!(
|
||||
"failed to remove WAL for tenant {} timeline {}: {}",
|
||||
tli.zttid.tenant_id, tli.zttid.timeline_id, e
|
||||
|
||||
@@ -1,23 +1,20 @@
|
||||
//
|
||||
// Offload old WAL segments to S3 and remove them locally
|
||||
// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set
|
||||
// if no IAM bucket access is used.
|
||||
//
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::Context;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey,
|
||||
};
|
||||
use rusoto_core::credential::StaticProvider;
|
||||
use rusoto_core::{HttpClient, Region};
|
||||
use rusoto_s3::{ListObjectsV2Request, PutObjectRequest, S3Client, StreamingBody, S3};
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::io::BufReader;
|
||||
use tokio::runtime;
|
||||
use tokio::time::sleep;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
@@ -42,8 +39,9 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
||||
}
|
||||
|
||||
async fn offload_files(
|
||||
remote_storage: &S3Bucket,
|
||||
listing: &HashSet<S3ObjectKey>,
|
||||
client: &S3Client,
|
||||
bucket_name: &str,
|
||||
listing: &HashSet<String>,
|
||||
dir_path: &Path,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<u64> {
|
||||
@@ -57,12 +55,17 @@ async fn offload_files(
|
||||
&& IsXLogFileName(entry.file_name().to_str().unwrap())
|
||||
&& entry.metadata().unwrap().created().unwrap() <= horizon
|
||||
{
|
||||
let remote_path = remote_storage.remote_object_id(path)?;
|
||||
if !listing.contains(&remote_path) {
|
||||
let relpath = path.strip_prefix(&conf.workdir).unwrap();
|
||||
let s3path = String::from("walarchive/") + relpath.to_str().unwrap();
|
||||
if !listing.contains(&s3path) {
|
||||
let file = File::open(&path).await?;
|
||||
let file_length = file.metadata().await?.len() as usize;
|
||||
remote_storage
|
||||
.upload(BufReader::new(file), file_length, &remote_path, None)
|
||||
client
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new(ReaderStream::new(file))),
|
||||
bucket: bucket_name.to_string(),
|
||||
key: s3path,
|
||||
..PutObjectRequest::default()
|
||||
})
|
||||
.await?;
|
||||
|
||||
fs::remove_file(&path).await?;
|
||||
@@ -74,34 +77,58 @@ async fn offload_files(
|
||||
}
|
||||
|
||||
async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let remote_storage = match GenericRemoteStorage::new(
|
||||
conf.workdir.clone(),
|
||||
&RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(10).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(1).unwrap(),
|
||||
storage: remote_storage::RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: "zenith-testbucket".to_string(),
|
||||
bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?,
|
||||
prefix_in_bucket: Some("walarchive/".to_string()),
|
||||
endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?),
|
||||
concurrency_limit: NonZeroUsize::new(20).unwrap(),
|
||||
}),
|
||||
},
|
||||
)? {
|
||||
GenericRemoteStorage::Local(_) => {
|
||||
bail!("Unexpected: got local storage for the remote config")
|
||||
}
|
||||
GenericRemoteStorage::S3(remote_storage) => remote_storage,
|
||||
let region = Region::Custom {
|
||||
name: env::var("S3_REGION").context("S3_REGION env var is not set")?,
|
||||
endpoint: env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?,
|
||||
};
|
||||
|
||||
let client = S3Client::new_with(
|
||||
HttpClient::new().context("Failed to create S3 http client")?,
|
||||
StaticProvider::new_minimal(
|
||||
env::var("S3_ACCESSKEY").context("S3_ACCESSKEY env var is not set")?,
|
||||
env::var("S3_SECRET").context("S3_SECRET env var is not set")?,
|
||||
),
|
||||
region,
|
||||
);
|
||||
|
||||
let bucket_name = "zenith-testbucket";
|
||||
|
||||
loop {
|
||||
let listing = remote_storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?;
|
||||
info!("Offload {n} files to S3");
|
||||
let listing = gather_wal_entries(&client, bucket_name).await?;
|
||||
let n = offload_files(&client, bucket_name, &listing, &conf.workdir, conf).await?;
|
||||
info!("Offload {} files to S3", n);
|
||||
sleep(conf.ttl.unwrap()).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn gather_wal_entries(
|
||||
client: &S3Client,
|
||||
bucket_name: &str,
|
||||
) -> anyhow::Result<HashSet<String>> {
|
||||
let mut document_keys = HashSet::new();
|
||||
|
||||
let mut continuation_token = None::<String>;
|
||||
loop {
|
||||
let response = client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: bucket_name.to_string(),
|
||||
prefix: Some("walarchive/".to_string()),
|
||||
continuation_token,
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.await?;
|
||||
document_keys.extend(
|
||||
response
|
||||
.contents
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| o.key),
|
||||
);
|
||||
|
||||
continuation_token = response.continuation_token;
|
||||
if continuation_token.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ use anyhow::{bail, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
use etcd_broker::SkTimelineInfo;
|
||||
use postgres_ffi::xlog_utils::TimeLineID;
|
||||
|
||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||
@@ -17,6 +16,7 @@ use tracing::*;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::broker::SafekeeperInfo;
|
||||
use crate::control_file;
|
||||
use crate::send_wal::HotStandbyFeedback;
|
||||
use crate::wal_storage;
|
||||
@@ -30,8 +30,8 @@ use utils::{
|
||||
};
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 5;
|
||||
const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
pub const SK_FORMAT_VERSION: u32 = 4;
|
||||
const SK_PROTOCOL_VERSION: u32 = 1;
|
||||
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
|
||||
/// Consensus logical timestamp.
|
||||
@@ -52,7 +52,7 @@ impl TermHistory {
|
||||
}
|
||||
|
||||
// Parse TermHistory as n_entries followed by TermSwitchEntry pairs
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
pub fn from_bytes(mut bytes: Bytes) -> Result<TermHistory> {
if bytes.remaining() < 4 {
bail!("TermHistory misses len");
}
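
The comment above describes the wire format: a little-endian u32 entry count followed by `(term, lsn)` pairs, each field a little-endian u64. The widths are inferred from the 4-byte length check here and the `put_u64_le` calls in the serializer later in this diff, not from a separate spec. A self-contained parsing sketch over plain tuples rather than the real `TermSwitchEntry`:

// Hedged sketch of parsing the term history wire format.
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};

fn parse_term_history(bytes: &mut Bytes) -> Result<Vec<(u64, u64)>> {
    if bytes.remaining() < 4 {
        bail!("TermHistory misses len");
    }
    let n_entries = bytes.get_u32_le();
    let mut entries = Vec::with_capacity(n_entries as usize);
    for _ in 0..n_entries {
        if bytes.remaining() < 16 {
            bail!("TermHistory is incomplete");
        }
        // Each TermSwitchEntry is (term, lsn), both u64 little-endian.
        let term = bytes.get_u64_le();
        let lsn = bytes.get_u64_le();
        entries.push((term, lsn));
    }
    Ok(entries)
}
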
@@ -183,13 +183,6 @@ pub struct SafeKeeperState {
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// Since which LSN this timeline generally starts. Safekeeper might have
/// joined later.
pub timeline_start_lsn: Lsn,
/// Since which LSN safekeeper has (had) WAL for this timeline.
/// All WAL segments next to the one containing local_start_lsn are
/// filled with data from the beginning.
pub local_start_lsn: Lsn,
/// Part of WAL acknowledged by quorum and available locally. Always points
/// to record boundary.
pub commit_lsn: Lsn,
@@ -238,8 +231,6 @@ impl SafeKeeperState {
|
||||
wal_seg_size: 0,
|
||||
},
|
||||
proposer_uuid: [0; 16],
|
||||
timeline_start_lsn: Lsn(0),
|
||||
local_start_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
s3_wal_lsn: Lsn(0),
|
||||
peer_horizon_lsn: Lsn(0),
|
||||
@@ -277,7 +268,6 @@ pub struct ProposerGreeting {
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct AcceptorGreeting {
|
||||
term: u64,
|
||||
node_id: ZNodeId,
|
||||
}
|
||||
|
||||
/// Vote request sent from proposer to safekeepers
|
||||
@@ -296,7 +286,6 @@ pub struct VoteResponse {
|
||||
flush_lsn: Lsn,
|
||||
truncate_lsn: Lsn,
|
||||
term_history: TermHistory,
|
||||
timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -308,7 +297,6 @@ pub struct ProposerElected {
|
||||
pub term: Term,
|
||||
pub start_streaming_at: Lsn,
|
||||
pub term_history: TermHistory,
|
||||
pub timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Request with WAL message sent from proposer to safekeeper. Along the way it
|
||||
@@ -399,15 +387,10 @@ impl ProposerAcceptorMessage {
|
||||
}
|
||||
let term = msg_bytes.get_u64_le();
|
||||
let start_streaming_at = msg_bytes.get_u64_le().into();
|
||||
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
|
||||
if msg_bytes.remaining() < 8 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
let timeline_start_lsn = msg_bytes.get_u64_le().into();
|
||||
let term_history = TermHistory::from_bytes(msg_bytes)?;
|
||||
let msg = ProposerElected {
|
||||
term,
|
||||
start_streaming_at,
|
||||
timeline_start_lsn,
|
||||
term_history,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Elected(msg))
|
||||
@@ -454,7 +437,6 @@ impl AcceptorProposerMessage {
|
||||
AcceptorProposerMessage::Greeting(msg) => {
|
||||
buf.put_u64_le('g' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.node_id.0);
|
||||
}
|
||||
AcceptorProposerMessage::VoteResponse(msg) => {
|
||||
buf.put_u64_le('v' as u64);
|
||||
@@ -467,7 +449,6 @@ impl AcceptorProposerMessage {
|
||||
buf.put_u64_le(e.term);
|
||||
buf.put_u64_le(e.lsn.into());
|
||||
}
|
||||
buf.put_u64_le(msg.timeline_start_lsn.into());
|
||||
}
|
||||
AcceptorProposerMessage::AppendResponse(msg) => {
|
||||
buf.put_u64_le('a' as u64);
|
||||
@@ -530,8 +511,6 @@ pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
|
||||
pub state: CTRL, // persistent state storage
|
||||
|
||||
pub wal_store: WAL,
|
||||
|
||||
node_id: ZNodeId, // safekeeper's node id
|
||||
}
|
||||
|
||||
impl<CTRL, WAL> SafeKeeper<CTRL, WAL>
|
||||
@@ -544,7 +523,6 @@ where
|
||||
ztli: ZTimelineId,
|
||||
state: CTRL,
|
||||
mut wal_store: WAL,
|
||||
node_id: ZNodeId,
|
||||
) -> Result<SafeKeeper<CTRL, WAL>> {
|
||||
if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id {
|
||||
bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id);
|
||||
@@ -566,7 +544,6 @@ where
|
||||
},
|
||||
state,
|
||||
wal_store,
|
||||
node_id,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -658,7 +635,6 @@ where
|
||||
);
|
||||
Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
|
||||
term: self.state.acceptor_state.term,
|
||||
node_id: self.node_id,
|
||||
})))
|
||||
}
|
||||
|
||||
@@ -674,7 +650,6 @@ where
|
||||
flush_lsn: self.wal_store.flush_lsn(),
|
||||
truncate_lsn: self.state.peer_horizon_lsn,
|
||||
term_history: self.get_term_history(),
|
||||
timeline_start_lsn: self.state.timeline_start_lsn,
|
||||
};
|
||||
if self.state.acceptor_state.term < msg.term {
|
||||
let mut state = self.state.clone();
|
||||
@@ -730,23 +705,6 @@ where
// and now adopt term history from proposer
{
let mut state = self.state.clone();

// Remember point where WAL begins globally, if not yet.
if state.timeline_start_lsn == Lsn(0) {
state.timeline_start_lsn = msg.timeline_start_lsn;
info!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
}

// Remember point where WAL begins locally, if not yet. (I doubt the
// second condition is ever possible)
if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at {
state.local_start_lsn = msg.start_streaming_at;
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
}

state.acceptor_state.term_history = msg.term_history.clone();
self.state.persist(&state)?;
}
@@ -886,7 +844,7 @@ where
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> {
|
||||
pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperInfo) -> Result<()> {
|
||||
let mut sync_control_file = false;
|
||||
if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term)
|
||||
{
|
||||
@@ -930,20 +888,17 @@ where
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent values to make possible normal states less surprising.
pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo {
let s3_offload_horizon = if s3_offload_enabled {
self.state.s3_wal_lsn
} else {
Lsn(u64::MAX)
};
pub fn get_horizon_segno(&self) -> XLogSegNo {
let horizon_lsn = min(
min(
self.state.remote_consistent_lsn,
self.state.peer_horizon_lsn,
),
s3_offload_horizon,
self.state.s3_wal_lsn,
);
horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
let res = horizon_lsn.segment_number(self.state.server.wal_seg_size as usize);
info!("horizon is {}, res {}", horizon_lsn, res);
res
}
}

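As a quick worked example (assuming `segment_number` is effectively a floor division of the LSN by the WAL segment size): with remote_consistent_lsn = 0x3000000, peer_horizon_lsn = 0x2000000, s3_wal_lsn = 0x1000000 and 16 MiB segments, the horizon LSN is 0x1000000 and the horizon segment number is 1, so `remove_old_wal` later in this diff removes nothing, since it bails out whenever the horizon segment number is at most 1.
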
@@ -1013,7 +968,7 @@ mod tests {
|
||||
};
|
||||
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
||||
let ztli = ZTimelineId::from([0u8; 16]);
|
||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
|
||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap();
|
||||
|
||||
// check voting for 1 is ok
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
|
||||
@@ -1028,7 +983,7 @@ mod tests {
|
||||
let storage = InMemoryState {
|
||||
persisted_state: state,
|
||||
};
|
||||
sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap();
|
||||
sk = SafeKeeper::new(ztli, storage, sk.wal_store).unwrap();
|
||||
|
||||
// and ensure voting second time for 1 is not ok
|
||||
vote_resp = sk.process_msg(&vote_request);
|
||||
@@ -1045,7 +1000,7 @@ mod tests {
|
||||
};
|
||||
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
||||
let ztli = ZTimelineId::from([0u8; 16]);
|
||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
|
||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap();
|
||||
|
||||
let mut ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
@@ -1068,7 +1023,6 @@ mod tests {
|
||||
term: 1,
|
||||
lsn: Lsn(3),
|
||||
}]),
|
||||
timeline_start_lsn: Lsn(0),
|
||||
};
|
||||
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
|
||||
.unwrap();
|
||||
|
||||
@@ -264,13 +264,13 @@ impl ReplicationConn {
|
||||
} else {
|
||||
let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery");
|
||||
let zttid = spg.timeline.get().zttid;
|
||||
let tx_clone = spg.timeline.get().callmemaybe_tx.clone();
|
||||
let tx_clone = spg.tx.clone();
|
||||
let subscription_key = SubscriptionStateKey::new(
|
||||
zttid.tenant_id,
|
||||
zttid.timeline_id,
|
||||
pageserver_connstr.clone(),
|
||||
);
|
||||
tx_clone
|
||||
spg.tx
|
||||
.send(CallmeEvent::Pause(subscription_key))
|
||||
.unwrap_or_else(|e| {
|
||||
error!("failed to send Pause request to callmemaybe thread {}", e);
|
||||
@@ -315,7 +315,7 @@ impl ReplicationConn {
|
||||
} else {
|
||||
// TODO: also check once in a while whether we are walsender
|
||||
// to right pageserver.
|
||||
if spg.timeline.get().check_deactivate(replica_id)? {
|
||||
if spg.timeline.get().check_deactivate(replica_id, &spg.tx)? {
|
||||
// Shut down, timeline is suspended.
|
||||
// TODO create proper error type for this
|
||||
bail!("end streaming to {:?}", spg.appname);
|
||||
|
||||
@@ -3,12 +3,9 @@
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
|
||||
use etcd_broker::SkTimelineInfo;
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self};
|
||||
@@ -21,9 +18,10 @@ use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
pq_proto::ZenithFeedback,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
||||
zid::{ZNodeId, ZTenantTimelineId},
|
||||
};
|
||||
|
||||
use crate::broker::SafekeeperInfo;
|
||||
use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey};
|
||||
|
||||
use crate::control_file;
|
||||
@@ -104,7 +102,7 @@ impl SharedState {
|
||||
let state = SafeKeeperState::new(zttid, peer_ids);
|
||||
let control_store = control_file::FileStorage::create_new(zttid, conf, state)?;
|
||||
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf);
|
||||
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?;
|
||||
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?;
|
||||
|
||||
Ok(Self {
|
||||
notified_commit_lsn: Lsn(0),
|
||||
@@ -127,7 +125,7 @@ impl SharedState {
|
||||
|
||||
Ok(Self {
|
||||
notified_commit_lsn: Lsn(0),
|
||||
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?,
|
||||
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?,
|
||||
replicas: Vec::new(),
|
||||
active: false,
|
||||
num_computes: 0,
|
||||
@@ -277,21 +275,15 @@ impl SharedState {
|
||||
/// Database instance (tenant)
|
||||
pub struct Timeline {
|
||||
pub zttid: ZTenantTimelineId,
|
||||
pub callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||
mutex: Mutex<SharedState>,
|
||||
/// conditional variable used to notify wal senders
|
||||
cond: Condvar,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
fn new(
|
||||
zttid: ZTenantTimelineId,
|
||||
callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||
shared_state: SharedState,
|
||||
) -> Timeline {
|
||||
fn new(zttid: ZTenantTimelineId, shared_state: SharedState) -> Timeline {
|
||||
Timeline {
|
||||
zttid,
|
||||
callmemaybe_tx,
|
||||
mutex: Mutex::new(shared_state),
|
||||
cond: Condvar::new(),
|
||||
}
|
||||
@@ -300,27 +292,34 @@ impl Timeline {
|
||||
/// Register compute connection, starting timeline-related activity if it is
|
||||
/// not running yet.
|
||||
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> {
|
||||
pub fn on_compute_connect(
|
||||
&self,
|
||||
pageserver_connstr: Option<&String>,
|
||||
callmemaybe_tx: &UnboundedSender<CallmeEvent>,
|
||||
) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.num_computes += 1;
|
||||
// FIXME: currently we always adopt latest pageserver connstr, but we
|
||||
// should have kind of generations assigned by compute to distinguish
|
||||
// the latest one or even pass it through consensus to reliably deliver
|
||||
// to all safekeepers.
|
||||
shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?;
|
||||
shared_state.activate(&self.zttid, pageserver_connstr, callmemaybe_tx)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// De-register compute connection, shutting down timeline activity if
|
||||
/// pageserver doesn't need catchup.
|
||||
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
pub fn on_compute_disconnect(&self) -> Result<()> {
|
||||
pub fn on_compute_disconnect(
|
||||
&self,
|
||||
callmemaybe_tx: &UnboundedSender<CallmeEvent>,
|
||||
) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.num_computes -= 1;
|
||||
// If there is no pageserver, can suspend right away; otherwise let
|
||||
// walsender do that.
|
||||
if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() {
|
||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
||||
shared_state.deactivate(&self.zttid, callmemaybe_tx)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -328,7 +327,11 @@ impl Timeline {
/// Deactivate tenant if there are no computes and the pageserver is caught up,
/// assuming the pageserver status is in replica_id.
/// Returns true if deactivated.
pub fn check_deactivate(&self, replica_id: usize) -> Result<bool> {
pub fn check_deactivate(
&self,
replica_id: usize,
callmemaybe_tx: &UnboundedSender<CallmeEvent>,
) -> Result<bool> {
let mut shared_state = self.mutex.lock().unwrap();
|
||||
if !shared_state.active {
|
||||
// already suspended
|
||||
@@ -340,27 +343,13 @@ impl Timeline {
|
||||
(replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
||||
replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn);
|
||||
if deactivate {
|
||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
||||
shared_state.deactivate(&self.zttid, callmemaybe_tx)?;
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Deactivates the timeline, assuming it is being deleted.
|
||||
/// Returns whether the timeline was already active.
|
||||
///
|
||||
/// The callmemaybe thread is stopped by the deactivation message. We assume all other threads
|
||||
/// will stop by themselves eventually (possibly with errors, but no panics). There should be no
|
||||
/// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but
|
||||
/// we're deleting the timeline anyway.
|
||||
pub fn deactivate_for_delete(&self) -> Result<bool> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
let was_active = shared_state.active;
|
||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
||||
Ok(was_active)
|
||||
}
|
||||
|
||||
fn is_active(&self) -> bool {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.active
|
||||
@@ -429,9 +418,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Prepare public safekeeper info for reporting.
|
||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
|
||||
pub fn get_public_info(&self) -> SafekeeperInfo {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
Ok(SkTimelineInfo {
|
||||
SafekeeperInfo {
|
||||
last_log_term: Some(shared_state.sk.get_epoch()),
|
||||
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
||||
// note: this value is not flushed to control file yet and can be lost
|
||||
@@ -443,12 +432,11 @@ impl Timeline {
|
||||
shared_state.sk.inmem.remote_consistent_lsn,
|
||||
)),
|
||||
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
||||
safekeeper_connection_string: Some(conf.listen_pg_addr.clone()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> {
|
||||
pub fn record_safekeeper_info(&self, sk_info: &SafekeeperInfo, _sk_id: ZNodeId) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.sk.record_safekeeper_info(sk_info)?;
|
||||
self.notify_wal_senders(&mut shared_state);
|
||||
@@ -476,16 +464,12 @@ impl Timeline {
|
||||
shared_state.sk.wal_store.flush_lsn()
|
||||
}
|
||||
|
||||
pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> {
|
||||
pub fn remove_old_wal(&self) -> Result<()> {
|
||||
let horizon_segno: XLogSegNo;
|
||||
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
|
||||
{
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
// WAL seg size not initialized yet, no WAL exists.
|
||||
if shared_state.sk.state.server.wal_seg_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled);
|
||||
horizon_segno = shared_state.sk.get_horizon_segno();
|
||||
remover = shared_state.sk.wal_store.remove_up_to();
|
||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||
return Ok(());
|
||||
@@ -519,41 +503,22 @@ impl TimelineTools for Option<Arc<Timeline>> {
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalTimelinesState {
|
||||
timelines: HashMap<ZTenantTimelineId, Arc<Timeline>>,
|
||||
callmemaybe_tx: Option<UnboundedSender<CallmeEvent>>,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
callmemaybe_tx: None
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Serialize)]
|
||||
pub struct TimelineDeleteForceResult {
|
||||
pub dir_existed: bool,
|
||||
pub was_active: bool,
|
||||
pub static ref TIMELINES: Mutex<HashMap<ZTenantTimelineId, Arc<Timeline>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
/// A zero-sized struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines;
|
||||
|
||||
impl GlobalTimelines {
|
||||
pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender<CallmeEvent>) {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
assert!(state.callmemaybe_tx.is_none());
|
||||
state.callmemaybe_tx = Some(callmemaybe_tx);
|
||||
}
|
||||
|
||||
fn create_internal(
|
||||
mut state: MutexGuard<GlobalTimelinesState>,
|
||||
mut timelines: MutexGuard<HashMap<ZTenantTimelineId, Arc<Timeline>>>,
|
||||
conf: &SafeKeeperConf,
|
||||
zttid: ZTenantTimelineId,
|
||||
peer_ids: Vec<ZNodeId>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
match state.timelines.get(&zttid) {
|
||||
match timelines.get(&zttid) {
|
||||
Some(_) => bail!("timeline {} already exists", zttid),
|
||||
None => {
|
||||
// TODO: check directory existence
|
||||
@@ -562,12 +527,8 @@ impl GlobalTimelines {
|
||||
let shared_state = SharedState::create(conf, &zttid, peer_ids)
|
||||
.context("failed to create shared state")?;
|
||||
|
||||
let new_tli = Arc::new(Timeline::new(
|
||||
zttid,
|
||||
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
||||
shared_state,
|
||||
));
|
||||
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
||||
let new_tli = Arc::new(Timeline::new(zttid, shared_state));
|
||||
timelines.insert(zttid, Arc::clone(&new_tli));
|
||||
Ok(new_tli)
|
||||
}
|
||||
}
|
||||
@@ -578,20 +539,20 @@ impl GlobalTimelines {
|
||||
zttid: ZTenantTimelineId,
|
||||
peer_ids: Vec<ZNodeId>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
GlobalTimelines::create_internal(state, conf, zttid, peer_ids)
|
||||
let timelines = TIMELINES.lock().unwrap();
|
||||
GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids)
|
||||
}
|
||||
|
||||
/// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map.
|
||||
/// Get a timeline with control file loaded from the global TIMELINES map.
|
||||
/// If control file doesn't exist and create=false, bails out.
|
||||
pub fn get(
|
||||
conf: &SafeKeeperConf,
|
||||
zttid: ZTenantTimelineId,
|
||||
create: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut timelines = TIMELINES.lock().unwrap();
|
||||
|
||||
match state.timelines.get(&zttid) {
|
||||
match timelines.get(&zttid) {
|
||||
Some(result) => Ok(Arc::clone(result)),
|
||||
None => {
|
||||
let shared_state =
|
||||
@@ -607,19 +568,20 @@ impl GlobalTimelines {
|
||||
.contains("No such file or directory")
|
||||
&& create
|
||||
{
|
||||
return GlobalTimelines::create_internal(state, conf, zttid, vec![]);
|
||||
return GlobalTimelines::create_internal(
|
||||
timelines,
|
||||
conf,
|
||||
zttid,
|
||||
vec![],
|
||||
);
|
||||
} else {
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let new_tli = Arc::new(Timeline::new(
|
||||
zttid,
|
||||
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
||||
shared_state,
|
||||
));
|
||||
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
||||
let new_tli = Arc::new(Timeline::new(zttid, shared_state));
|
||||
timelines.insert(zttid, Arc::clone(&new_tli));
|
||||
Ok(new_tli)
|
||||
}
|
||||
}
|
||||
@@ -627,86 +589,11 @@ impl GlobalTimelines {
|
||||
|
||||
/// Get ZTenantTimelineIDs of all active timelines.
|
||||
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
state
|
||||
.timelines
|
||||
let timelines = TIMELINES.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.filter(|&(_, tli)| tli.is_active())
|
||||
.map(|(zttid, _)| *zttid)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn delete_force_internal(
|
||||
conf: &SafeKeeperConf,
|
||||
zttid: &ZTenantTimelineId,
|
||||
was_active: bool,
|
||||
) -> Result<TimelineDeleteForceResult> {
|
||||
match std::fs::remove_dir_all(conf.timeline_dir(zttid)) {
|
||||
Ok(_) => Ok(TimelineDeleteForceResult {
|
||||
dir_existed: true,
|
||||
was_active,
|
||||
}),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult {
|
||||
dir_existed: false,
|
||||
was_active,
|
||||
}),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, then deletes
/// the corresponding data directory.
/// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline
/// anymore, and they will eventually terminate without panics.
///
/// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two
/// `Timeline` objects in memory):
/// a) a compute node connects after this method is called, or
/// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or
/// c) an HTTP POST request for timeline creation is made after the timeline is already deleted.
/// TODO: ensure all of the above never happens.
pub fn delete_force(
|
||||
conf: &SafeKeeperConf,
|
||||
zttid: &ZTenantTimelineId,
|
||||
) -> Result<TimelineDeleteForceResult> {
|
||||
info!("deleting timeline {}", zttid);
|
||||
let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) {
|
||||
None => false,
|
||||
Some(tli) => tli.deactivate_for_delete()?,
|
||||
};
|
||||
GlobalTimelines::delete_force_internal(conf, zttid, was_active)
|
||||
}
|
||||
|
||||
/// Deactivates and deletes all timelines for the tenant, see `delete()`.
|
||||
/// Returns map of all timelines which the tenant had, `true` if a timeline was active.
|
||||
pub fn delete_force_all_for_tenant(
|
||||
conf: &SafeKeeperConf,
|
||||
tenant_id: &ZTenantId,
|
||||
) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
|
||||
info!("deleting all timelines for tenant {}", tenant_id);
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut deleted = HashMap::new();
|
||||
for (zttid, tli) in &state.timelines {
|
||||
if zttid.tenant_id == *tenant_id {
|
||||
deleted.insert(
|
||||
*zttid,
|
||||
GlobalTimelines::delete_force_internal(
|
||||
conf,
|
||||
zttid,
|
||||
tli.deactivate_for_delete()?,
|
||||
)?,
|
||||
);
|
||||
}
|
||||
}
|
||||
// TODO: test that the exact subset of timelines is removed.
|
||||
state
|
||||
.timelines
|
||||
.retain(|zttid, _| !deleted.contains_key(zttid));
|
||||
match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) {
|
||||
Ok(_) => (),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
|
||||
e => e?,
|
||||
};
|
||||
Ok(deleted)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,22 +8,29 @@ use std::net::{TcpListener, TcpStream};
|
||||
use std::thread;
|
||||
use tracing::*;
|
||||
|
||||
use crate::callmemaybe::CallmeEvent;
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::SafeKeeperConf;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use utils::postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
|
||||
pub fn thread_main(
|
||||
conf: SafeKeeperConf,
|
||||
listener: TcpListener,
|
||||
tx: UnboundedSender<CallmeEvent>,
|
||||
) -> Result<()> {
|
||||
loop {
|
||||
match listener.accept() {
|
||||
Ok((socket, peer_addr)) => {
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
|
||||
let tx_clone = tx.clone();
|
||||
let _ = thread::Builder::new()
|
||||
.name("WAL service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = handle_socket(socket, conf) {
|
||||
if let Err(err) = handle_socket(socket, conf, tx_clone) {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
})
|
||||
@@ -44,12 +51,16 @@ fn get_tid() -> u64 {
|
||||
|
||||
/// This is run by `thread_main` above, inside a background thread.
|
||||
///
|
||||
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
|
||||
fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: SafeKeeperConf,
|
||||
tx: UnboundedSender<CallmeEvent>,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("", tid = ?get_tid()).entered();
|
||||
|
||||
socket.set_nodelay(true)?;
|
||||
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(conf);
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(conf, tx);
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?;
|
||||
// libpq replication protocol between safekeeper and replicas/pagers
|
||||
pgbackend.run(&mut conn_handler)?;
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
import subprocess
|
||||
import asyncio
|
||||
from contextlib import closing
|
||||
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverApiException
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
|
||||
|
||||
#
|
||||
@@ -21,7 +23,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder):
|
||||
|
||||
# Override defaults, 1M gc_horizon and 4M checkpoint_distance.
|
||||
# Extend compaction_period and gc_period to disable background compaction and gc.
|
||||
tenant, _ = env.zenith_cli.create_tenant(
|
||||
tenant = env.zenith_cli.create_tenant(
|
||||
conf={
|
||||
'gc_period': '10 m',
|
||||
'gc_horizon': '1048576',
|
||||
@@ -35,6 +37,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder):
|
||||
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
|
||||
pscur.execute("failpoints flush-frozen=sleep(10000)")
|
||||
|
||||
env.zenith_cli.create_timeline(f'main', tenant_id=tenant)
|
||||
pg_branch0 = env.postgres.create_start('main', tenant_id=tenant)
|
||||
branch0_cur = pg_branch0.connect().cursor()
|
||||
branch0_cur.execute("SHOW zenith.zenith_timeline")
|
||||
@@ -117,17 +120,3 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder):
|
||||
|
||||
branch2_cur.execute('SELECT count(*) FROM foo')
|
||||
assert branch2_cur.fetchone() == (300000, )
|
||||
|
||||
|
||||
def test_ancestor_branch_detach(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
|
||||
parent_timeline_id = env.zenith_cli.create_branch("test_ancestor_branch_detach_parent", "empty")
|
||||
|
||||
env.zenith_cli.create_branch("test_ancestor_branch_detach_branch1",
|
||||
"test_ancestor_branch_detach_parent")
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
with pytest.raises(ZenithPageserverApiException,
|
||||
match="Failed to detach inmem tenant timeline"):
|
||||
ps_http.timeline_detach(env.initial_tenant, parent_timeline_id)
|
||||
|
||||
@@ -19,8 +19,6 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
|
||||
#
|
||||
# See https://github.com/zenithdb/zenith/issues/1068
|
||||
zenith_env_builder.num_safekeepers = 1
|
||||
# Disable pitr, because here we want to test branch creation after GC
|
||||
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
env = zenith_env_builder.init_start()
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
import pytest
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
from fixtures.log_helper import log
|
||||
import os
|
||||
|
||||
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder):
|
||||
# One safekeeper is enough for this test.
|
||||
zenith_env_builder.num_safekeepers = 3
|
||||
env = zenith_env_builder.init_start()
|
||||
|
||||
tenant_timelines = []
|
||||
|
||||
for n in range(4):
|
||||
tenant_id_uuid, timeline_id_uuid = env.zenith_cli.create_tenant()
|
||||
tenant_id = tenant_id_uuid.hex
|
||||
timeline_id = timeline_id_uuid.hex
|
||||
|
||||
pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid)
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t(key int primary key, value text)")
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
|
||||
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline_id = cur.fetchone()[0]
|
||||
pg.stop()
|
||||
tenant_timelines.append((tenant_id, timeline_id, pg))
|
||||
|
||||
# Stop the pageserver
|
||||
env.pageserver.stop()
|
||||
|
||||
# Leave the first timeline alone, but corrupt the others in different ways
|
||||
(tenant0, timeline0, pg0) = tenant_timelines[0]
|
||||
|
||||
# Corrupt metadata file on timeline 1
|
||||
(tenant1, timeline1, pg1) = tenant_timelines[1]
|
||||
metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1)
|
||||
print(f'overwriting metadata file at {metadata_path}')
|
||||
f = open(metadata_path, "w")
|
||||
f.write("overwritten with garbage!")
|
||||
f.close()
|
||||
|
||||
# Missing layer files file on timeline 2. (This would actually work
|
||||
# if we had Cloud Storage enabled in this test.)
|
||||
(tenant2, timeline2, pg2) = tenant_timelines[2]
|
||||
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2)
|
||||
for filename in os.listdir(timeline_path):
|
||||
if filename.startswith('00000'):
|
||||
# Looks like a layer file. Remove it
|
||||
os.remove(f'{timeline_path}/{filename}')
|
||||
|
||||
# Corrupt layer files file on timeline 3
|
||||
(tenant3, timeline3, pg3) = tenant_timelines[3]
|
||||
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3)
|
||||
for filename in os.listdir(timeline_path):
|
||||
if filename.startswith('00000'):
|
||||
# Looks like a layer file. Corrupt it
|
||||
f = open(f'{timeline_path}/{filename}', "w")
|
||||
f.write("overwritten with garbage!")
|
||||
f.close()
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
# Tenant 0 should still work
|
||||
pg0.start()
|
||||
with closing(pg0.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT COUNT(*) FROM t")
|
||||
assert cur.fetchone()[0] == 100
|
||||
|
||||
# But all others are broken
|
||||
for n in range(1, 4):
|
||||
(tenant, timeline, pg) = tenant_timelines[n]
|
||||
with pytest.raises(Exception, match="Cannot load local timeline") as err:
|
||||
pg.start()
|
||||
log.info(f'compute startup failed as expected: {err}')
|
||||
@@ -32,16 +32,7 @@ def test_createdb(zenith_simple_env: ZenithEnv):
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
for db in (pg, pg2):
|
||||
with closing(db.connect(dbname='foodb')) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Check database size in both branches
|
||||
cur.execute(
|
||||
'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;',
|
||||
('foodb', ))
|
||||
res = cur.fetchone()
|
||||
# check that dbsize equals sum of all relation sizes, excluding shared ones
|
||||
# This is how we define dbsize in zenith for now
|
||||
assert res[0] == res[1]
|
||||
db.connect(dbname='foodb').close()
|
||||
|
||||
|
||||
#
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import asyncio
|
||||
import random
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres
|
||||
from fixtures.zenith_fixtures import ZenithEnv, Postgres
|
||||
from fixtures.log_helper import log
|
||||
|
||||
# Test configuration
|
||||
@@ -50,12 +50,9 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str):
|
||||
#
|
||||
# (repro for https://github.com/zenithdb/zenith/issues/1047)
|
||||
#
|
||||
def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder):
|
||||
|
||||
# Disable pitr, because here we want to test branch creation after GC
|
||||
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
env = zenith_env_builder.init_start()
|
||||
env.zenith_cli.create_branch("test_gc_aggressive", "main")
|
||||
def test_gc_aggressive(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli.create_branch("test_gc_aggressive", "empty")
|
||||
pg = env.postgres.create_start('test_gc_aggressive')
|
||||
log.info('postgres is running on test_gc_aggressive branch')
|
||||
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.utils import print_gc_result
|
||||
import psycopg2.extras
|
||||
|
||||
|
||||
#
|
||||
@@ -14,11 +12,9 @@ import psycopg2.extras
|
||||
# just a hint that the page hasn't been modified since that LSN, and the page
|
||||
# server should return the latest page version regardless of the LSN.
|
||||
#
|
||||
def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder):
|
||||
# Disable pitr, because here we want to test branch creation after GC
|
||||
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
env = zenith_env_builder.init_start()
|
||||
env.zenith_cli.create_branch("test_old_request_lsn", "main")
|
||||
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli.create_branch("test_old_request_lsn", "empty")
|
||||
pg = env.postgres.create_start('test_old_request_lsn')
|
||||
log.info('postgres is running on test_old_request_lsn branch')
|
||||
|
||||
@@ -30,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder):
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
psconn = env.pageserver.connect()
|
||||
pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
# shared_buffers.
|
||||
@@ -57,9 +53,6 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder):
|
||||
# garbage collections so that the page server will remove old page versions.
|
||||
for i in range(10):
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
|
||||
for j in range(100):
|
||||
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
|
||||
|
||||
|
||||
@@ -1,77 +0,0 @@
import subprocess
from contextlib import closing

import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnvBuilder


#
# Check pitr_interval GC behavior.
# Insert some data, run GC and create a branch in the past.
#
def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder):

zenith_env_builder.num_safekeepers = 1
# Set pitr interval such that we need to keep the data
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}"

env = zenith_env_builder.init_start()
pgmain = env.postgres.create_start('main')
log.info("postgres is running on 'main' branch")

main_pg_conn = pgmain.connect()
main_cur = main_pg_conn.cursor()

main_cur.execute("SHOW zenith.zenith_timeline")
timeline = main_cur.fetchone()[0]

# Create table
main_cur.execute('CREATE TABLE foo (t text)')

for i in range(10000):
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space';
''')

if i == 99:
# keep some early lsn to test branch creation after GC
main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
res = main_cur.fetchone()
lsn_a = res[0]
xid_a = res[1]
log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}')

main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
res = main_cur.fetchone()
debug_lsn = res[0]
debug_xid = res[1]
log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}')

# run GC
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
pscur.execute(f"compact {env.initial_tenant.hex} {timeline}")
# perform aggressive GC. Data still should be kept because of the PITR setting.
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)

# Branch at the point where only 100 rows were inserted
# It must have been preserved by the PITR setting
env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a)

pg_hundred = env.postgres.create_start('test_pitr_gc_hundred')

# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
hundred_cur = hundred_pg_conn.cursor()
hundred_cur.execute('SELECT count(*) FROM foo')
assert hundred_cur.fetchone() == (100, )

# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (10000, )
@@ -16,7 +16,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}'''

env = zenith_env_builder.init_start()
"""Test per tenant configuration"""
tenant, _ = env.zenith_cli.create_tenant(conf={
tenant = env.zenith_cli.create_tenant(conf={
'checkpoint_distance': '20000',
'gc_period': '30sec',
})
@@ -95,10 +95,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
log.info('load thread stopped')


@pytest.mark.skip(
reason=
"needs to replace callmemaybe call with better idea how to migrate timelines between pageservers"
)
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
port_distributor: PortDistributor,
@@ -111,13 +107,12 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# create folder for remote storage mock
remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'

tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
log.info("tenant to relocate %s", tenant)
env.zenith_cli.create_root_branch('main', tenant_id=tenant)
env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant)

# attach does not download ancestor branches (should it?), just use root branch for now
env.zenith_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant)

tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation',
tenant_pg = env.postgres.create_start(branch_name='main',
node_name='test_tenant_relocation',
tenant_id=tenant)
@@ -12,8 +12,8 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep

env = zenith_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1, _ = env.zenith_cli.create_tenant()
tenant_2, _ = env.zenith_cli.create_tenant()
tenant_1 = env.zenith_cli.create_tenant()
tenant_2 = env.zenith_cli.create_tenant()

env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}',
tenant_id=tenant_1)
@@ -573,9 +573,7 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]

# fetch something sensible from status
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
epoch = tli_status.acceptor_epoch
timeline_start_lsn = tli_status.timeline_start_lsn
epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch

pg.safe_psql("create table t(i int)")

@@ -583,13 +581,9 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
pg.stop().start()
pg.safe_psql("insert into t values(10)")

tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
epoch_after_reboot = tli_status.acceptor_epoch
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
assert epoch_after_reboot > epoch

# and timeline_start_lsn stays the same
assert tli_status.timeline_start_lsn == timeline_start_lsn


class SafekeeperEnv:
def __init__(self,
@@ -850,116 +844,3 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder):

# there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
assert wal_size_after_checkpoint < 16 * 2.5


def test_delete_force(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()

# Create two tenants: one will be deleted, the other should be preserved.
tenant_id = env.initial_tenant.hex
timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Active, delete explicitly
timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explicitly
timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant
timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant

tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant()
tenant_id_other = tenant_id_other_uuid.hex
timeline_id_other = timeline_id_other_uuid.hex

# Populate branches
pg_1 = env.postgres.create_start('br1')
pg_2 = env.postgres.create_start('br2')
pg_3 = env.postgres.create_start('br3')
pg_4 = env.postgres.create_start('br4')
pg_other = env.postgres.create_start('main', tenant_id=uuid.UUID(hex=tenant_id_other))
for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('CREATE TABLE t(key int primary key)')
sk = env.safekeepers[0]
sk_data_dir = Path(sk.data_dir())
sk_http = sk.http_client()
assert (sk_data_dir / tenant_id / timeline_id_1).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
pg_2.stop_and_destroy()
pg_4.stop_and_destroy()
sk.stop()
sk.start()
# Ensure connections to Safekeeper are established
for pg in [pg_1, pg_3, pg_other]:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('INSERT INTO t (key) VALUES (1)')

# Remove initial tenant's br1 (active)
assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
"dir_existed": True,
"was_active": True,
}
assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Ensure repeated deletion succeeds
assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
"dir_existed": False, "was_active": False
}
assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Remove initial tenant's br2 (inactive)
assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == {
"dir_existed": True,
"was_active": False,
}
assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Remove non-existing branch, should succeed
assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == {
"dir_existed": False,
"was_active": False,
}
assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
assert (sk_data_dir / tenant_id / timeline_id_3).exists()
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Remove initial tenant fully (two branches are active)
response = sk_http.tenant_delete_force(tenant_id)
assert response == {
timeline_id_3: {
"dir_existed": True,
"was_active": True,
}
}
assert not (sk_data_dir / tenant_id).exists()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Remove initial tenant again.
response = sk_http.tenant_delete_force(tenant_id)
assert response == {}
assert not (sk_data_dir / tenant_id).exists()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

# Ensure the other tenant still works
sk_http.timeline_status(tenant_id_other, timeline_id_other)
with closing(pg_other.connect()) as conn:
with conn.cursor() as cur:
cur.execute('INSERT INTO t (key) VALUES (123)')
@@ -1,7 +1,7 @@
import uuid
import requests

from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast


@@ -64,13 +64,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
helper_compare_tenant_list(pageserver_http_client, env)

# Create new tenant
tenant1, _ = env.zenith_cli.create_tenant()
tenant1 = env.zenith_cli.create_tenant()

# check tenant1 appeared
helper_compare_tenant_list(pageserver_http_client, env)

# Create new tenant
tenant2, _ = env.zenith_cli.create_tenant()
tenant2 = env.zenith_cli.create_tenant()

# check tenant2 appeared
helper_compare_tenant_list(pageserver_http_client, env)
@@ -83,16 +83,6 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
assert tenant2.hex in tenants


def test_cli_tenant_create(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
tenant_id, _ = env.zenith_cli.create_tenant()
timelines = env.zenith_cli.list_timelines(tenant_id)

# an initial timeline should be created upon tenant creation
assert len(timelines) == 1
assert timelines[0][0] == DEFAULT_BRANCH_NAME


def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder):
# Start with single sk
zenith_env_builder.num_safekeepers = 1
@@ -106,9 +106,9 @@ class ZenithCompare(PgCompare):
report=MetricReport.LOWER_IS_BETTER)

total_files = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_created_persistent_files_total")
self.env.pageserver, "pageserver_num_persistent_files_created")
total_bytes = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_written_persistent_bytes_total")
self.env.pageserver, "pageserver_persistent_bytes_written")
self.zenbenchmark.record("data_uploaded",
total_bytes / (1024 * 1024),
"MB",
@@ -130,10 +130,7 @@ class VanillaCompare(PgCompare):
def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres):
self._pg = vanilla_pg
self._zenbenchmark = zenbenchmark
vanilla_pg.configure([
'shared_buffers=1MB',
'synchronous_commit=off',
])
vanilla_pg.configure(['shared_buffers=1MB'])
vanilla_pg.start()

# Long-lived cursor, useful for flushing
@@ -75,8 +75,7 @@ def lsn_from_hex(lsn_hex: str) -> int:
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}"
" needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
.format_map(row))
@@ -472,16 +472,20 @@ class ZenithEnvBuilder:

mock_endpoint = self.s3_mock_server.endpoint()
mock_region = self.s3_mock_server.region()
mock_access_key = self.s3_mock_server.access_key()
mock_secret_key = self.s3_mock_server.secret_key()
boto3.client(
's3',
endpoint_url=mock_endpoint,
region_name=mock_region,
aws_access_key_id=self.s3_mock_server.access_key(),
aws_secret_access_key=self.s3_mock_server.secret_key(),
aws_access_key_id=mock_access_key,
aws_secret_access_key=mock_secret_key,
).create_bucket(Bucket=bucket_name)
self.pageserver_remote_storage = S3Storage(bucket=bucket_name,
endpoint=mock_endpoint,
region=mock_region)
region=mock_region,
access_key=mock_access_key,
secret_key=mock_secret_key)

def __enter__(self):
return self
@@ -807,6 +811,8 @@ class LocalFsStorage:
class S3Storage:
bucket: str
region: str
access_key: Optional[str]
secret_key: Optional[str]
endpoint: Optional[str]
@@ -831,25 +837,20 @@ class ZenithCli:

def create_tenant(self,
tenant_id: Optional[uuid.UUID] = None,
timeline_id: Optional[uuid.UUID] = None,
conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]:
conf: Optional[Dict[str, str]] = None) -> uuid.UUID:
"""
Creates a new tenant, returns its id and its initial timeline's id.
"""
if tenant_id is None:
tenant_id = uuid.uuid4()
if timeline_id is None:
timeline_id = uuid.uuid4()
if conf is None:
res = self.raw_cli([
'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex
])
res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex])
else:
res = self.raw_cli([
'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex
] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), []))
res = self.raw_cli(
['tenant', 'create', '--tenant-id', tenant_id.hex] +
sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), []))
res.check_returncode()
return tenant_id, timeline_id
return tenant_id

def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]):
"""
@@ -997,14 +998,7 @@ class ZenithCli:
append_pageserver_param_overrides(start_args,
self.env.pageserver.remote_storage,
self.env.pageserver.config_override)

s3_env_vars = None
if self.env.s3_mock_server:
s3_env_vars = {
'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(),
'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(),
}
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
return self.raw_cli(start_args)

def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
cmd = ['pageserver', 'stop']
@@ -1099,7 +1093,6 @@ class ZenithCli:

def raw_cli(self,
arguments: List[str],
extra_env_vars: Optional[Dict[str, str]] = None,
check_return_code=True) -> 'subprocess.CompletedProcess[str]':
"""
Run "zenith" with the specified arguments.
@@ -1115,7 +1108,7 @@ class ZenithCli:

assert type(arguments) == list

bin_zenith = os.path.join(str(zenith_binpath), 'neon_local')
bin_zenith = os.path.join(str(zenith_binpath), 'zenith')

args = [bin_zenith] + arguments
log.info('Running command "{}"'.format(' '.join(args)))
@@ -1124,10 +1117,9 @@ class ZenithCli:
env_vars = os.environ.copy()
env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir)
env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir)

if self.env.rust_log_override is not None:
env_vars['RUST_LOG'] = self.env.rust_log_override
for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items():
env_vars[extra_env_key] = extra_env_value

# Pass coverage settings
var = 'LLVM_PROFILE_FILE'
@@ -1225,6 +1217,10 @@ def append_pageserver_param_overrides(
pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\
bucket_region='{pageserver_remote_storage.region}'"

if pageserver_remote_storage.access_key is not None:
pageserver_storage_override += f",access_key_id='{pageserver_remote_storage.access_key}'"
if pageserver_remote_storage.secret_key is not None:
pageserver_storage_override += f",secret_access_key='{pageserver_remote_storage.secret_key}'"
if pageserver_remote_storage.endpoint is not None:
pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'"
@@ -1307,6 +1303,22 @@ def pg_bin(test_output_dir: str) -> PgBin:
return PgBin(test_output_dir)


@dataclass
class ReplayBin:
"""A helper class for running the pageserver benchmarker tool."""
def run(self, tenant, timeline):
replay_binpath = os.path.join(str(zenith_binpath), 'replay')
args = [replay_binpath,
"--tenant", tenant.hex,
"--timeline", timeline.hex]
return subprocess.run(args)


@pytest.fixture(scope='function')
def replay_bin(test_output_dir):
return ReplayBin()


class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int):
super().__init__(host='localhost', port=port, dbname='postgres')
@@ -1319,7 +1331,7 @@ class VanillaPostgres(PgProtocol):
"""Append lines into postgresql.conf file."""
assert not self.running
with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file:
conf_file.write("\n".join(options))
conf_file.writelines(options)

def start(self, log_path: Optional[str] = None):
assert not self.running
@@ -1766,7 +1778,6 @@ class SafekeeperTimelineStatus:
acceptor_epoch: int
flush_lsn: str
remote_consistent_lsn: str
timeline_start_lsn: str


@dataclass
@@ -1791,8 +1802,7 @@ class SafekeeperHttpClient(requests.Session):
resj = res.json()
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
flush_lsn=resj['flush_lsn'],
remote_consistent_lsn=resj['remote_consistent_lsn'],
timeline_start_lsn=resj['timeline_start_lsn'])
remote_consistent_lsn=resj['remote_consistent_lsn'])

def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body):
res = self.post(
@@ -1800,21 +1810,6 @@ class SafekeeperHttpClient(requests.Session):
json=body)
res.raise_for_status()

def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]:
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json

def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]:
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json

def get_metrics(self) -> SafekeeperMetrics:
request_result = self.get(f"http://localhost:{self.port}/metrics")
request_result.raise_for_status()
@@ -1,23 +0,0 @@
# What performance tests do we have and how we run them

Performance tests are built using the same infrastructure as our usual Python integration tests. There are some extra fixtures that help collect performance metrics and run tests against both vanilla PostgreSQL and Neon for comparison.
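As a rough illustration only (a minimal sketch, not an actual test from the suite: the `zenith_with_baseline` fixture name, the `fixtures.compare_fixtures`/`fixtures.benchmark_fixture` module paths, and the `env.pg`/`env.zenbenchmark` attributes are assumptions; only `PgCompare`, `MetricReport` and `zenbenchmark.record(...)` appear in the fixtures shown above), a comparison-style test might look like:

```python
import timeit
from contextlib import closing

# Assumed module paths; the real helpers live in the test fixtures shown elsewhere in this diff.
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import PgCompare


def test_example_insert(zenith_with_baseline: PgCompare):  # fixture name is an assumption
    # The same test body runs against vanilla PostgreSQL and against Neon,
    # depending on how the comparison fixture is parametrized.
    env = zenith_with_baseline

    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute('CREATE TABLE t (i int)')
            start = timeit.default_timer()
            cur.execute('INSERT INTO t SELECT generate_series(1, 100000)')
            elapsed = timeit.default_timer() - start

    # Record the measurement so it ends up in the results collection described below.
    env.zenbenchmark.record('insert_time', elapsed, 's', report=MetricReport.LOWER_IS_BETTER)
```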
## Tests that are run against local installation

Most of the performance tests run against a local installation. This is not very representative of a production environment. Firstly, Postgres, the safekeeper(s) and the pageserver have to share CPU and I/O resources, which can add noise to the results. Secondly, network overhead is eliminated.

In the CI, the performance tests run in the same environment as the other integration tests. We don't have control over the host that the CI runs on, so the environment may vary widely from one run to another, which makes results from different runs noisy and hard to compare.
## Remote tests

There are a few tests marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment; instead, they require a libpq connection string to connect to, so they can be run against any Postgres-compatible database. Currently, the CI runs these tests against our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to the activity of other clusters.
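Such a test can look roughly like the sketch below (the `BENCHMARK_CONNSTR` environment variable name is an assumption for illustration, not necessarily what the suite uses):

```python
import os

import psycopg2
import pytest


@pytest.mark.remote_cluster
def test_simple_query_remote():
    # Hypothetical: take the libpq connection string from an environment variable
    # instead of starting a local environment.
    connstr = os.environ.get("BENCHMARK_CONNSTR")
    if not connstr:
        pytest.skip("no connection string provided")

    with psycopg2.connect(connstr) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
            assert cur.fetchone() == (1, )
```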
## Noise

All tests run only once. To obtain more consistent performance numbers, a test should usually be repeated multiple times and the results aggregated, for example by taking the min, max, avg, or median.
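For example, such aggregation could be done with plain stdlib helpers (a minimal sketch; the timed function and the number of repetitions are placeholders):

```python
import statistics
import timeit


def run_repeated(benchmark_fn, repeats: int = 5) -> dict:
    # Run the same benchmark several times and aggregate the wall-clock durations.
    durations = []
    for _ in range(repeats):
        start = timeit.default_timer()
        benchmark_fn()
        durations.append(timeit.default_timer() - start)
    return {
        "min": min(durations),
        "max": max(durations),
        "avg": sum(durations) / len(durations),
        "median": statistics.median(durations),
    }
```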
## Results collection

Local test results for the main branch, and the results of the daily performance tests, are stored in a Neon project deployed in the production environment. There is a Grafana dashboard that visualizes the results: [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). Its main problem is that it cannot point at a particular commit, though the data for that is available in the database; it needs some tweaking from someone who knows Grafana tricks.

There is also an inconsistency in test naming. A test name should be the same across platforms, with results differentiated by the platform field. Currently, however, the platform is sometimes included in the test name because of the way parametrization works in pytest. For example, the dashboard has a platform switch with zenith-local-ci and zenith-staging variants, yet some tests under the zenith-local-ci value are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]`, which is highly confusing.
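The naming issue comes from pytest's parametrization: the parameter value becomes part of the test id, roughly as in this simplified sketch (not the actual fixture definition used by the suite):

```python
import pytest


@pytest.fixture(params=["vanilla", "zenith"])
def pg_compare(request):
    # The parameter value ("vanilla" or "zenith") is appended to the test id,
    # e.g. test_bulk_insert[vanilla], so the platform leaks into the test name.
    return request.param


def test_bulk_insert(pg_compare):
    ...
```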